Passed
Push — master ( 90c78c...318ab6 )
by Tomas Norre
07:41
created

CrawlerController::getUrlsForPageId()   C

Complexity

Conditions 16
Paths 96

Size

Total Lines 93
Code Lines 51

Duplication

Lines 0
Ratio 0 %

Code Coverage

Tests 46
CRAP Score 16.0585

Importance

Changes 3
Bugs 0 Features 0
Metric Value
cc 16
eloc 51
c 3
b 0
f 0
nc 96
nop 1
dl 0
loc 93
ccs 46
cts 49
cp 0.9388
crap 16.0585
rs 5.5666

How to fix   Long Method    Complexity   

Long Method

Small methods make your code easier to understand, in particular if combined with a good name. Besides, if your method is small, finding a good name is usually much easier.

For example, if you find yourself adding comments to a method's body, this is usually a good sign to extract the commented part to a new method, and use the comment as a starting point when coming up with a good name for this new method.

Commonly applied refactorings include:

1
<?php
2
3
declare(strict_types=1);
4
5
namespace AOE\Crawler\Controller;
6
7
/***************************************************************
8
 *  Copyright notice
9
 *
10
 *  (c) 2020 AOE GmbH <[email protected]>
11
 *
12
 *  All rights reserved
13
 *
14
 *  This script is part of the TYPO3 project. The TYPO3 project is
15
 *  free software; you can redistribute it and/or modify
16
 *  it under the terms of the GNU General Public License as published by
17
 *  the Free Software Foundation; either version 3 of the License, or
18
 *  (at your option) any later version.
19
 *
20
 *  The GNU General Public License can be found at
21
 *  http://www.gnu.org/copyleft/gpl.html.
22
 *
23
 *  This script is distributed in the hope that it will be useful,
24
 *  but WITHOUT ANY WARRANTY; without even the implied warranty of
25
 *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
26
 *  GNU General Public License for more details.
27
 *
28
 *  This copyright notice MUST APPEAR in all copies of the script!
29
 ***************************************************************/
30
31
use AOE\Crawler\Configuration\ExtensionConfigurationProvider;
32
use AOE\Crawler\Converter\JsonCompatibilityConverter;
33
use AOE\Crawler\Crawler;
34
use AOE\Crawler\CrawlStrategy\CrawlStrategyFactory;
35
use AOE\Crawler\Domain\Model\Process;
36
use AOE\Crawler\Domain\Repository\ConfigurationRepository;
37
use AOE\Crawler\Domain\Repository\ProcessRepository;
38
use AOE\Crawler\Domain\Repository\QueueRepository;
39
use AOE\Crawler\QueueExecutor;
40
use AOE\Crawler\Service\UrlService;
41
use AOE\Crawler\Utility\SignalSlotUtility;
42
use AOE\Crawler\Value\QueueFilter;
43
use Psr\Http\Message\UriInterface;
44
use Psr\Log\LoggerAwareInterface;
45
use Psr\Log\LoggerAwareTrait;
46
use TYPO3\CMS\Backend\Tree\View\PageTreeView;
47
use TYPO3\CMS\Backend\Utility\BackendUtility;
48
use TYPO3\CMS\Core\Authentication\BackendUserAuthentication;
49
use TYPO3\CMS\Core\Compatibility\PublicMethodDeprecationTrait;
50
use TYPO3\CMS\Core\Compatibility\PublicPropertyDeprecationTrait;
51
use TYPO3\CMS\Core\Core\Bootstrap;
52
use TYPO3\CMS\Core\Core\Environment;
53
use TYPO3\CMS\Core\Database\Connection;
54
use TYPO3\CMS\Core\Database\ConnectionPool;
55
use TYPO3\CMS\Core\Database\Query\Restriction\DeletedRestriction;
56
use TYPO3\CMS\Core\Imaging\Icon;
57
use TYPO3\CMS\Core\Imaging\IconFactory;
58
use TYPO3\CMS\Core\Site\Entity\Site;
59
use TYPO3\CMS\Core\Type\Bitmask\Permission;
60
use TYPO3\CMS\Core\TypoScript\Parser\TypoScriptParser;
61
use TYPO3\CMS\Core\Utility\DebugUtility;
62
use TYPO3\CMS\Core\Utility\GeneralUtility;
63
use TYPO3\CMS\Core\Utility\MathUtility;
64
use TYPO3\CMS\Extbase\Object\ObjectManager;
65
use TYPO3\CMS\Frontend\Page\PageRepository;
66
67
/**
68
 * Class CrawlerController
69
 *
70
 * @package AOE\Crawler\Controller
71
 */
72
class CrawlerController implements LoggerAwareInterface
73
{
74
    use LoggerAwareTrait;
75
    use PublicMethodDeprecationTrait;
76
    use PublicPropertyDeprecationTrait;
77
78
    public const CLI_STATUS_NOTHING_PROCCESSED = 0;
79
80
    //queue not empty
81
    public const CLI_STATUS_REMAIN = 1;
82
83
    // (some) queue items were processed
84
    public const CLI_STATUS_PROCESSED = 2;
85
86
    //instance didn't finish
87
    public const CLI_STATUS_ABORTED = 4;
88
89
    public const CLI_STATUS_POLLABLE_PROCESSED = 8;
90
91
    /**
92
     * @var integer
93
     */
94
    public $setID = 0;
95
96
    /**
97
     * @var string
98
     */
99
    public $processID = '';
100
101
    /**
102
     * @var array
103
     */
104
    public $duplicateTrack = [];
105
106
    /**
107
     * @var array
108
     */
109
    public $downloadUrls = [];
110
111
    /**
112
     * @var array
113
     */
114
    public $incomingProcInstructions = [];
115
116
    /**
117
     * @var array
118
     */
119
    public $incomingConfigurationSelection = [];
120
121
    /**
122
     * @var bool
123
     */
124
    public $registerQueueEntriesInternallyOnly = false;
125
126
    /**
127
     * @var array
128
     */
129
    public $queueEntries = [];
130
131
    /**
132
     * @var array
133
     */
134
    public $urlList = [];
135
136
    /**
137
     * @var array
138
     */
139
    public $extensionSettings = [];
140
141
    /**
142
     * Mount Point
143
     *
144
     * @var bool
145
     * Todo: Check what this is used for and adjust the type hint or code, as bool doesn't match the current code.
146
     */
147
    public $MP = false;
148
149
    /**
150
     * @var string
151
     * @deprecated
152
     */
153
    protected $processFilename;
154
155
    /**
156
     * Holds the internal access mode can be 'gui','cli' or 'cli_im'
157
     *
158
     * @var string
159
     * @deprecated
160
     */
161
    protected $accessMode;
162
163
    /**
164
     * @var QueueRepository
165
     */
166
    protected $queueRepository;
167
168
    /**
169
     * @var ProcessRepository
170
     */
171
    protected $processRepository;
172
173
    /**
174
     * @var ConfigurationRepository
175
     */
176
    protected $configurationRepository;
177
178
    /**
179
     * @var string
180
     */
181
    protected $tableName = 'tx_crawler_queue';
182
183
    /**
184
     * @var QueueExecutor
185
     */
186
    protected $queueExecutor;
187
188
    /**
189
     * @var int
190
     */
191
    protected $maximumUrlsToCompile = 10000;
192
193
    /**
194
     * @var IconFactory
195
     */
196
    protected $iconFactory;
197
198
    /**
199
     * @var string[]
200
     */
201
    private $deprecatedPublicMethods = [
0 ignored issues
show
introduced by
The private property $deprecatedPublicMethods is not used, and could be removed.
Loading history...
202
        'cleanUpOldQueueEntries' => 'Using CrawlerController::cleanUpOldQueueEntries() is deprecated since 9.0.1 and will be removed in v11.x, please use QueueRepository->cleanUpOldQueueEntries() instead.',
203
        'CLI_debug' => 'Using CrawlerController->CLI_debug() is deprecated since 9.1.3 and will be removed in v11.x',
204
        'getAccessMode' => 'Using CrawlerController->getAccessMode() is deprecated since 9.1.3 and will be removed in v11.x',
205
        'getLogEntriesForSetId' => 'Using crawlerController::getLogEntriesForSetId() is deprecated since 9.0.1 and will be removed in v11.x',
206
        'flushQueue' => 'Using CrawlerController::flushQueue() is deprecated since 9.0.1 and will be removed in v11.x, please use QueueRepository->flushQueue() instead.',
207
        'setAccessMode' => 'Using CrawlerController->setAccessMode() is deprecated since 9.1.3 and will be removed in v11.x',
208
        'getDisabled' => 'Using CrawlerController->getDisabled() is deprecated since 9.1.3 and will be removed in v11.x, please use Crawler->isDisabled() instead',
209
        'setDisabled' => 'Using CrawlerController->setDisabled() is deprecated since 9.1.3 and will be removed in v11.x, please use Crawler->setDisabled() instead',
210
        'getProcessFilename' => 'Using CrawlerController->getProcessFilename() is deprecated since 9.1.3 and will be removed in v11.x',
211
        'setProcessFilename' => 'Using CrawlerController->setProcessFilename() is deprecated since 9.1.3 and will be removed in v11.x',
212
213
    ];
214
215
    /**
     * Deprecation messages for formerly public properties, keyed by property name.
     *
     * NOTE(review): presumably read by PublicPropertyDeprecationTrait (used by this
     * class), which is why static analysis reports it as unused — confirm before removing.
     *
     * @var string[]
     */
    private $deprecatedPublicProperties = [
        'accessMode' => 'Using CrawlerController->accessMode is deprecated since 9.1.3 and will be removed in v11.x',
        // The message previously named "accessMode" here, which pointed users at the wrong property.
        'processFilename' => 'Using CrawlerController->processFilename is deprecated since 9.1.3 and will be removed in v11.x',
    ];
222
223
    /**
224
     * @var BackendUserAuthentication|null
225
     */
226
    private $backendUser;
227
228
    /**
229
     * @var integer
230
     */
231
    private $scheduledTime = 0;
232
233
    /**
234
     * @var integer
235
     */
236
    private $reqMinute = 0;
237
238
    /**
239
     * @var bool
240
     */
241
    private $submitCrawlUrls = false;
242
243
    /**
244
     * @var bool
245
     */
246
    private $downloadCrawlUrls = false;
247
248
    /**
249
     * @var PageRepository
250
     */
251
    private $pageRepository;
252
253
    /**
254
     * @var Crawler
255
     */
256
    private $crawler;
257
258
    /************************************
259
     *
260
     * Getting URLs based on Page TSconfig
261
     *
262
     ************************************/
263
264 36
    /**
     * Wires up the repositories and services used by the controller and
     * normalises the extension settings.
     *
     * Settings handling:
     *  - `countInARun` is set to 100 when it does not convert to a positive integer,
     *  - `processLimit` is passed through MathUtility::forceIntegerInRange(…, 1, 99, 1),
     *  - `maxCompileUrls` is passed through MathUtility::forceIntegerInRange(…, 1, 1000000000, 10000)
     *    and stored in $maximumUrlsToCompile.
     */
    public function __construct()
    {
        $objectManager = GeneralUtility::makeInstance(ObjectManager::class);
        $crawlStrategyFactory = GeneralUtility::makeInstance(CrawlStrategyFactory::class);
        $this->queueRepository = $objectManager->get(QueueRepository::class);
        $this->processRepository = $objectManager->get(ProcessRepository::class);
        $this->configurationRepository = $objectManager->get(ConfigurationRepository::class);
        $this->pageRepository = $objectManager->get(PageRepository::class);
        $this->queueExecutor = GeneralUtility::makeInstance(QueueExecutor::class, $crawlStrategyFactory);
        $this->iconFactory = GeneralUtility::makeInstance(IconFactory::class);
        $this->crawler = GeneralUtility::makeInstance(Crawler::class);

        $this->processFilename = Environment::getVarPath() . '/lock/tx_crawler.proc';

        /** @var ExtensionConfigurationProvider $configurationProvider */
        $configurationProvider = GeneralUtility::makeInstance(ExtensionConfigurationProvider::class);
        $settings = $configurationProvider->getExtensionConfiguration();
        $this->extensionSettings = is_array($settings) ? $settings : [];

        // The `?? 0` fallbacks keep a missing setting from raising an
        // "undefined index" notice; 0 takes the same default branch as before.
        if (MathUtility::convertToPositiveInteger($this->extensionSettings['countInARun'] ?? 0) === 0) {
            $this->extensionSettings['countInARun'] = 100;
        }

        $this->extensionSettings['processLimit'] = MathUtility::forceIntegerInRange(
            $this->extensionSettings['processLimit'] ?? 0,
            1,
            99,
            1
        );
        $this->maximumUrlsToCompile = MathUtility::forceIntegerInRange(
            $this->extensionSettings['maxCompileUrls'] ?? 0,
            1,
            1000000000,
            10000
        );
    }
291
292
    /**
     * Returns the internal access mode ('gui', 'cli' or 'cli_im' according to
     * the property documentation).
     *
     * @return string
     * @deprecated
     */
    public function getAccessMode()
    {
        $currentAccessMode = $this->accessMode;

        return $currentAccessMode;
    }
302
303
    /**
     * Stores the internal access mode.
     *
     * @param string $accessMode Value to store in the deprecated $accessMode property
     * @deprecated
     */
    public function setAccessMode($accessMode): void
    {
        $this->accessMode = $accessMode;
    }
311
312
    /**
     * Toggles the "disabled" marker file that prevents processes from being processed.
     *
     * Disabling writes an empty file at $processFilename; enabling removes
     * that file if it exists.
     *
     * @param bool $disabled (optional, defaults to true)
     * @deprecated
     */
    public function setDisabled($disabled = true): void
    {
        if ($disabled) {
            GeneralUtility::writeFile($this->processFilename, '');

            return;
        }

        if (is_file($this->processFilename)) {
            unlink($this->processFilename);
        }
    }
328
329
    /**
     * Reports whether the "disabled" marker file exists.
     *
     * @return bool true if the file at $processFilename exists (i.e. disabled)
     * @deprecated
     */
    public function getDisabled()
    {
        $markerFile = $this->processFilename;

        return is_file($markerFile);
    }
339
340
    /**
     * Overrides the path of the "disabled" marker file.
     *
     * @param string $filenameWithPath Path stored in the deprecated $processFilename property
     * @deprecated
     */
    public function setProcessFilename($filenameWithPath): void
    {
        $this->processFilename = $filenameWithPath;
    }
348
349
    /**
     * Returns the path of the "disabled" marker file.
     *
     * @return string
     * @deprecated
     */
    public function getProcessFilename()
    {
        $markerFile = $this->processFilename;

        return $markerFile;
    }
357
358
    /**
     * Replaces the extension settings wholesale (unserialized pendant of
     * $TYPO3_CONF_VARS['EXT']['extConf']['crawler']).
     *
     * The array is stored as given; the defaults applied in the constructor
     * are not re-applied here.
     */
    public function setExtensionSettings(array $extensionSettings): void
    {
        $this->extensionSettings = $extensionSettings;
    }
365
366
    /**
     * Check if the given page should be crawled.
     *
     * Checks run in this order and the first match wins:
     *  1. hidden page (unless the `crawlHiddenPages` setting is truthy),
     *  2. hard-coded doktypes 3, 4, 199, 254, 255,
     *  3. doktype lists registered in EXTCONF/crawler/excludeDoktype,
     *  4. veto hooks registered in EXTCONF/crawler/pageVeto.
     *
     * @param array $pageRow Page record; the `hidden` and `doktype` columns are read
     * @return false|string false if the page should be crawled (not excluded), otherwise the skip message
     */
    public function checkIfPageShouldBeSkipped(array $pageRow)
    {
        // empty() avoids "undefined index" notices for absent settings/columns
        // while keeping the original truthiness semantics.
        if (empty($this->extensionSettings['crawlHiddenPages']) && ! empty($pageRow['hidden'])) {
            return 'Because page is hidden';
        }

        $doktype = (string) ($pageRow['doktype'] ?? '');
        if (GeneralUtility::inList('3,4,199,254,255', $doktype)) {
            return 'Because doktype is not allowed';
        }

        foreach ($GLOBALS['TYPO3_CONF_VARS']['EXTCONF']['crawler']['excludeDoktype'] ?? [] as $key => $doktypeList) {
            if (GeneralUtility::inList((string) $doktypeList, $doktype)) {
                return 'Doktype was excluded by "' . $key . '"';
            }
        }

        // veto hook
        foreach ($GLOBALS['TYPO3_CONF_VARS']['EXTCONF']['crawler']['pageVeto'] ?? [] as $key => $func) {
            $params = [
                'pageRow' => $pageRow,
            ];
            // expects "false" if page is ok and "true" or a skipMessage if this page should _not_ be crawled
            $veto = GeneralUtility::callUserFunction($func, $params, $this);
            if ($veto === false) {
                continue;
            }

            // No need to execute other hooks once one has vetoed.
            // The key is cast because hooks registered with `[] = ...` have integer
            // keys, which htmlspecialchars() rejects under strict_types.
            return is_string($veto)
                ? $veto
                : 'Veto from hook "' . htmlspecialchars((string) $key) . '"';
        }

        return false;
    }
425
426
    /**
     * Wrapper method for getUrlsForPageId()
     * It returns an array of configurations and no urls!
     *
     * @param array $pageRow Page record with at least dok-type and uid columns.
     * @param string $skipMessage Set by reference: the skip reason, or '' when the page is not skipped
     * @return array Configurations for the page, or an empty array when the page is skipped
     * @see getUrlsForPageId()
     */
    public function getUrlsForPageRow(array $pageRow, &$skipMessage = '')
    {
        $message = $this->checkIfPageShouldBeSkipped($pageRow);
        if ($message !== false) {
            $skipMessage = $message;

            return [];
        }

        // getUrlsForPageId() is declared with an int parameter and this file uses
        // strict_types, so a uid delivered as a numeric string would raise a
        // TypeError; cast it like urlListFromUrlArray() does.
        $res = $this->getUrlsForPageId((int) $pageRow['uid']);
        $skipMessage = '';

        return $res;
    }
448
449
    /**
     * Creates a list of URLs from input array (and submits them to queue if asked for)
     * See Web > Info module script + "indexed_search"'s crawler hook-client using this!
     *
     * @param array $vv Information about URLs from pageRow to crawl; the `URLs` and `subCfg` keys are read.
     * @param array $pageRow Page row; only `uid` is read
     * @param int $scheduledTime Unix time to schedule indexing to, typically time()
     * @param int $reqMinute Number of requests per minute (creates the interleave between requests); values <= 0 disable the interleave
     * @param bool $submitCrawlUrls If set, submits the URLs to queue
     * @param bool $downloadCrawlUrls If set (and submitcrawlUrls is false) will fill $downloadUrls with entries)
     * @param array $duplicateTrack Array which is passed by reference and contains an id per url to ensure duplicates are not crawled
     * @param array $downloadUrls Array which will be filled with URLS for download if flag is set.
     * @param array $incomingProcInstructions Array of processing instructions
     * @return string List of URLs (meant for display in backend module), or an error text when no URLs are present
     */
    public function urlListFromUrlArray(
        array $vv,
        array $pageRow,
        $scheduledTime,
        $reqMinute,
        $submitCrawlUrls,
        $downloadCrawlUrls,
        array &$duplicateTrack,
        array &$downloadUrls,
        array $incomingProcInstructions
    ) {
        if (! is_array($vv['URLs'] ?? null)) {
            return 'ERROR - no URL generated';
        }

        $urlLog = [];
        $pageId = (int) $pageRow['uid'];
        $subCfg = $vv['subCfg'] ?? [];
        $procInstrFilter = $subCfg['procInstrFilter'] ?? '';
        $configurationHash = $this->getConfigurationHash($vv);
        $skipInnerCheck = $this->queueRepository->noUnprocessedQueueEntriesForPageWithConfigurationHashExist($pageId, $configurationHash);

        // The filter depends only on the configuration, not on the individual
        // URL, so evaluate it once. When it rejects, no URL is listed at all.
        if (! $this->drawURLs_PIfilter($procInstrFilter, $incomingProcInstructions)) {
            return '';
        }

        // Guard against a zero/negative request rate, which would otherwise
        // divide by zero below.
        $secondsPerRequest = $reqMinute > 0 ? 60 / $reqMinute : 0;

        $urlService = new UrlService();

        foreach ($vv['URLs'] as $urlQuery) {
            $url = (string) $urlService->getUrlFromPageAndQueryParameters(
                $pageId,
                $urlQuery,
                $subCfg['baseUrl'] ?? null,
                $subCfg['force_ssl'] ?? 0
            );

            // Create key by which to determine unique-ness:
            $uKey = $url . '|' . ($subCfg['userGroups'] ?? '') . '|' . $procInstrFilter;

            if (isset($duplicateTrack[$uKey])) {
                // If the url key is already registered, just display it and do not resubmit it
                $urlLog[] = '<em><span class="text-muted">' . htmlspecialchars($url) . '</span></em>';
            } else {
                // Scheduled time, spread by the number of URLs seen so far and rounded down to the full minute
                $schTime = $scheduledTime + round(count($duplicateTrack) * $secondsPerRequest);
                $schTime = intval($schTime / 60) * 60;
                $formattedDate = BackendUtility::datetime($schTime);
                $this->urlList[] = '[' . $formattedDate . '] ' . $url;
                $urlList = '[' . $formattedDate . '] ' . htmlspecialchars($url);

                // Submit for crawling!
                if ($submitCrawlUrls) {
                    // NOTE(review): the un-interleaved $scheduledTime is passed here while
                    // the displayed time is $schTime — confirm this is intended.
                    $added = $this->addUrl(
                        $pageId,
                        $url,
                        $subCfg,
                        $scheduledTime,
                        $configurationHash,
                        $skipInnerCheck
                    );
                    if ($added === false) {
                        $urlList .= ' (URL already existed)';
                    }
                } elseif ($downloadCrawlUrls) {
                    $downloadUrls[$url] = $url;
                }
                $urlLog[] = $urlList;
            }
            $duplicateTrack[$uKey] = true;
        }

        return implode('<br>', $urlLog);
    }
533
534
    /**
     * Tells whether at least one incoming processing instruction is contained
     * in the given comma-separated list. An empty set of incoming instructions
     * accepts everything.
     *
     * @param string $piString PI to test
     * @param array $incomingProcInstructions Processing instructions
     * @return boolean
     */
    public function drawURLs_PIfilter($piString, array $incomingProcInstructions)
    {
        if ($incomingProcInstructions === []) {
            return true;
        }

        $matched = false;
        foreach ($incomingProcInstructions as $instruction) {
            if (GeneralUtility::inList($piString, $instruction)) {
                $matched = true;
                break;
            }
        }

        return $matched;
    }
554
555 5
    /**
     * Returns the page TSconfig for a page, after passing it through the
     * EXTCONF/crawler/getPageTSconfigForId hooks.
     *
     * When $this->MP is set, the TSconfig is read for the id taken from the
     * part of $this->MP after the first '-' instead of $id.
     *
     * @param int $id Page id; also handed to the hooks as `pageId`
     * @return array Page TSconfig, possibly modified by hooks (they receive it by reference)
     */
    public function getPageTSconfigForId($id): array
    {
        if (! $this->MP) {
            $pageTSconfig = BackendUtility::getPagesTSconfig($id);
        } else {
            // $MP is documented as bool but is used as a string; the casts keep
            // explode() and getPagesTSconfig() from receiving the wrong scalar type,
            // and the `?? 0` covers a value without a '-' separator.
            $mountPointParts = explode('-', (string) $this->MP);
            $mountPointId = (int) ($mountPointParts[1] ?? 0);
            $pageTSconfig = BackendUtility::getPagesTSconfig($mountPointId);
        }

        // Call hooks to alter the configuration; `?? []` avoids a notice when none are registered
        $hooks = $GLOBALS['TYPO3_CONF_VARS']['EXTCONF']['crawler']['getPageTSconfigForId'] ?? [];
        if (is_array($hooks)) {
            $params = [
                'pageId' => $id,
                'pageTSConfig' => &$pageTSconfig,
            ];
            foreach ($hooks as $userFunc) {
                GeneralUtility::callUserFunction($userFunc, $params, $this);
            }
        }

        return $pageTSconfig;
    }
577
578
    /**
     * Returns the crawler configurations that apply to a page, keyed by
     * configuration name. Adds no urls to the queue!
     *
     * Sources, in order of precedence:
     *  1. `tx_crawler.crawlerCfg.paramSets` from page TSconfig,
     *  2. tx_crawler_configuration records up the rootline (these never
     *     overwrite an entry of the same name),
     * followed by the EXTCONF/crawler/processUrls hooks, which receive the
     * result by reference.
     *
     * @return array<string, array> Each entry holds `subCfg`, `paramParsed`, `paramExpanded`, `URLs` and `origin`
     */
    public function getUrlsForPageId(int $pageId): array
    {
        $res = $this->getConfigurationsFromPageTsConfig($pageId);
        $res = $this->addConfigurationsFromRecords($pageId, $res);

        foreach ($GLOBALS['TYPO3_CONF_VARS']['EXTCONF']['crawler']['processUrls'] ?? [] as $func) {
            $params = [
                'res' => &$res,
            ];
            GeneralUtility::callUserFunction($func, $params, $this);
        }

        return $res;
    }

    /**
     * Builds the configurations defined in page TSconfig (`paramSets.`) for a page.
     */
    private function getConfigurationsFromPageTsConfig(int $pageId): array
    {
        $pageTSconfig = $this->getPageTSconfigForId($pageId);
        $paramSets = $pageTSconfig['tx_crawler.']['crawlerCfg.']['paramSets.'] ?? [];

        $configurations = [];
        foreach ($paramSets as $key => $values) {
            // Only the "name." sub-arrays are processed; the plain "name" entry
            // carries the parameter string and is read further down.
            if (! is_array($values)) {
                continue;
            }
            $key = str_replace('.', '', (string) $key);

            // Sub configuration for a single configuration string:
            $subCfg = (array) ($paramSets[$key . '.'] ?? []);
            $subCfg['key'] = $key;

            $procInstrFilter = (string) ($subCfg['procInstrFilter'] ?? '');
            if ($procInstrFilter !== '') {
                // normalise whitespace around the commas
                $subCfg['procInstrFilter'] = implode(',', GeneralUtility::trimExplode(',', $procInstrFilter));
            }

            // process configuration if it is not page-specific or if the specific page is the current page:
            if (! $this->isPageInPidList((string) ($subCfg['pidsOnly'] ?? ''), $pageId)) {
                continue;
            }

            $paramParsed = GeneralUtility::explodeUrl2Array((string) ($paramSets[$key] ?? ''));
            $paramExpanded = $this->expandParameters($paramParsed, $pageId);

            // recognize MP value
            $urlBase = '?id=' . $pageId;
            if ($this->MP) {
                $urlBase .= '&MP=' . $this->MP;
            }

            $configurations[$key] = [
                'subCfg' => $subCfg,
                'paramParsed' => $paramParsed,
                'paramExpanded' => $paramExpanded,
                'origin' => 'pagets',
                'URLs' => $this->compileUrls($paramExpanded, [$urlBase]),
            ];
        }

        return $configurations;
    }

    /**
     * Adds configurations from tx_crawler_configuration records found up the
     * rootline to $res, skipping names that are already present.
     */
    private function addConfigurationsFromRecords(int $pageId, array $res): array
    {
        $crawlerConfigurations = $this->configurationRepository->getCrawlerConfigurationRecordsFromRootLine($pageId);
        foreach ($crawlerConfigurations as $configurationRecord) {
            // check access to the configuration record
            $hasAccess = empty($configurationRecord['begroups'])
                || $this->getBackendUser()->isAdmin()
                || $this->hasGroupAccess($this->getBackendUser()->user['usergroup_cached_list'], $configurationRecord['begroups']);
            if (! $hasAccess) {
                continue;
            }

            // process configuration if it is not page-specific or if the specific page is the current page:
            if (! $this->isPageInPidList((string) ($configurationRecord['pidsonly'] ?? ''), $pageId)) {
                continue;
            }

            $key = $configurationRecord['name'];

            // don't overwrite previously defined paramSets
            if (isset($res[$key])) {
                continue;
            }

            /* @var $TSparserObject \TYPO3\CMS\Core\TypoScript\Parser\TypoScriptParser */
            $TSparserObject = GeneralUtility::makeInstance(TypoScriptParser::class);
            $TSparserObject->parse($configurationRecord['processing_instruction_parameters_ts']);

            $subCfg = [
                'procInstrFilter' => $configurationRecord['processing_instruction_filter'],
                'procInstrParams.' => $TSparserObject->setup,
                'baseUrl' => $configurationRecord['base_url'],
                'force_ssl' => (int) $configurationRecord['force_ssl'],
                'userGroups' => $configurationRecord['fegroups'],
                'exclude' => $configurationRecord['exclude'],
                'key' => $key,
            ];

            if (in_array($pageId, $this->expandExcludeString($subCfg['exclude']), true)) {
                continue;
            }

            $paramParsed = GeneralUtility::explodeUrl2Array($configurationRecord['configuration']);
            $paramExpanded = $this->expandParameters($paramParsed, $pageId);

            $res[$key] = [
                'subCfg' => $subCfg,
                'paramParsed' => $paramParsed,
                'paramExpanded' => $paramExpanded,
                'URLs' => $this->compileUrls($paramExpanded, ['?id=' . $pageId]),
                'origin' => 'tx_crawler_configuration_' . $configurationRecord['uid'],
            ];
        }

        return $res;
    }

    /**
     * Returns true when $pidList is empty (not page-specific) or contains $pageId
     * as one of its comma-separated, trimmed entries.
     */
    private function isPageInPidList(string $pidList, int $pageId): bool
    {
        if ($pidList === '') {
            return true;
        }

        return in_array((string) $pageId, GeneralUtility::trimExplode(',', $pidList, true), true);
    }
676
677
    /**
     * Collects the names of the crawler configurations found for a page branch.
     *
     * Two sources are combined: the "paramSets." entries of the page TSconfig of the root page,
     * and the tx_crawler_configuration records stored on pages of the root line of $rootid or of
     * the page tree below it. The tree is read with the backend user's "show page" permission clause.
     *
     * TODO: Write Functional Tests
     *
     * @param int $rootid Uid of the page the branch starts at
     * @param int $depth Depth passed to the page tree when collecting sub pages
     * @return array List of configuration names; a name may occur more than once
     */
    public function getConfigurationsForBranch(int $rootid, int $depth): array
    {
        $names = [];

        // Parameter sets from page TSconfig: only array values are sets, their trailing dot is stripped.
        $tsConfig = $this->getPageTSconfigForId($rootid);
        foreach ($tsConfig['tx_crawler.']['crawlerCfg.']['paramSets.'] ?? [] as $setKey => $setValue) {
            if (is_array($setValue)) {
                $names[] = substr($setKey, -1) === '.' ? substr($setKey, 0, -1) : $setKey;
            }
        }

        // Page uids of the root line ...
        $pageIds = [];
        foreach (BackendUtility::BEgetRootLine($rootid) as $rootLinePage) {
            $pageIds[] = $rootLinePage['uid'];
        }

        // ... and of the pages below the root page.
        /* @var PageTreeView $pageTree */
        $pageTree = GeneralUtility::makeInstance(PageTreeView::class);
        $permsClause = $this->getBackendUser()->getPagePermsClause(Permission::PAGE_SHOW);
        $pageTree->init(empty($permsClause) ? '' : ('AND ' . $permsClause));
        $pageTree->getTree($rootid, $depth, '');
        foreach ($pageTree->tree as $treeNode) {
            $pageIds[] = $treeNode['row']['uid'];
        }

        // Names of the configuration records stored on any of the collected pages.
        $queryBuilder = $this->getQueryBuilder('tx_crawler_configuration');
        $result = $queryBuilder
            ->select('name')
            ->from('tx_crawler_configuration')
            ->where(
                $queryBuilder->expr()->in('pid', $queryBuilder->createNamedParameter($pageIds, Connection::PARAM_INT_ARRAY))
            )
            ->execute();

        while ($record = $result->fetch()) {
            $names[] = $record['name'];
        }

        return $names;
    }
720
721
    /**
     * Tells whether one of the groups of a user grants access to an item
     * (e.g. get the group list of the current logged in user from $GLOBALS['TSFE']->gr_list)
     *
     * @param string $groupList Comma-separated list of (fe_)group UIDs from a user
     * @param string $accessList Comma-separated list of (fe_)group UIDs of the item to access
     * @return bool TRUE if the access list is empty or contains at least one of the user's group UIDs
     * @see \TYPO3\CMS\Frontend\Page\PageRepository::getMultipleGroupsWhereClause()
     */
    public function hasGroupAccess($groupList, $accessList)
    {
        // Nothing to check against: the item is not restricted.
        if (empty($accessList)) {
            return true;
        }

        // Stop at the first group of the user that is part of the access list.
        $hasAccess = false;
        foreach (GeneralUtility::intExplode(',', $groupList) as $userGroupUid) {
            $hasAccess = GeneralUtility::inList($accessList, $userGroupUid);
            if ($hasAccess) {
                break;
            }
        }

        return $hasAccess;
    }
742
743
    /**
     * Expands the parameter configuration into the individual values of each parameter.
     *
     * Syntax of values:
     * - If the value is wrapped in [...] it is expanded according to the following syntax, otherwise the value is taken literally
     * - The configuration is split by "|"; the parts are processed individually and their values are added together
     * - For each configuration part:
     *         - "[int]-[int]" = Integer range, expanded to all values in between, boundaries included, from low to high
     *           (at most 1000 values). Example "1-34" or "-40--30"
     *         - "_TABLE:[TCA table name];[_PID:[optional page id, default is current page]];[_ENABLELANG:1]" = Look up of
     *           table records from PID, filtering out deleted records. Example "_TABLE:tt_content; _PID:123"
     *           Further optional sub parameters: _FIELD (field to select, default "uid"; must be "uid" or a TCA column),
     *           _PIDFIELD (field compared with the page ids, default "pid"), _RECURSIVE (depth of sub pages to include),
     *           _WHERE (additional where clause), _ADDTABLE (additional "from" part).
     *           _ENABLELANG:1 restricts the result to records whose transOrigPointerField is <= 0
     *         - Default: Literal value
     * - After each part the "expandParameters" hooks registered in SC_OPTIONS are called
     *
     * @param array $paramArray Array with key (GET var name) and values (value of GET var which is configuration for expansion)
     * @param int $pid Current page ID
     * @return array The input array with every value replaced by the list of its expanded values
     *
     * TODO: Write Functional Tests
     */
    public function expandParameters($paramArray, $pid)
    {
        // Traverse parameter names:
        foreach ($paramArray as $p => $v) {
            $v = trim($v);

            // Without enclosing square brackets the value is literal and becomes the only value.
            if (strpos($v, '[') !== 0 || substr($v, -1) !== ']') {
                $paramArray[$p] = [$v];
                continue;
            }

            // Strip the brackets and reset the paramArray value as an array.
            $v = substr($v, 1, -1);
            $paramArray[$p] = [];

            // Explode parts and traverse them:
            foreach (explode('|', $v) as $pV) {
                $trimmedPart = trim($pV);

                // Look for integer range: (fx. 1-34 or -40--30 // reads minus 40 to minus 30)
                if (preg_match('/^(-?[0-9]+)\s*-\s*(-?[0-9]+)$/', $trimmedPart, $reg)) {
                    $reg = $this->swapIfFirstIsLargerThanSecond($reg);

                    // Traverse range, add values; the brake limits the size of the range.
                    $runAwayBrake = 1000;
                    for ($a = $reg[1]; $a <= $reg[2]; $a++) {
                        $paramArray[$p][] = $a;
                        $runAwayBrake--;
                        if ($runAwayBrake <= 0) {
                            break;
                        }
                    }
                } elseif (strpos($trimmedPart, '_TABLE:') === 0) {
                    // Parse the ";"-separated "KEY:value" sub parameters.
                    $subpartParams = [];
                    foreach (GeneralUtility::trimExplode(';', $pV) as $spV) {
                        // array_pad() yields NULL as value for a sub parameter without ":" instead of reading an undefined offset.
                        [$pKey, $pVal] = array_pad(GeneralUtility::trimExplode(':', $spV), 2, null);
                        $subpartParams[$pKey] = $pVal;
                    }

                    // Table exists:
                    $table = $subpartParams['_TABLE'] ?? '';
                    if (isset($GLOBALS['TCA'][$table])) {
                        $lookUpPid = isset($subpartParams['_PID']) ? intval($subpartParams['_PID']) : intval($pid);
                        $recursiveDepth = isset($subpartParams['_RECURSIVE']) ? intval($subpartParams['_RECURSIVE']) : 0;
                        $pidField = isset($subpartParams['_PIDFIELD']) ? trim($subpartParams['_PIDFIELD']) : 'pid';
                        $where = $subpartParams['_WHERE'] ?? '';
                        $addTable = $subpartParams['_ADDTABLE'] ?? '';

                        // _FIELD is optional; an absent or empty value falls back to "uid".
                        $fieldName = ! empty($subpartParams['_FIELD']) ? $subpartParams['_FIELD'] : 'uid';

                        // Only "uid" or a field configured in TCA is selected.
                        if ($fieldName === 'uid' || ! empty($GLOBALS['TCA'][$table]['columns'][$fieldName])) {
                            $queryBuilder = $this->getQueryBuilder($table);

                            if ($recursiveDepth > 0) {
                                /** @var \TYPO3\CMS\Core\Database\QueryGenerator $queryGenerator */
                                $queryGenerator = GeneralUtility::makeInstance(\TYPO3\CMS\Core\Database\QueryGenerator::class);
                                $pidList = $queryGenerator->getTreeList($lookUpPid, $recursiveDepth, 0, 1);
                                $pidArray = GeneralUtility::intExplode(',', $pidList);
                            } else {
                                $pidArray = [(string) $lookUpPid];
                            }

                            // Only the "deleted" restriction is applied.
                            $queryBuilder->getRestrictions()
                                ->removeAll()
                                ->add(GeneralUtility::makeInstance(DeletedRestriction::class));

                            // NOTE(review): $where is taken from the configuration string and handed to the
                            // query builder as is - confirm that only trusted users can edit this configuration.
                            $queryBuilder
                                ->select($fieldName)
                                ->from($table)
                                ->where(
                                    $queryBuilder->expr()->in($pidField, $queryBuilder->createNamedParameter($pidArray, Connection::PARAM_INT_ARRAY)),
                                    $where
                                );

                            if (! empty($addTable)) {
                                // TODO: Check if this works as intended!
                                $queryBuilder->add('from', $addTable);
                            }

                            // Not every table defines a translation pointer field, _ENABLELANG is optional as well.
                            $transOrigPointerField = $GLOBALS['TCA'][$table]['ctrl']['transOrigPointerField'] ?? null;
                            if (! empty($subpartParams['_ENABLELANG']) && $transOrigPointerField) {
                                $queryBuilder->andWhere(
                                    $queryBuilder->expr()->lte(
                                        $transOrigPointerField,
                                        0
                                    )
                                );
                            }

                            $statement = $queryBuilder->execute();

                            // Index by field value so every value is added once per part.
                            $rows = [];
                            while ($row = $statement->fetch()) {
                                $rows[$row[$fieldName]] = $row;
                            }

                            $paramArray[$p] = array_merge($paramArray[$p], array_keys($rows));
                        }
                    }
                } else {
                    // Just add value:
                    $paramArray[$p][] = $pV;
                }

                // Hook for processing own expandParameters place holder; the registration is optional.
                $expandParameterHooks = $GLOBALS['TYPO3_CONF_VARS']['SC_OPTIONS']['crawler/class.tx_crawler_lib.php']['expandParameters'] ?? null;
                if (is_array($expandParameterHooks)) {
                    $_params = [
                        'pObj' => &$this,
                        'paramArray' => &$paramArray,
                        'currentKey' => $p,
                        'currentValue' => $pV,
                        'pid' => $pid,
                    ];
                    foreach ($expandParameterHooks as $_funcRef) {
                        GeneralUtility::callUserFunction($_funcRef, $_params, $this);
                    }
                }
            }

            // Make unique set of values and keep the parameters sorted by their name:
            $paramArray[$p] = array_unique($paramArray[$p]);
            ksort($paramArray);
        }

        return $paramArray;
    }
890
891
    /**
     * Compiling URLs from parameter array (output of expandParameters())
     * The number of URLs will be the multiplication of the number of parameter values for each key,
     * unless the list grows beyond $this->maximumUrlsToCompile, in which case compiling stops for
     * the current parameter.
     *
     * @param array $paramArray Output of expandParameters(): Array with keys (GET var names) and for each an array of values
     * @param array $urls URLs accumulated in this array (for recursion)
     * @return array
     */
    public function compileUrls($paramArray, array $urls)
    {
        if (empty($paramArray)) {
            return $urls;
        }

        // Shift first parameter off the stack; the key is cast because numeric GET var names
        // arrive as integers, which rawurlencode() rejects under strict types.
        reset($paramArray);
        $varName = (string) key($paramArray);
        $valueSet = array_shift($paramArray);

        // Combine every URL compiled so far with every value of the current parameter.
        $newUrls = [];
        foreach ($urls as $url) {
            foreach ($valueSet as $val) {
                // An empty value adds nothing to the URL.
                $newUrls[] = $url . ((string) $val !== '' ? '&' . rawurlencode($varName) . '=' . rawurlencode((string) $val) : '');

                // Leave BOTH loops once the limit is exceeded; leaving only the inner loop would
                // still add one more URL for every remaining base URL.
                // NOTE(review): the check runs after appending, so the list can hold one URL more
                // than the limit - confirm whether the limit is meant to be inclusive.
                if (count($newUrls) > $this->maximumUrlsToCompile) {
                    break 2;
                }
            }
        }

        // Continue with the remaining parameters.
        return $this->compileUrls($paramArray, $newUrls);
    }
922
923
    /************************************
924
     *
925
     * Crawler log
926
     *
927
     ************************************/
928
929
    /**
     * Return array of records from crawler queue for input page ID, ordered by "scheduled" descending
     *
     * @param int $id Page ID for which to look up log entries.
     * @param QueueFilter $queueFilter Filter: "pending" selects entries with exec_time = 0, "finished" entries with exec_time > 0, any other value selects all entries
     * @param bool $doFlush If TRUE, the queue repository flushes the queue with $queueFilter before the entries are read
     * @param bool $doFullFlush Not evaluated: the queue repository is called the same way for both values
     * @param int $itemsPerPage Limit the amount of entries per page, default is 10; a value of 0 or less disables the limit
     * @return array
     */
    public function getLogEntriesForPageId($id, QueueFilter $queueFilter, $doFlush = false, $doFullFlush = false, $itemsPerPage = 10)
    {
        $queryBuilder = GeneralUtility::makeInstance(ConnectionPool::class)->getQueryBuilderForTable($this->tableName);
        $queryBuilder
            ->select('*')
            ->from($this->tableName)
            ->where(
                $queryBuilder->expr()->eq('page_id', $queryBuilder->createNamedParameter($id, \PDO::PARAM_INT))
            )
            ->orderBy('scheduled', 'DESC');

        // switch compares loosely, which lets the QueueFilter object be matched against the plain
        // filter names - presumably through its string conversion.
        switch ($queueFilter) {
            case 'pending':
                $queryBuilder->andWhere($queryBuilder->expr()->eq('exec_time', 0));
                break;
            case 'finished':
                $queryBuilder->andWhere($queryBuilder->expr()->gt('exec_time', 0));
                break;
        }

        if ($doFlush) {
            // Full and partial flush are handled identically by the repository call.
            $this->queueRepository->flushQueue($queueFilter);
        }

        if ($itemsPerPage > 0) {
            $queryBuilder->setMaxResults((int) $itemsPerPage);
        }

        return $queryBuilder->execute()->fetchAll();
    }
979
980
    /**
     * Return array of records from crawler queue for input set ID, ordered by "scheduled" descending
     *
     * @param int $set_id Set ID for which to look up log entries.
     * @param string $filter Filter: "all" => all entries, "pending" => all that is not yet run, "finished" => all complete ones
     * @param bool $doFlush If TRUE, the matching entries are flushed instead of selected and an empty array is returned
     * @param bool $doFullFlush If TRUE (and $doFlush is set), the queue is flushed without any condition
     * @param int $itemsPerPage Limit the amount of entries per page default is 10
     * @return array
     *
     * @deprecated
     */
    public function getLogEntriesForSetId(int $set_id, string $filter = '', bool $doFlush = false, bool $doFullFlush = false, int $itemsPerPage = 10)
    {
        $queryBuilder = GeneralUtility::makeInstance(ConnectionPool::class)->getQueryBuilderForTable($this->tableName);
        $queryBuilder
            ->select('*')
            ->from($this->tableName)
            ->where(
                $queryBuilder->expr()->eq('set_id', $queryBuilder->createNamedParameter($set_id, \PDO::PARAM_INT))
            )
            ->orderBy('scheduled', 'DESC');

        $expressionBuilder = GeneralUtility::makeInstance(ConnectionPool::class)
            ->getConnectionForTable($this->tableName)
            ->getExpressionBuilder();

        // AND-combined expression that collects the conditions handed to flushQueue().
        $query = $expressionBuilder->andX();

        // The filter restricts both the select query and the flush condition.
        if ($filter === 'pending') {
            $queryBuilder->andWhere($queryBuilder->expr()->eq('exec_time', 0));
            $query->add($expressionBuilder->eq('exec_time', 0));
        } elseif ($filter === 'finished') {
            $queryBuilder->andWhere($queryBuilder->expr()->gt('exec_time', 0));
            $query->add($expressionBuilder->gt('exec_time', 0));
        }

        if ($doFlush) {
            $flushCondition = $query->add($expressionBuilder->eq('set_id', (int) $set_id));
            $this->flushQueue($doFullFlush ? '' : $flushCondition);

            return [];
        }

        if ($itemsPerPage > 0) {
            $queryBuilder->setMaxResults((int) $itemsPerPage);
        }

        return $queryBuilder->execute()->fetchAll();
    }
1032
1033
    /**
     * Adds a call back entry to the crawler queue (called from hooks typically, see indexed search class "class.crawler.php")
     *
     * @param int $setId Set ID
     * @param array $params Parameters to pass to call back function; anything but an array is replaced by an empty array
     * @param string $callBack Call back object reference, eg. 'EXT:indexed_search/class.crawler.php:&tx_indexedsearch_crawler'
     * @param int $page_id Page ID to attach it to
     * @param int $schedule Time at which to activate; 0 schedules the entry for the current time
     */
    public function addQueueEntry_callBack($setId, $params, $callBack, $page_id = 0, $schedule = 0): void
    {
        // The call back reference travels inside the parameters of the queue entry.
        $parameters = is_array($params) ? $params : [];
        $parameters['_CALLBACKOBJ'] = $callBack;

        $scheduledAt = (int) $schedule;
        if (! $scheduledAt) {
            $scheduledAt = $this->getCurrentTime();
        }

        $queueEntry = [
            'page_id' => (int) $page_id,
            'parameters' => json_encode($parameters),
            'scheduled' => $scheduledAt,
            'exec_time' => 0,
            'set_id' => (int) $setId,
            'result_data' => '',
        ];

        $connection = GeneralUtility::makeInstance(ConnectionPool::class)->getConnectionForTable('tx_crawler_queue');
        $connection->insert('tx_crawler_queue', $queueEntry);
    }
1062
1063
    /************************************
1064
     *
1065
     * URL setting
1066
     *
1067
     ************************************/
1068
1069
    /**
     * Setting a URL for crawling:
     *
     * Builds the queue entry for the URL and either registers it internally only
     * (when $this->registerQueueEntriesInternallyOnly is set) or inserts it into tx_crawler_queue,
     * unless the duplication check reports an equal entry. A signal is emitted for inserted
     * as well as for duplicate URLs.
     *
     * @param int $id Page ID
     * @param string $url Complete URL
     * @param array $subCfg Sub configuration array (from TS config); the keys "userGroups", "procInstrFilter", "procInstrParams." and "key" are read and may be absent
     * @param int $tstamp Scheduled-time
     * @param string $configurationHash (optional) configuration hash
     * @param bool $skipInnerDuplicationCheck (optional) skip inner duplication check
     * @return bool TRUE if the URL was inserted into the queue table
     */
    public function addUrl(
        $id,
        $url,
        array $subCfg,
        $tstamp,
        $configurationHash = '',
        $skipInnerDuplicationCheck = false
    ) {
        // Creating parameters:
        $parameters = [
            'url' => $url,
        ];

        // fe user group simulation:
        $uGs = implode(',', array_unique(GeneralUtility::intExplode(',', $subCfg['userGroups'] ?? '', true)));
        if ($uGs) {
            $parameters['feUserGroupList'] = $uGs;
        }

        // Setting processing instructions
        $parameters['procInstructions'] = GeneralUtility::trimExplode(',', $subCfg['procInstrFilter'] ?? '');
        if (is_array($subCfg['procInstrParams.'] ?? null)) {
            $parameters['procInstrParams'] = $subCfg['procInstrParams.'];
        }

        // Compile value array:
        $parameters_serialized = json_encode($parameters);
        $fieldArray = [
            'page_id' => (int) $id,
            'parameters' => $parameters_serialized,
            'parameters_hash' => GeneralUtility::shortMD5($parameters_serialized),
            'configuration_hash' => $configurationHash,
            'scheduled' => $tstamp,
            'exec_time' => 0,
            'set_id' => (int) $this->setID,
            'result_data' => '',
            'configuration' => $subCfg['key'] ?? null,
        ];

        if ($this->registerQueueEntriesInternallyOnly) {
            // The entry is only registered and not stored to the database, so no URL was added.
            $this->queueEntries[] = $fieldArray;

            return false;
        }

        $rows = [];
        if (! $skipInnerDuplicationCheck) {
            // check if there is already an equal entry
            $rows = $this->getDuplicateRowsIfExist($tstamp, $fieldArray);
        }

        if (! empty($rows)) {
            $signalPayload = ['rows' => $rows, 'fieldArray' => $fieldArray];
            SignalSlotUtility::emitSignal(
                self::class,
                SignalSlotUtility::SIGNAL_DUPLICATE_URL_IN_QUEUE,
                $signalPayload
            );

            return false;
        }

        $connectionForCrawlerQueue = GeneralUtility::makeInstance(ConnectionPool::class)->getConnectionForTable('tx_crawler_queue');
        $connectionForCrawlerQueue->insert(
            'tx_crawler_queue',
            $fieldArray
        );
        $uid = $connectionForCrawlerQueue->lastInsertId('tx_crawler_queue', 'qid');

        $signalPayload = ['uid' => $uid, 'fieldArray' => $fieldArray];
        SignalSlotUtility::emitSignal(
            self::class,
            SignalSlotUtility::SIGNAL_URL_ADDED_TO_QUEUE,
            $signalPayload
        );

        return true;
    }
1159
1160
    /**
     * Provides the current system time as returned by time().
     *
     * @return int Unix timestamp
     */
    public function getCurrentTime()
    {
        $now = time();

        return $now;
    }
1169
1170
    /************************************
1171
     *
1172
     * URL reading
1173
     *
1174
     ************************************/
1175
1176
    /**
     * Read URL for single queue entry
     *
     * Loads the queue record, sets its exec_time, lets the queue executor process it and stores the
     * JSON encoded result in "result_data". Signals are emitted before the record is processed and
     * before the result is stored.
     *
     * @param int $queueId
     * @param bool $force If set, will process even if exec_time has been set!
     * @return int|null CLI_STATUS_POLLABLE_PROCESSED if a registered "pollSuccess" instruction that is part of the
     *                  processing instructions reported success, otherwise 0; NULL if no queue record was found
     */
    public function readUrl($queueId, $force = false)
    {
        $queryBuilder = GeneralUtility::makeInstance(ConnectionPool::class)->getQueryBuilderForTable($this->tableName);
        $ret = 0;
        $this->logger->debug('crawler-readurl start ' . microtime(true));

        // Get entry:
        $queryBuilder
            ->select('*')
            ->from('tx_crawler_queue')
            ->where(
                $queryBuilder->expr()->eq('qid', $queryBuilder->createNamedParameter($queueId, \PDO::PARAM_INT))
            );
        if (! $force) {
            // Without $force only entries that are scheduled for a process and not yet executed are read.
            $queryBuilder
                ->andWhere('exec_time = 0')
                ->andWhere('process_scheduled > 0');
        }
        $queueRec = $queryBuilder->execute()->fetch();

        if (! is_array($queueRec)) {
            return;
        }

        SignalSlotUtility::emitSignal(
            self::class,
            SignalSlotUtility::SIGNAL_QUEUEITEM_PREPROCESS,
            [$queueId, &$queueRec]
        );

        // Set exec_time to lock record:
        $field_array = ['exec_time' => $this->getCurrentTime()];

        if (isset($this->processID)) {
            // if multiprocessing is used we need to store the id of the process which has handled this entry
            $field_array['process_id_completed'] = $this->processID;
        }

        GeneralUtility::makeInstance(ConnectionPool::class)->getConnectionForTable('tx_crawler_queue')
            ->update(
                'tx_crawler_queue',
                $field_array,
                ['qid' => (int) $queueId]
            );

        $result = $this->queueExecutor->executeQueueItem($queueRec, $this);
        if (($result['content'] ?? null) === null) {
            $resultData = 'An errors happened';
        } else {
            /** @var JsonCompatibilityConverter $jsonCompatibilityConverter */
            $jsonCompatibilityConverter = GeneralUtility::makeInstance(JsonCompatibilityConverter::class);
            $resultData = $jsonCompatibilityConverter->convert($result['content']);
        }

        // atm there's no need to point to specific pollable extensions
        // The registration is optional, and $resultData is only inspected if it really is an array:
        // on the error path above it is a plain string that must not be accessed by key.
        $pollables = $GLOBALS['TYPO3_CONF_VARS']['EXTCONF']['crawler']['pollSuccess'] ?? null;
        if (is_array($pollables) && is_array($resultData)) {
            $procInstructions = $resultData['parameters']['procInstructions'] ?? null;
            foreach ($pollables as $pollable) {
                // only check the success value if the instruction is running
                // it is important to name the pollSuccess key same as the procInstructions key
                if (is_array($procInstructions)
                    && in_array($pollable, $procInstructions, true)
                    && ! empty($resultData['success'][$pollable])
                ) {
                    $ret |= self::CLI_STATUS_POLLABLE_PROCESSED;
                }
            }
        }

        // Set result in log which also denotes the end of the processing of this entry.
        $field_array = ['result_data' => json_encode($result)];

        SignalSlotUtility::emitSignal(
            self::class,
            SignalSlotUtility::SIGNAL_QUEUEITEM_POSTPROCESS,
            [$queueId, &$field_array]
        );

        GeneralUtility::makeInstance(ConnectionPool::class)->getConnectionForTable('tx_crawler_queue')
            ->update(
                'tx_crawler_queue',
                $field_array,
                ['qid' => (int) $queueId]
            );

        $this->logger->debug('crawler-readurl stop ' . microtime(true));
        return $ret;
    }
1273
1274
    /**
     * Inserts a not-yet-inserted log entry into the queue table, executes it right away
     * and stores the JSON encoded result in its "result_data" field.
     *
     * @param array $field_array Queue field array,
     *
     * @return array|bool|mixed|string Result as returned by the queue executor
     */
    public function readUrlFromArray($field_array)
    {
        $connection = GeneralUtility::makeInstance(ConnectionPool::class)->getConnectionForTable($this->tableName);

        // Set exec_time to lock record:
        $field_array['exec_time'] = $this->getCurrentTime();
        $connection->insert($this->tableName, $field_array);
        $queueId = $connection->lastInsertId($this->tableName, 'qid');

        $result = $this->queueExecutor->executeQueueItem($field_array, $this);

        // Set result in log which also denotes the end of the processing of this entry.
        // The fields are passed by reference to the signal before they are written.
        $resultFields = ['result_data' => json_encode($result)];
        SignalSlotUtility::emitSignal(
            self::class,
            SignalSlotUtility::SIGNAL_QUEUEITEM_POSTPROCESS,
            [$queueId, &$resultFields]
        );

        $connection->update($this->tableName, $resultFields, ['qid' => $queueId]);

        return $result;
    }
1310
1311
    /*****************************
1312
     *
1313
     * Compiling URLs to crawl - tools
1314
     *
1315
     *****************************/
1316
1317
    /**
     * Walks the page tree below a root page and compiles the HTML table rows listing the crawl URLs per page.
     *
     * The given options are stored on the controller first, because drawURLs_addRowsForPage() reads them
     * from there. The duplicate tracker and the download URL list are reset on every call.
     *
     * @param int $id Root page id to start from.
     * @param int $depth Depth of tree, 0=only id-page, 1= on sublevel, 99 = infinite
     * @param int $scheduledTime Unix Time when the URL is timed to be visited when put in queue
     * @param int $reqMinute Number of requests per minute (creates the interleave between requests)
     * @param bool $submitCrawlUrls If set, submits the URLs to queue in database (real crawling)
     * @param bool $downloadCrawlUrls If set (and submitcrawlUrls is false) will fill $downloadUrls with entries)
     * @param array $incomingProcInstructions Array of processing instructions
     * @param array $configurationSelection Array of configuration keys
     * @return string Concatenated HTML rows for all visited pages
     */
    public function getPageTreeAndUrls(
        $id,
        $depth,
        $scheduledTime,
        $reqMinute,
        $submitCrawlUrls,
        $downloadCrawlUrls,
        array $incomingProcInstructions,
        array $configurationSelection
    ) {
        $this->scheduledTime = $scheduledTime;
        $this->reqMinute = $reqMinute;
        $this->submitCrawlUrls = $submitCrawlUrls;
        $this->downloadCrawlUrls = $downloadCrawlUrls;
        $this->incomingProcInstructions = $incomingProcInstructions;
        $this->incomingConfigurationSelection = $configurationSelection;

        $this->duplicateTrack = [];
        $this->downloadUrls = [];

        // Drawing tree:
        /* @var PageTreeView $tree */
        $tree = GeneralUtility::makeInstance(PageTreeView::class);
        $perms_clause = $this->getBackendUser()->getPagePermsClause(Permission::PAGE_SHOW);
        $tree->init('AND ' . $perms_clause);

        $pageInfo = BackendUtility::readPageAccess($id, $perms_clause);
        if (is_array($pageInfo)) {
            // Set root row:
            $tree->tree[] = [
                'row' => $pageInfo,
                'HTML' => $this->iconFactory->getIconForRecord('pages', $pageInfo, Icon::SIZE_SMALL),
            ];
        }

        // Get branch beneath:
        if ($depth) {
            $tree->getTree($id, $depth, '');
        }

        // Traverse page tree:
        $code = '';

        foreach ($tree->tree as $data) {
            $this->MP = false;

            // Recognize mount points. The doktype is cast because database drivers may deliver
            // integer columns as numeric strings, which would never be identical to the int constant.
            if ((int) $data['row']['doktype'] === PageRepository::DOKTYPE_MOUNTPOINT) {
                // NOTE(review): the result is read as a list ($mountpage[0]) - confirm this matches
                // the return shape of the repository's getPage().
                $mountpage = $this->pageRepository->getPage($data['row']['uid']);

                // fetch mounted pages
                $this->MP = $mountpage[0]['mount_pid'] . '-' . $data['row']['uid'];

                $mountTree = GeneralUtility::makeInstance(PageTreeView::class);
                $mountTree->init('AND ' . $perms_clause);
                $mountTree->getTree($mountpage[0]['mount_pid'], $depth);

                foreach ($mountTree->tree as $mountData) {
                    $code .= $this->drawURLs_addRowsForPage(
                        $mountData['row'],
                        $mountData['HTML'] . BackendUtility::getRecordTitle('pages', $mountData['row'], true)
                    );
                }

                // replace page when mount_pid_ol is enabled
                if ($mountpage[0]['mount_pid_ol']) {
                    $data['row']['uid'] = $mountpage[0]['mount_pid'];
                } else {
                    // if the mount_pid_ol is not set the MP must not be used for the mountpoint page
                    $this->MP = false;
                }
            }

            $code .= $this->drawURLs_addRowsForPage(
                $data['row'],
                $data['HTML'] . BackendUtility::getRecordTitle('pages', $data['row'], true)
            );
        }

        return $code;
    }
1409
1410
    /**
     * Expands an exclude string into the list of page ids it covers.
     *
     * The string is a comma separated list of parts. Each part is either "pid" (that page only),
     * "pid+" (page plus its subtree, depth 99) or "pid+depth" (page plus subtree down to the given depth).
     * An empty or "0" depth after the "+" is treated like "pid+". Results are cached per exclude string,
     * and fetched subtrees per pid/depth, in static variables for the duration of the request.
     *
     * @param string $excludeString Exclude string
     * @return int[] Unique page ids (keys are not re-indexed)
     */
    public function expandExcludeString($excludeString)
    {
        // internal static caches;
        static $expandedExcludeStringCache;
        static $treeCache;

        if (empty($expandedExcludeStringCache[$excludeString])) {
            $pidList = [];

            if (! empty($excludeString)) {
                /** @var PageTreeView $tree */
                $tree = GeneralUtility::makeInstance(PageTreeView::class);
                $tree->init('AND ' . $this->getBackendUser()->getPagePermsClause(Permission::PAGE_SHOW));

                foreach (GeneralUtility::trimExplode(',', $excludeString) as $excludePart) {
                    // Pad to two entries so a part without "+" does not read an undefined offset.
                    [$pidPart, $depthPart] = array_pad(GeneralUtility::trimExplode('+', $excludePart), 2, '');

                    // The tree API expects integers, the exploded parts are strings.
                    $pid = (int) $pidPart;

                    if (empty($depthPart)) {
                        // default is "page only" = "depth=0"; a bare "+" means the whole subtree
                        $depth = strpos($excludePart, '+') !== false ? 99 : 0;
                    } else {
                        $depth = (int) $depthPart;
                    }

                    $pidList[] = $pid;

                    if ($depth > 0) {
                        // isset() instead of empty(): an empty subtree is a valid cached result as well.
                        if (! isset($treeCache[$pid][$depth])) {
                            $tree->reset();
                            $tree->getTree($pid, $depth);
                            $treeCache[$pid][$depth] = $tree->tree;
                        }

                        foreach ($treeCache[$pid][$depth] as $data) {
                            $pidList[] = (int) $data['row']['uid'];
                        }
                    }
                }
            }

            $expandedExcludeStringCache[$excludeString] = array_unique($pidList);
        }

        return $expandedExcludeStringCache[$excludeString];
    }
1461
1462
    /**
     * Create the rows for display of the page tree.
     * For each page a number of rows are shown displaying GET variable configuration.
     *
     * The configurations of the page are reduced to $this->incomingConfigurationSelection (when that is
     * not empty). Every remaining configuration yields one table row; if the page is listed in the
     * configuration's exclude string, a "No entries" row is rendered instead of the URL list.
     *
     * @param array $pageRow Page record; "uid" is compared against the expanded exclude list
     * @param string $pageTitle HTML for the title cell; it is inserted as-is, so it must already be safe markup
     * @return string HTML table rows for this page
     */
    public function drawURLs_addRowsForPage(array $pageRow, string $pageTitle): string
    {
        $skipMessage = '';

        // Get list of configurations ($skipMessage is presumably filled by reference - verify against callee)
        $configurations = $this->getUrlsForPageRow($pageRow, $skipMessage);

        if (! empty($this->incomingConfigurationSelection)) {
            // remove configuration that does not match the current selection
            foreach ($configurations as $confKey => $confArray) {
                if (! in_array($confKey, $this->incomingConfigurationSelection, true)) {
                    unset($configurations[$confKey]);
                }
            }
        }

        if (empty($configurations)) {
            $message = ! empty($skipMessage) ? ' (' . $skipMessage . ')' : '';

            // Compile row:
            return '
                <tr>
                    <td>' . $pageTitle . '</td>
                    <td colspan="6"><em>No entries</em>' . $message . '</td>
                </tr>';
        }

        // expandExcludeString() returns integers; cast the uid so the strict comparison below also
        // matches when the page row carries the uid as a numeric string.
        $pageUid = (int) $pageRow['uid'];

        // Traverse parameter combinations:
        $content = '';
        $isFirstRow = true;
        foreach ($configurations as $confKey => $confArray) {
            // Title column: rendered once and spanning all configuration rows of this page
            $titleClm = $isFirstRow
                ? '<td rowspan="' . count($configurations) . '">' . $pageTitle . '</td>'
                : '';
            $isFirstRow = false;

            $excludedPageIds = $this->expandExcludeString($confArray['subCfg']['exclude'] ?? '');

            if (in_array($pageUid, $excludedPageIds, true)) {
                $content .= '<tr>
                            ' . $titleClm . '
                            <td>' . htmlspecialchars((string) $confKey) . '</td>
                            <td colspan="5"><em>No entries</em> (Page is excluded in this configuration)</td>
                        </tr>';
                continue;
            }

            // URL list:
            $urlList = $this->urlListFromUrlArray(
                $confArray,
                $pageRow,
                $this->scheduledTime,
                $this->reqMinute,
                $this->submitCrawlUrls,
                $this->downloadCrawlUrls,
                $this->duplicateTrack,
                $this->downloadUrls,
                // if empty the urls won't be filtered by processing instructions
                $this->incomingProcInstructions
            );

            // Expanded parameters:
            $paramExpanded = '';
            $calcAccu = [];
            $calcRes = 1;
            foreach ($confArray['paramExpanded'] as $gVar => $gVal) {
                $valueCount = count($gVal);
                $paramExpanded .= '
                            <tr>
                                <td>' . htmlspecialchars('&' . $gVar . '=') . '<br/>' .
                            '(' . $valueCount . ')' .
                            '</td>
                                <td nowrap="nowrap">' . nl2br(htmlspecialchars(implode(chr(10), $gVal))) . '</td>
                            </tr>
                        ';
                $calcRes *= $valueCount;
                $calcAccu[] = $valueCount;
            }
            $paramExpanded = '<table>' . $paramExpanded . '</table>';
            $paramExpanded .= 'Comb: ' . implode('*', $calcAccu) . '=' . $calcRes;

            // Options
            $optionValues = '';
            if (! empty($confArray['subCfg']['userGroups'])) {
                $optionValues .= 'User Groups: ' . $confArray['subCfg']['userGroups'] . '<br/>';
            }
            if (! empty($confArray['subCfg']['procInstrFilter'])) {
                $optionValues .= 'ProcInstr: ' . $confArray['subCfg']['procInstrFilter'] . '<br/>';
            }

            // Compile row:
            $content .= '
                        <tr>
                            ' . $titleClm . '
                            <td>' . htmlspecialchars((string) $confKey) . '</td>
                            <td>' . nl2br(htmlspecialchars(rawurldecode(trim(str_replace('&', chr(10) . '&', GeneralUtility::implodeArrayForUrl('', $confArray['paramParsed'])))))) . '</td>
                            <td>' . $paramExpanded . '</td>
                            <td nowrap="nowrap">' . $urlList . '</td>
                            <td nowrap="nowrap">' . $optionValues . '</td>
                            <td nowrap="nowrap">' . DebugUtility::viewArray($confArray['subCfg']['procInstrParams.']) . '</td>
                        </tr>';
        }

        return $content;
    }
1573
1574
    /*****************************
1575
     *
1576
     * CLI functions
1577
     *
1578
     *****************************/
1579
1580
    /**
     * Running the functionality of the CLI (crawling URLs from queue).
     *
     * Runs the CLI hooks, cleans up the queue, assigns up to $countInARun queue entries to this process
     * and reads them one by one.
     *
     * @param int $countInARun Passed to the queue repository as the number of records to fetch
     * @param int $sleepTime Pause after each processed entry, handed to usleep()
     * @param int $sleepAfterFinish Pause after the whole batch, handed to sleep()
     * @return int Bitmask built from the readUrl() results and the CLI_STATUS_* constants
     */
    public function CLI_run(int $countInARun, int $sleepTime, int $sleepAfterFinish): int
    {
        $result = 0;

        // First, run hooks:
        $this->CLI_runHooks();

        // Clean up the queue
        $this->queueRepository->cleanupQueue();

        // Select entries:
        $rows = $this->queueRepository->fetchRecordsToBeCrawled($countInARun);

        if (empty($rows)) {
            $this->CLI_debug('Nothing within queue which needs to be processed (' . $this->CLI_buildProcessId() . ')');

            return $result;
        }

        $queueIds = [];
        foreach ($rows as $row) {
            $queueIds[] = $row['qid'];
        }

        $processId = $this->CLI_buildProcessId();

        //save the number of assigned queue entries to determine how many have been processed later
        $assignedCount = $this->queueRepository->updateProcessIdAndSchedulerForQueueIds($queueIds, $processId);
        $this->processRepository->updateProcessAssignItemsCount($assignedCount, $processId);

        if ($assignedCount !== count($queueIds)) {
            $this->CLI_debug('Nothing processed due to multi-process collision (' . $this->CLI_buildProcessId() . ')');

            return $result | self::CLI_STATUS_ABORTED;
        }

        $processedCount = 0;
        foreach ($rows as $row) {
            $result |= $this->readUrl($row['qid']);
            $processedCount++;

            // Just to relax the system
            usleep($sleepTime);

            // If the crawler got disabled while this batch was running, return immediately;
            // the process is deliberately NOT marked as ended here.
            if ($this->crawler->isDisabled()) {
                return $result | self::CLI_STATUS_ABORTED;
            }

            if (! $this->processRepository->isProcessActive($this->CLI_buildProcessId())) {
                $this->CLI_debug('conflict / timeout (' . $this->CLI_buildProcessId() . ')');
                $result |= self::CLI_STATUS_ABORTED;
                //possible timeout
                break;
            }
        }

        sleep($sleepAfterFinish);

        $this->CLI_debug('Rows: ' . $processedCount . ' (' . $this->CLI_buildProcessId() . ')');

        if ($processedCount > 0) {
            $result |= self::CLI_STATUS_PROCESSED;
        }

        return $result;
    }
1650
1651
    /**
     * Activate hooks: instantiates every class registered under
     * $GLOBALS['TYPO3_CONF_VARS']['EXTCONF']['crawler']['cli_hooks'] and calls crawler_init() on it,
     * passing this controller.
     */
    public function CLI_runHooks(): void
    {
        $hookClassNames = $GLOBALS['TYPO3_CONF_VARS']['EXTCONF']['crawler']['cli_hooks'] ?? [];

        foreach ($hookClassNames as $hookClassName) {
            $hook = GeneralUtility::makeInstance($hookClassName);
            if (! is_object($hook)) {
                continue;
            }

            $hook->crawler_init($this);
        }
    }
1663
1664
    /**
     * Try to acquire a new process with the given id.
     * Also performs some auto-cleanup: active processes whose ttl has passed are released afterwards.
     *
     * @param string $id identification string for the process
     * @return bool true if a process row was inserted, false if no system pid is available
     *              or the configured process limit is reached
     * @todo preemption might not be the most elegant way to clean up
     */
    public function CLI_checkAndAcquireNewProcess($id)
    {
        $systemProcessId = getmypid();
        if (! $systemProcessId) {
            return false;
        }

        $activeProcesses = $this->processRepository->findAllActive();
        $currentTime = $this->getCurrentTime();

        // Split the active processes into expired ones (orphans) and ones still counting against the limit.
        $runningCount = 0;
        $orphanProcessIds = [];
        /** @var Process $process */
        foreach ($activeProcesses as $process) {
            if ($process->getTtl() < $currentTime) {
                $orphanProcessIds[] = $process->getProcessId();
                continue;
            }
            $runningCount++;
        }

        // if there are less than allowed active processes then add a new one
        $acquired = $runningCount < (int) $this->extensionSettings['processLimit'];
        if ($acquired) {
            $this->CLI_debug('add process ' . $this->CLI_buildProcessId() . ' (' . ($runningCount + 1) . '/' . (int) $this->extensionSettings['processLimit'] . ')');

            $processRow = [
                'process_id' => $id,
                'active' => 1,
                'ttl' => $currentTime + (int) $this->extensionSettings['processMaxRunTime'],
                'system_process_id' => $systemProcessId,
            ];
            GeneralUtility::makeInstance(ConnectionPool::class)
                ->getConnectionForTable('tx_crawler_process')
                ->insert('tx_crawler_process', $processRow);
        } else {
            $this->CLI_debug('Processlimit reached (' . ($runningCount) . '/' . (int) $this->extensionSettings['processLimit'] . ')');
        }

        $this->processRepository->deleteProcessesMarkedAsDeleted();
        $this->CLI_releaseProcesses($orphanProcessIds);

        return $acquired;
    }
1718
1719
    /**
     * Release a process and the required resources.
     *
     * Besides releasing the given ids, two general clean-up statements are executed: queue entries
     * that belong to inactive processes are detached, and inactive processes that still own
     * unexecuted queue entries get their system process id cleared.
     *
     * @param mixed $releaseIds string with a single process-id or array with multiple process-ids
     * @return bool false if there was nothing to release, true otherwise
     */
    public function CLI_releaseProcesses($releaseIds)
    {
        if (! is_array($releaseIds)) {
            $releaseIds = [$releaseIds];
        }

        if (empty($releaseIds)) {
            //nothing to release
            return false;
        }

        $connectionPool = GeneralUtility::makeInstance(ConnectionPool::class);

        // some kind of 2nd chance algo - this way you need at least 2 processes to have a real cleanup
        // this ensures that a single process can't mess up the entire process table

        // Detach all queue entries that are still assigned to a process which is no longer active.
        $connectionPool->getQueryBuilderForTable($this->tableName)
            ->update($this->tableName, 'q')
            ->where(
                'q.process_id IN(SELECT p.process_id FROM tx_crawler_process as p WHERE p.active = 0)'
            )
            ->set('q.process_scheduled', 0)
            ->set('q.process_id', '')
            ->execute();

        // Clear the system process id of inactive processes that still own unexecuted queue entries.
        // A dedicated builder is used so no state of the previous statement (bound parameters,
        // table alias) leaks into this one.
        $processQueryBuilder = $connectionPool->getQueryBuilderForTable('tx_crawler_process');
        $processQueryBuilder
            ->update('tx_crawler_process')
            ->where(
                $processQueryBuilder->expr()->eq('active', 0),
                'process_id IN(SELECT q.process_id FROM tx_crawler_queue as q WHERE q.exec_time = 0)'
            )
            ->set('system_process_id', 0)
            ->execute();

        $this->processRepository->markRequestedProcessesAsNotActive($releaseIds);
        $this->queueRepository->unsetProcessScheduledAndProcessIdForQueueEntries($releaseIds);

        return true;
    }
1769
1770
    /**
     * Returns the id of the current process; it is generated from microtime() on first use
     * and kept in $this->processID afterwards.
     *
     * @return string the ID
     */
    public function CLI_buildProcessId()
    {
        if ($this->processID) {
            return $this->processID;
        }

        $this->processID = GeneralUtility::shortMD5(microtime(true));

        return $this->processID;
    }
1782
1783
    /**
     * Echoes a message followed by a newline and flushes the output,
     * but only if the extension setting "processDebug" is enabled.
     *
     * @param string $msg the message
     * @deprecated
     * @codeCoverageIgnore
     */
    public function CLI_debug($msg): void
    {
        $debugEnabled = (int) $this->extensionSettings['processDebug'] !== 0;
        if (! $debugEnabled) {
            return;
        }

        echo $msg . "\n";
        flush();
    }
1797
1798
    /**
     * Cleans up entries that stayed for too long in the queue. These are:
     * - processed entries (exec_time set) older than the setting "cleanUpProcessedAge", given in days
     * - entries scheduled at or before now minus the setting "cleanUpScheduledAge", given in days
     *
     * @deprecated
     */
    public function cleanUpOldQueueEntries(): void
    {
        // 24*60*60 Seconds in 24 hours
        $secondsPerDay = 86400;
        $now = time();

        $processedBefore = $now - $this->extensionSettings['cleanUpProcessedAge'] * $secondsPerDay;
        $scheduledBefore = $now - $this->extensionSettings['cleanUpScheduledAge'] * $secondsPerDay;

        $this->flushQueue(
            '(exec_time<>0 AND exec_time<' . $processedBefore . ') OR scheduled<=' . $scheduledBefore
        );
    }
1815
1816
    /**
     * Removes queue entries.
     *
     * Before the rows are deleted, SIGNAL_QUEUE_ENTRY_FLUSH is emitted once per affected set_id with
     * the matching qid/set_id rows as payload.
     *
     * @param string $where SQL related filter for the entries which should be removed. The string is
     *                      used as raw SQL, so it must never contain unescaped external input.
     *                      An empty filter removes all entries.
     *
     * @deprecated
     */
    protected function flushQueue($where = ''): void
    {
        $realWhere = strlen((string) $where) > 0 ? $where : '1=1';

        // Every statement gets its own query builder, so no FROM parts, conditions or parameters
        // of an earlier statement can leak into a later one.
        $groups = $this->getQueryBuilder($this->tableName)
            ->selectLiteral('DISTINCT set_id')
            ->from($this->tableName)
            ->where($realWhere)
            ->execute()
            ->fetchAll();

        foreach (is_array($groups) ? $groups : [] as $group) {
            $subSetQueryBuilder = $this->getQueryBuilder($this->tableName);
            $subSet = $subSetQueryBuilder
                ->select('qid', 'set_id')
                ->from($this->tableName)
                ->where(
                    $realWhere,
                    $subSetQueryBuilder->expr()->eq(
                        'set_id',
                        $subSetQueryBuilder->createNamedParameter($group['set_id'], \PDO::PARAM_INT)
                    )
                )
                ->execute()
                ->fetchAll();

            $payLoad = ['subSet' => $subSet];
            SignalSlotUtility::emitSignal(
                self::class,
                SignalSlotUtility::SIGNAL_QUEUE_ENTRY_FLUSH,
                $payLoad
            );
        }

        $this->getQueryBuilder($this->tableName)
            ->delete($this->tableName)
            ->where($realWhere)
            ->execute();
    }
1861
1862
    /**
     * Looks up queue entries that duplicate the given one: same page_id and parameters_hash,
     * not yet executed and not assigned to a process.
     *
     * If the timestamp is not in the future, every entry scheduled up to now counts (with the
     * setting "enableTimeslot" additionally those scheduled within 100 seconds around now).
     * If the timestamp is in the future, the entry must have exactly the same schedule time.
     *
     * @param int $tstamp
     * @param array $fieldArray Needs the keys "page_id" and "parameters_hash"
     *
     * @return array qids of the duplicate entries
     */
    protected function getDuplicateRowsIfExist($tstamp, $fieldArray)
    {
        $now = $this->getCurrentTime();

        $queryBuilder = GeneralUtility::makeInstance(ConnectionPool::class)->getQueryBuilderForTable($this->tableName);
        $queryBuilder->select('qid')->from('tx_crawler_queue');

        if ($tstamp > $now) {
            //entry with a timestamp in the future need to have the same schedule time
            $queryBuilder->where($queryBuilder->expr()->eq('scheduled', $tstamp));
        } elseif ($this->extensionSettings['enableTimeslot']) {
            //this entry is scheduled with "now": accept the time slot around now or anything older
            $slotBegin = $now - 100;
            $slotEnd = $now + 100;
            $queryBuilder->where('scheduled BETWEEN ' . $slotBegin . ' AND ' . $slotEnd . '');
            $queryBuilder->orWhere($queryBuilder->expr()->lte('scheduled', $now));
        } else {
            //this entry is scheduled with "now"
            $queryBuilder->where($queryBuilder->expr()->lte('scheduled', $now));
        }

        $pageIdParameter = $queryBuilder->createNamedParameter($fieldArray['page_id'], \PDO::PARAM_INT);
        $hashParameter = $queryBuilder->createNamedParameter($fieldArray['parameters_hash'], \PDO::PARAM_STR);

        $queryBuilder->andWhere('NOT exec_time');
        $queryBuilder->andWhere('NOT process_id');
        $queryBuilder->andWhere($queryBuilder->expr()->eq('page_id', $pageIdParameter));
        $queryBuilder->andWhere($queryBuilder->expr()->eq('parameters_hash', $hashParameter));

        $duplicateQueueIds = [];
        $statement = $queryBuilder->execute();
        while ($row = $statement->fetch()) {
            $duplicateQueueIds[] = $row['qid'];
        }

        return $duplicateQueueIds;
    }
1922
1923
    /**
     * Returns a md5 hash generated from the serialized configuration array,
     * leaving the entries "paramExpanded" and "URLs" out of the hash.
     *
     * @return string
     */
    protected function getConfigurationHash(array $configuration)
    {
        $ignoredKeys = ['paramExpanded' => true, 'URLs' => true];
        $hashRelevantConfiguration = array_diff_key($configuration, $ignoredKeys);

        return md5(serialize($hashRelevantConfiguration));
    }
1934
1935
    /**
     * Build a URL from a Page and the Query String. This is a thin wrapper that delegates to
     * UrlService::getUrlFromPageAndQueryParameters() with the same arguments.
     *
     * @param int $httpsOrHttp see tx_crawler_configuration.force_ssl
     * @throws \TYPO3\CMS\Core\Exception\SiteNotFoundException
     * @throws \TYPO3\CMS\Core\Routing\InvalidRouteArgumentsException
     *
     * @deprecated Using CrawlerController::getUrlFromPageAndQueryParameters() is deprecated since 9.1.1 and will be removed in v11.x, please use UrlService->getUrlFromPageAndQueryParameters() instead.
     * @codeCoverageIgnore
     */
    protected function getUrlFromPageAndQueryParameters(int $pageId, string $queryString, ?string $alternativeBaseUrl, int $httpsOrHttp): UriInterface
    {
        return (new UrlService())->getUrlFromPageAndQueryParameters(
            $pageId,
            $queryString,
            $alternativeBaseUrl,
            $httpsOrHttp
        );
    }
1951
1952 1
    /**
     * Makes sure the value at index 1 is not larger than the value at index 2 by swapping them if needed.
     * All other entries of the array are left untouched.
     *
     * @param array $reg Array with at least the indexes 1 and 2
     * @return array The array with indexes 1 and 2 in ascending order
     */
    protected function swapIfFirstIsLargerThanSecond(array $reg): array
    {
        if (! ($reg[1] > $reg[2])) {
            return $reg;
        }

        [$reg[1], $reg[2]] = [$reg[2], $reg[1]];

        return $reg;
    }
1963
1964
    /**
     * Returns the backend user, taking it from $GLOBALS['BE_USER'] on first use
     * and keeping it in $this->backendUser afterwards.
     *
     * @return BackendUserAuthentication
     */
    private function getBackendUser()
    {
        // Make sure the _cli_ user is loaded
        Bootstrap::initializeBackendAuthentication();

        $this->backendUser = $this->backendUser ?? $GLOBALS['BE_USER'];

        return $this->backendUser;
    }
1976
1977
    /**
     * Get a query builder for the given table from the connection pool.
     *
     * @return \TYPO3\CMS\Core\Database\Query\QueryBuilder
     */
    private function getQueryBuilder(string $table)
    {
        $connectionPool = GeneralUtility::makeInstance(ConnectionPool::class);

        return $connectionPool->getQueryBuilderForTable($table);
    }
1986
}
1987