Passed
Push — l10n_master ( f2f1a9...57c117 )
by Tomas Norre
09:06 queued 04:57
created

CrawlerController::expandParameters()   F

Complexity

Conditions 25
Paths 831

Size

Total Lines 129
Code Lines 74

Duplication

Lines 0
Ratio 0 %

Code Coverage

Tests 57
CRAP Score 28.2868

Importance

Changes 1
Bugs 0 Features 0
Metric Value
cc 25
eloc 74
c 1
b 0
f 0
nc 831
nop 2
dl 0
loc 129
ccs 57
cts 69
cp 0.8261
crap 28.2868
rs 0.2347

How to fix   Long Method    Complexity   

Long Method

Small methods make your code easier to understand, in particular if combined with a good name. Besides, if your method is small, finding a good name is usually much easier.

For example, if you find yourself adding comments to a method's body, this is usually a good sign to extract the commented part to a new method, and use the comment as a starting point when coming up with a good name for this new method.

Commonly applied refactorings include:

1
<?php
2
3
declare(strict_types=1);
4
5
namespace AOE\Crawler\Controller;
6
7
/***************************************************************
8
 *  Copyright notice
9
 *
10
 *  (c) 2020 AOE GmbH <[email protected]>
11
 *
12
 *  All rights reserved
13
 *
14
 *  This script is part of the TYPO3 project. The TYPO3 project is
15
 *  free software; you can redistribute it and/or modify
16
 *  it under the terms of the GNU General Public License as published by
17
 *  the Free Software Foundation; either version 3 of the License, or
18
 *  (at your option) any later version.
19
 *
20
 *  The GNU General Public License can be found at
21
 *  http://www.gnu.org/copyleft/gpl.html.
22
 *
23
 *  This script is distributed in the hope that it will be useful,
24
 *  but WITHOUT ANY WARRANTY; without even the implied warranty of
25
 *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
26
 *  GNU General Public License for more details.
27
 *
28
 *  This copyright notice MUST APPEAR in all copies of the script!
29
 ***************************************************************/
30
31
use AOE\Crawler\Configuration\ExtensionConfigurationProvider;
32
use AOE\Crawler\Converter\JsonCompatibilityConverter;
33
use AOE\Crawler\Crawler;
34
use AOE\Crawler\CrawlStrategy\CrawlStrategyFactory;
35
use AOE\Crawler\Domain\Model\Process;
36
use AOE\Crawler\Domain\Repository\ConfigurationRepository;
37
use AOE\Crawler\Domain\Repository\ProcessRepository;
38
use AOE\Crawler\Domain\Repository\QueueRepository;
39
use AOE\Crawler\QueueExecutor;
40
use AOE\Crawler\Service\UrlService;
41
use AOE\Crawler\Utility\SignalSlotUtility;
42
use AOE\Crawler\Value\QueueFilter;
43
use Psr\Http\Message\UriInterface;
44
use Psr\Log\LoggerAwareInterface;
45
use Psr\Log\LoggerAwareTrait;
46
use TYPO3\CMS\Backend\Tree\View\PageTreeView;
47
use TYPO3\CMS\Backend\Utility\BackendUtility;
48
use TYPO3\CMS\Core\Authentication\BackendUserAuthentication;
49
use TYPO3\CMS\Core\Compatibility\PublicMethodDeprecationTrait;
50
use TYPO3\CMS\Core\Compatibility\PublicPropertyDeprecationTrait;
51
use TYPO3\CMS\Core\Core\Bootstrap;
52
use TYPO3\CMS\Core\Core\Environment;
53
use TYPO3\CMS\Core\Database\Connection;
54
use TYPO3\CMS\Core\Database\ConnectionPool;
55
use TYPO3\CMS\Core\Database\Query\Restriction\DeletedRestriction;
56
use TYPO3\CMS\Core\Imaging\Icon;
57
use TYPO3\CMS\Core\Imaging\IconFactory;
58
use TYPO3\CMS\Core\Site\Entity\Site;
59
use TYPO3\CMS\Core\Type\Bitmask\Permission;
60
use TYPO3\CMS\Core\TypoScript\Parser\TypoScriptParser;
61
use TYPO3\CMS\Core\Utility\DebugUtility;
62
use TYPO3\CMS\Core\Utility\GeneralUtility;
63
use TYPO3\CMS\Core\Utility\MathUtility;
64
use TYPO3\CMS\Extbase\Object\ObjectManager;
65
use TYPO3\CMS\Frontend\Page\PageRepository;
66
67
/**
68
 * Class CrawlerController
69
 *
70
 * @package AOE\Crawler\Controller
71
 */
72
class CrawlerController implements LoggerAwareInterface
73
{
74
    use LoggerAwareTrait;
75
    use PublicMethodDeprecationTrait;
76
    use PublicPropertyDeprecationTrait;
77
78
    public const CLI_STATUS_NOTHING_PROCCESSED = 0;
79
80
    //queue not empty
81
    public const CLI_STATUS_REMAIN = 1;
82
83
    // (some) queue items were processed
84
    public const CLI_STATUS_PROCESSED = 2;
85
86
    //instance didn't finish
87
    public const CLI_STATUS_ABORTED = 4;
88
89
    public const CLI_STATUS_POLLABLE_PROCESSED = 8;
90
91
    /**
92
     * @var integer
93
     */
94
    public $setID = 0;
95
96
    /**
97
     * @var string
98
     */
99
    public $processID = '';
100
101
    /**
102
     * @var array
103
     */
104
    public $duplicateTrack = [];
105
106
    /**
107
     * @var array
108
     */
109
    public $downloadUrls = [];
110
111
    /**
112
     * @var array
113
     */
114
    public $incomingProcInstructions = [];
115
116
    /**
117
     * @var array
118
     */
119
    public $incomingConfigurationSelection = [];
120
121
    /**
122
     * @var bool
123
     */
124
    public $registerQueueEntriesInternallyOnly = false;
125
126
    /**
127
     * @var array
128
     */
129
    public $queueEntries = [];
130
131
    /**
132
     * @var array
133
     */
134
    public $urlList = [];
135
136
    /**
137
     * @var array
138
     */
139
    public $extensionSettings = [];
140
141
    /**
142
     * Mount Point
143
     *
144
     * @var bool
145
     * Todo: Check what this is used for and adjust the type hint or code, as bool doesn't match the current code.
146
     */
147
    public $MP = false;
148
149
    /**
150
     * @var string
151
     * @deprecated
152
     */
153
    protected $processFilename;
154
155
    /**
156
     * Holds the internal access mode can be 'gui','cli' or 'cli_im'
157
     *
158
     * @var string
159
     * @deprecated
160
     */
161
    protected $accessMode;
162
163
    /**
164
     * @var QueueRepository
165
     */
166
    protected $queueRepository;
167
168
    /**
169
     * @var ProcessRepository
170
     */
171
    protected $processRepository;
172
173
    /**
174
     * @var ConfigurationRepository
175
     */
176
    protected $configurationRepository;
177
178
    /**
179
     * @var string
180
     */
181
    protected $tableName = 'tx_crawler_queue';
182
183
    /**
184
     * @var QueueExecutor
185
     */
186
    protected $queueExecutor;
187
188
    /**
189
     * @var int
190
     */
191
    protected $maximumUrlsToCompile = 10000;
192
193
    /**
194
     * @var IconFactory
195
     */
196
    protected $iconFactory;
197
198
    /**
199
     * @var string[]
200
     */
201
    private $deprecatedPublicMethods = [
0 ignored issues
show
introduced by
The private property $deprecatedPublicMethods is not used, and could be removed.
Loading history...
202
        'cleanUpOldQueueEntries' => 'Using CrawlerController::cleanUpOldQueueEntries() is deprecated since 9.0.1 and will be removed in v11.x, please use QueueRepository->cleanUpOldQueueEntries() instead.',
203
        'CLI_debug' => 'Using CrawlerController->CLI_debug() is deprecated since 9.1.3 and will be removed in v11.x',
204
        'getAccessMode' => 'Using CrawlerController->getAccessMode() is deprecated since 9.1.3 and will be removed in v11.x',
205
        'getLogEntriesForSetId' => 'Using crawlerController::getLogEntriesForSetId() is deprecated since 9.0.1 and will be removed in v11.x',
206
        'flushQueue' => 'Using CrawlerController::flushQueue() is deprecated since 9.0.1 and will be removed in v11.x, please use QueueRepository->flushQueue() instead.',
207
        'setAccessMode' => 'Using CrawlerController->setAccessMode() is deprecated since 9.1.3 and will be removed in v11.x',
208
        'getDisabled' => 'Using CrawlerController->getDisabled() is deprecated since 9.1.3 and will be removed in v11.x, please use Crawler->isDisabled() instead',
209
        'setDisabled' => 'Using CrawlerController->setDisabled() is deprecated since 9.1.3 and will be removed in v11.x, please use Crawler->setDisabled() instead',
210
        'getProcessFilename' => 'Using CrawlerController->getProcessFilename() is deprecated since 9.1.3 and will be removed in v11.x',
211
        'setProcessFilename' => 'Using CrawlerController->setProcessFilename() is deprecated since 9.1.3 and will be removed in v11.x',
212
        'getDuplicateRowsIfExist' => 'Using CrawlerController->getDuplicateRowsIfExist() is deprecated since 9.1.4 and will be remove in v11.x, please use QueueRepository->getDuplicateQueueItemsIfExists() instead',
213
214
    ];
215
216
    /**
     * Deprecation messages keyed by the name of the deprecated property.
     *
     * NOTE(review): presumably consumed by PublicPropertyDeprecationTrait (used by
     * this class), which is why static analysis reports the property as unused.
     *
     * @var string[]
     */
    private $deprecatedPublicProperties = [
        'accessMode' => 'Using CrawlerController->accessMode is deprecated since 9.1.3 and will be removed in v11.x',
        // The message must name the property it is registered for (it previously said "accessMode").
        'processFilename' => 'Using CrawlerController->processFilename is deprecated since 9.1.3 and will be removed in v11.x',
    ];
223
224
    /**
225
     * @var BackendUserAuthentication|null
226
     */
227
    private $backendUser;
228
229
    /**
230
     * @var integer
231
     */
232
    private $scheduledTime = 0;
233
234
    /**
235
     * @var integer
236
     */
237
    private $reqMinute = 0;
238
239
    /**
240
     * @var bool
241
     */
242
    private $submitCrawlUrls = false;
243
244
    /**
245
     * @var bool
246
     */
247
    private $downloadCrawlUrls = false;
248
249
    /**
250
     * @var PageRepository
251
     */
252
    private $pageRepository;
253
254
    /**
255
     * @var Crawler
256
     */
257
    private $crawler;
258
259
    /************************************
260
     *
261
     * Getting URLs based on Page TSconfig
262
     *
263
     ************************************/
264
265 36
    /**
     * Creates the repositories and helper services used by the controller,
     * sets the default process lock file and normalises the extension settings.
     *
     * Settings that are missing or evaluate to 0 fall back to defaults:
     * "countInARun" becomes 100, "processLimit" is forced into the range 1..99
     * (default 1) and "maxCompileUrls" into 1..1000000000 (default 10000).
     */
    public function __construct()
    {
        $objectManager = GeneralUtility::makeInstance(ObjectManager::class);
        $crawlStrategyFactory = GeneralUtility::makeInstance(CrawlStrategyFactory::class);
        $this->queueRepository = $objectManager->get(QueueRepository::class);
        $this->processRepository = $objectManager->get(ProcessRepository::class);
        $this->configurationRepository = $objectManager->get(ConfigurationRepository::class);
        $this->pageRepository = $objectManager->get(PageRepository::class);
        $this->queueExecutor = GeneralUtility::makeInstance(QueueExecutor::class, $crawlStrategyFactory);
        $this->iconFactory = GeneralUtility::makeInstance(IconFactory::class);
        $this->crawler = GeneralUtility::makeInstance(Crawler::class);

        $this->processFilename = Environment::getVarPath() . '/lock/tx_crawler.proc';

        /** @var ExtensionConfigurationProvider $configurationProvider */
        $configurationProvider = GeneralUtility::makeInstance(ExtensionConfigurationProvider::class);
        $settings = $configurationProvider->getExtensionConfiguration();
        $this->extensionSettings = is_array($settings) ? $settings : [];

        // "?? 0" keeps an absent setting from raising an undefined-index notice;
        // 0 is treated the same as "not configured" by the checks below.
        if (MathUtility::convertToPositiveInteger($this->extensionSettings['countInARun'] ?? 0) === 0) {
            $this->extensionSettings['countInARun'] = 100;
        }

        $this->extensionSettings['processLimit'] = MathUtility::forceIntegerInRange(
            $this->extensionSettings['processLimit'] ?? 0,
            1,
            99,
            1
        );
        $this->maximumUrlsToCompile = MathUtility::forceIntegerInRange(
            $this->extensionSettings['maxCompileUrls'] ?? 0,
            1,
            1000000000,
            10000
        );
    }
292
293
    /**
     * Returns the current access mode ('gui', 'cli' or 'cli_im').
     *
     * @return string
     * @deprecated
     */
    public function getAccessMode()
    {
        $currentAccessMode = $this->accessMode;

        return $currentAccessMode;
    }
303
304
    /**
     * Stores the access mode ('gui', 'cli' or 'cli_im') on the controller.
     *
     * @param string $accessMode
     * @deprecated
     */
    public function setAccessMode($accessMode): void
    {
        $this->accessMode = $accessMode;
    }
312
313
    /**
     * Toggles the "disabled" marker file that prevents processes from being processed.
     *
     * Disabling writes an empty file at the configured process filename;
     * enabling removes that file if it exists.
     *
     * @param bool $disabled (optional, defaults to true)
     * @deprecated
     */
    public function setDisabled($disabled = true): void
    {
        $markerFile = $this->processFilename;

        if ($disabled) {
            GeneralUtility::writeFile($markerFile, '');

            return;
        }

        if (is_file($markerFile)) {
            unlink($markerFile);
        }
    }
329
330
    /**
     * Reports whether the "disabled" marker file currently exists.
     *
     * @return bool true if disabled
     * @deprecated
     */
    public function getDisabled()
    {
        $markerFileExists = is_file($this->processFilename);

        return $markerFileExists;
    }
340
341
    /**
     * Overrides the path of the file used as the "disabled" marker.
     *
     * @param string $filenameWithPath
     * @deprecated
     */
    public function setProcessFilename($filenameWithPath): void
    {
        $this->processFilename = $filenameWithPath;
    }
349
350
    /**
     * Returns the path of the file used as the "disabled" marker.
     *
     * @return string
     * @deprecated
     */
    public function getProcessFilename()
    {
        $markerFile = $this->processFilename;

        return $markerFile;
    }
358
359
    /**
     * Replaces the extension settings held by the controller
     * (unserialized pendant of $TYPO3_CONF_VARS['EXT']['extConf']['crawler']).
     *
     * The array is stored as given; the defaults applied in the constructor are not re-applied here.
     */
    public function setExtensionSettings(array $extensionSettings): void
    {
        $this->extensionSettings = $extensionSettings;
    }
366
367
    /**
     * Decides whether the given page must be left out of the crawl.
     *
     * The rules are evaluated in this order and the first match wins:
     *  1. the page is hidden and the "crawlHiddenPages" setting is not enabled,
     *  2. the doktype is one of 3, 4, 199, 254, 255,
     *  3. the doktype is listed in $GLOBALS['TYPO3_CONF_VARS']['EXTCONF']['crawler']['excludeDoktype'],
     *  4. a hook in $GLOBALS['TYPO3_CONF_VARS']['EXTCONF']['crawler']['pageVeto'] returns anything but false.
     *
     * @param array $pageRow Page record; "doktype" is always read, "hidden" only when hidden pages are not crawled
     * @return false|string false if the page should be crawled, otherwise the reason why it is skipped
     */
    public function checkIfPageShouldBeSkipped(array $pageRow)
    {
        // empty() tolerates a missing setting / column instead of raising an undefined-index notice.
        if (empty($this->extensionSettings['crawlHiddenPages']) && ! empty($pageRow['hidden'])) {
            return 'Because page is hidden';
        }

        if (GeneralUtility::inList('3,4,199,254,255', $pageRow['doktype'])) {
            return 'Because doktype is not allowed';
        }

        foreach ($GLOBALS['TYPO3_CONF_VARS']['EXTCONF']['crawler']['excludeDoktype'] ?? [] as $key => $doktypeList) {
            if (GeneralUtility::inList($doktypeList, $pageRow['doktype'])) {
                return 'Doktype was excluded by "' . $key . '"';
            }
        }

        // veto hooks
        foreach ($GLOBALS['TYPO3_CONF_VARS']['EXTCONF']['crawler']['pageVeto'] ?? [] as $key => $func) {
            $params = [
                'pageRow' => $pageRow,
            ];
            // expects "false" if page is ok and "true" or a skipMessage if this page should _not_ be crawled
            $veto = GeneralUtility::callUserFunction($func, $params, $this);
            if ($veto === false) {
                continue;
            }

            // The first veto ends the evaluation; later hooks are not executed.
            if (is_string($veto)) {
                return $veto;
            }

            // Hooks may be registered under numeric keys; with strict_types enabled
            // htmlspecialchars() rejects an int, hence the explicit cast.
            return 'Veto from hook "' . htmlspecialchars((string) $key) . '"';
        }

        return false;
    }
426
427
    /**
     * Wrapper method for getUrlsForPageId().
     * It returns an array of configurations and no urls!
     *
     * The page is first run through checkIfPageShouldBeSkipped(); when it is
     * skipped, an empty array is returned and the reason is written to $skipMessage.
     *
     * @param array $pageRow Page record with at least dok-type and uid columns.
     * @param string $skipMessage Passed by reference: set to the skip reason, or to '' when the page is not skipped
     * @return array
     * @see getUrlsForPageId()
     */
    public function getUrlsForPageRow(array $pageRow, &$skipMessage = '')
    {
        $message = $this->checkIfPageShouldBeSkipped($pageRow);
        if ($message !== false) {
            $skipMessage = $message;

            return [];
        }

        // getUrlsForPageId() is declared with an int parameter and this file uses
        // strict types, so a uid that arrives as a string would throw a TypeError
        // without the cast (urlListFromUrlArray() casts the uid the same way).
        $res = $this->getUrlsForPageId((int) $pageRow['uid']);
        $skipMessage = '';

        return $res;
    }
449
450
    /**
     * Creates a list of URLs from input array (and submits them to queue if asked for)
     * See Web > Info module script + "indexed_search"'s crawler hook-client using this!
     *
     * URLs whose key (url + user groups + processing instruction filter) is already
     * present in $duplicateTrack are only listed (muted), never submitted again.
     *
     * @param array $vv Information about URLs from pageRow to crawl.
     * @param array $pageRow Page row
     * @param int $scheduledTime Unix time to schedule indexing to, typically time()
     * @param int $reqMinute Number of requests per minute (creates the interleave between requests);
     *                       a value of 0 or less disables the interleave
     * @param bool $submitCrawlUrls If set, submits the URLs to queue
     * @param bool $downloadCrawlUrls If set (and submitcrawlUrls is false) will fill $downloadUrls with entries)
     * @param array $duplicateTrack Array which is passed by reference and contains an id per url to make sure duplicates are not crawled
     * @param array $downloadUrls Array which will be filled with URLS for download if flag is set.
     * @param array $incomingProcInstructions Array of processing instructions
     * @return string List of URLs (meant for display in backend module)
     */
    public function urlListFromUrlArray(
        array $vv,
        array $pageRow,
        $scheduledTime,
        $reqMinute,
        $submitCrawlUrls,
        $downloadCrawlUrls,
        array &$duplicateTrack,
        array &$downloadUrls,
        array $incomingProcInstructions
    ) {
        if (! is_array($vv['URLs'] ?? null)) {
            return 'ERROR - no URL generated';
        }

        $urlLog = [];
        $pageId = (int) $pageRow['uid'];
        $configurationHash = $this->getConfigurationHash($vv);
        $skipInnerCheck = $this->queueRepository->noUnprocessedQueueEntriesForPageWithConfigurationHashExist($pageId, $configurationHash);

        // Not every configuration source sets these keys; default to '' so that
        // building the uniqueness key below does not raise undefined-index notices.
        $procInstrFilter = $vv['subCfg']['procInstrFilter'] ?? '';
        $userGroups = $vv['subCfg']['userGroups'] ?? '';

        // The filter result does not depend on the individual URL, so evaluate it once:
        // when it rejects the configuration, no URL is listed at all.
        if (! $this->drawURLs_PIfilter($procInstrFilter, $incomingProcInstructions)) {
            return '';
        }

        // Guard against a request rate of 0 (or less), which would otherwise divide by zero.
        $secondsBetweenRequests = $reqMinute > 0 ? 60 / $reqMinute : 0;

        $urlService = new UrlService();

        foreach ($vv['URLs'] as $urlQuery) {
            $url = (string) $urlService->getUrlFromPageAndQueryParameters(
                $pageId,
                $urlQuery,
                $vv['subCfg']['baseUrl'] ?? null,
                $vv['subCfg']['force_ssl'] ?? 0
            );

            // Create key by which to determine unique-ness:
            $uKey = $url . '|' . $userGroups . '|' . $procInstrFilter;

            if (isset($duplicateTrack[$uKey])) {
                // The url key is already registered: just display it and do not resubmit it
                $urlLog[] = '<em><span class="text-muted">' . htmlspecialchars($url) . '</span></em>';
            } else {
                // Scheduled time, spread by the number of URLs tracked so far and rounded down to a full minute:
                $schTime = $scheduledTime + round(count($duplicateTrack) * $secondsBetweenRequests);
                $schTime = intval($schTime / 60) * 60;
                $formattedDate = BackendUtility::datetime($schTime);
                $this->urlList[] = '[' . $formattedDate . '] ' . $url;
                $urlList = '[' . $formattedDate . '] ' . htmlspecialchars($url);

                // Submit for crawling!
                if ($submitCrawlUrls) {
                    // NOTE(review): the queue entry is submitted with the unmodified $scheduledTime,
                    // while the displayed time is the interleaved $schTime — confirm this is intended.
                    $added = $this->addUrl(
                        $pageId,
                        $url,
                        $vv['subCfg'],
                        $scheduledTime,
                        $configurationHash,
                        $skipInnerCheck
                    );
                    if ($added === false) {
                        $urlList .= ' (URL already existed)';
                    }
                } elseif ($downloadCrawlUrls) {
                    $downloadUrls[$url] = $url;
                }
                $urlLog[] = $urlList;
            }
            $duplicateTrack[$uKey] = true;
        }

        return implode('<br>', $urlLog);
    }
534
535
    /**
     * Tells whether a processing instruction list passes the incoming filter.
     *
     * An empty set of incoming processing instructions accepts everything;
     * otherwise at least one of them has to be contained in $piString.
     *
     * @param string $piString PI to test (comma separated list)
     * @param array $incomingProcInstructions Processing instructions
     * @return bool
     */
    public function drawURLs_PIfilter($piString, array $incomingProcInstructions)
    {
        // With nothing to filter by, the result is true and the loop below does not run.
        $isAccepted = $incomingProcInstructions === [];

        foreach ($incomingProcInstructions as $instruction) {
            if (GeneralUtility::inList($piString, $instruction)) {
                $isAccepted = true;
                break;
            }
        }

        return $isAccepted;
    }
555
556 5
    /**
     * Returns the page TSconfig for the given page.
     *
     * When $this->MP is set, the TSconfig is read for the page id found after the
     * "-" in the MP value instead of for $id. Afterwards every hook registered in
     * $GLOBALS['TYPO3_CONF_VARS']['EXTCONF']['crawler']['getPageTSconfigForId'] is
     * called and may alter the configuration through the "pageTSConfig" reference.
     *
     * @param int $id Page uid
     * @return array
     */
    public function getPageTSconfigForId($id): array
    {
        $tsConfigPageId = $id;
        if ($this->MP) {
            // MP is documented as bool but used as a "<id>-<id>" string. The cast keeps
            // explode() from throwing a TypeError under strict types, and a missing
            // second part falls back to 0 instead of an undefined offset.
            $mountPointParts = explode('-', (string) $this->MP);
            $tsConfigPageId = (int) ($mountPointParts[1] ?? 0);
        }
        $pageTSconfig = BackendUtility::getPagesTSconfig($tsConfigPageId);

        // Call a hook to alter configuration
        $hooks = $GLOBALS['TYPO3_CONF_VARS']['EXTCONF']['crawler']['getPageTSconfigForId'] ?? null;
        if (is_array($hooks)) {
            $params = [
                'pageId' => $id,
                'pageTSConfig' => &$pageTSconfig,
            ];
            foreach ($hooks as $userFunc) {
                GeneralUtility::callUserFunction($userFunc, $params, $this);
            }
        }

        return $pageTSconfig;
    }
578
579
    /**
580
     * This methods returns an array of configurations.
581
     * Adds no urls!
582
     */
583 4
    public function getUrlsForPageId(int $pageId): array
584
    {
585
        // Get page TSconfig for page ID
586 4
        $pageTSconfig = $this->getPageTSconfigForId($pageId);
587
588 4
        $res = [];
589
590
        // Fetch Crawler Configuration from pageTSconfig
591 4
        $crawlerCfg = $pageTSconfig['tx_crawler.']['crawlerCfg.']['paramSets.'] ?? [];
592 4
        foreach ($crawlerCfg as $key => $values) {
593 3
            if (! is_array($values)) {
594 3
                continue;
595
            }
596 3
            $key = str_replace('.', '', $key);
597
            // Sub configuration for a single configuration string:
598 3
            $subCfg = (array) $crawlerCfg[$key . '.'];
599 3
            $subCfg['key'] = $key;
600
601 3
            if (strcmp($subCfg['procInstrFilter'] ?? '', '')) {
602 3
                $subCfg['procInstrFilter'] = implode(',', GeneralUtility::trimExplode(',', $subCfg['procInstrFilter']));
603
            }
604 3
            $pidOnlyList = implode(',', GeneralUtility::trimExplode(',', $subCfg['pidsOnly'], true));
605
606
            // process configuration if it is not page-specific or if the specific page is the current page:
607
            // TODO: Check if $pidOnlyList can be kept as Array instead of imploded
608 3
            if (! strcmp((string) $subCfg['pidsOnly'], '') || GeneralUtility::inList($pidOnlyList, strval($pageId))) {
609
610
                // Explode, process etc.:
611 3
                $res[$key] = [];
612 3
                $res[$key]['subCfg'] = $subCfg;
613 3
                $res[$key]['paramParsed'] = GeneralUtility::explodeUrl2Array($crawlerCfg[$key]);
614 3
                $res[$key]['paramExpanded'] = $this->expandParameters($res[$key]['paramParsed'], $pageId);
615 3
                $res[$key]['origin'] = 'pagets';
616
617
                // recognize MP value
618 3
                if (! $this->MP) {
619 3
                    $res[$key]['URLs'] = $this->compileUrls($res[$key]['paramExpanded'], ['?id=' . $pageId]);
620
                } else {
621
                    $res[$key]['URLs'] = $this->compileUrls($res[$key]['paramExpanded'], ['?id=' . $pageId . '&MP=' . $this->MP]);
0 ignored issues
show
Bug introduced by
Are you sure $this->MP of type true can be used in concatenation? ( Ignorable by Annotation )

If this is a false-positive, you can also ignore this issue in your code via the ignore-type  annotation

621
                    $res[$key]['URLs'] = $this->compileUrls($res[$key]['paramExpanded'], ['?id=' . $pageId . '&MP=' . /** @scrutinizer ignore-type */ $this->MP]);
Loading history...
622
                }
623
            }
624
        }
625
626
        // Get configuration from tx_crawler_configuration records up the rootline
627 4
        $crawlerConfigurations = $this->configurationRepository->getCrawlerConfigurationRecordsFromRootLine($pageId);
628 4
        foreach ($crawlerConfigurations as $configurationRecord) {
629
630
            // check access to the configuration record
631 1
            if (empty($configurationRecord['begroups']) || $this->getBackendUser()->isAdmin() || $this->hasGroupAccess($this->getBackendUser()->user['usergroup_cached_list'], $configurationRecord['begroups'])) {
632 1
                $pidOnlyList = implode(',', GeneralUtility::trimExplode(',', $configurationRecord['pidsonly'], true));
633
634
                // process configuration if it is not page-specific or if the specific page is the current page:
635
                // TODO: Check if $pidOnlyList can be kept as Array instead of imploded
636 1
                if (! strcmp($configurationRecord['pidsonly'], '') || GeneralUtility::inList($pidOnlyList, strval($pageId))) {
637 1
                    $key = $configurationRecord['name'];
638
639
                    // don't overwrite previously defined paramSets
640 1
                    if (! isset($res[$key])) {
641
642
                        /* @var $TSparserObject \TYPO3\CMS\Core\TypoScript\Parser\TypoScriptParser */
643 1
                        $TSparserObject = GeneralUtility::makeInstance(TypoScriptParser::class);
644 1
                        $TSparserObject->parse($configurationRecord['processing_instruction_parameters_ts']);
645
646
                        $subCfg = [
647 1
                            'procInstrFilter' => $configurationRecord['processing_instruction_filter'],
648 1
                            'procInstrParams.' => $TSparserObject->setup,
649 1
                            'baseUrl' => $configurationRecord['base_url'],
650 1
                            'force_ssl' => (int) $configurationRecord['force_ssl'],
651 1
                            'userGroups' => $configurationRecord['fegroups'],
652 1
                            'exclude' => $configurationRecord['exclude'],
653 1
                            'key' => $key,
654
                        ];
655
656 1
                        if (! in_array($pageId, $this->expandExcludeString($subCfg['exclude']), true)) {
657 1
                            $res[$key] = [];
658 1
                            $res[$key]['subCfg'] = $subCfg;
659 1
                            $res[$key]['paramParsed'] = GeneralUtility::explodeUrl2Array($configurationRecord['configuration']);
660 1
                            $res[$key]['paramExpanded'] = $this->expandParameters($res[$key]['paramParsed'], $pageId);
661 1
                            $res[$key]['URLs'] = $this->compileUrls($res[$key]['paramExpanded'], ['?id=' . $pageId]);
662 1
                            $res[$key]['origin'] = 'tx_crawler_configuration_' . $configurationRecord['uid'];
663
                        }
664
                    }
665
                }
666
            }
667
        }
668
669 4
        foreach ($GLOBALS['TYPO3_CONF_VARS']['EXTCONF']['crawler']['processUrls'] ?? [] as $func) {
670
            $params = [
671
                'res' => &$res,
672
            ];
673
            GeneralUtility::callUserFunction($func, $params, $this);
674
        }
675 4
        return $res;
676
    }
677
678
    /**
     * Collect the names of all crawler configurations that are relevant for a page branch.
     *
     * Names are gathered from two sources:
     * - the "tx_crawler.crawlerCfg.paramSets." Page TSconfig of the root page
     * - tx_crawler_configuration records stored on any page of the root page's
     *   rootline or of the page tree below it (limited to $depth levels)
     *
     * TODO: Write Functional Tests
     *
     * @param int $rootid Uid of the page the branch starts at
     * @param int $depth Number of tree levels below $rootid to include
     * @return array List of configuration names (not de-duplicated)
     */
    public function getConfigurationsForBranch(int $rootid, int $depth): array
    {
        $names = [];

        // Parameter sets from Page TSconfig: only entries holding a sub-array are sets.
        $tsConfig = $this->getPageTSconfigForId($rootid);
        $paramSets = $tsConfig['tx_crawler.']['crawlerCfg.']['paramSets.'] ?? [];
        foreach ($paramSets as $setKey => $setConfiguration) {
            if (is_array($setConfiguration)) {
                // Strip the trailing dot that marks an array key in TypoScript.
                $names[] = substr($setKey, -1) === '.' ? substr($setKey, 0, -1) : $setKey;
            }
        }

        // Page uids to inspect: first the rootline of the start page ...
        $pageUids = [];
        foreach (BackendUtility::BEgetRootLine($rootid) as $rootLinePage) {
            $pageUids[] = $rootLinePage['uid'];
        }

        // ... then every page of the subtree the backend user is allowed to see.
        /* @var PageTreeView $pageTree */
        $pageTree = GeneralUtility::makeInstance(PageTreeView::class);
        $permissionClause = $this->getBackendUser()->getPagePermsClause(Permission::PAGE_SHOW);
        $pageTree->init(empty($permissionClause) ? '' : ('AND ' . $permissionClause));
        $pageTree->getTree($rootid, $depth, '');
        foreach ($pageTree->tree as $treeEntry) {
            $pageUids[] = $treeEntry['row']['uid'];
        }

        // Configuration records located on any of the collected pages.
        $queryBuilder = $this->getQueryBuilder('tx_crawler_configuration');
        $pidConstraint = $queryBuilder->expr()->in(
            'pid',
            $queryBuilder->createNamedParameter($pageUids, Connection::PARAM_INT_ARRAY)
        );
        $records = $queryBuilder
            ->select('name')
            ->from('tx_crawler_configuration')
            ->where($pidConstraint)
            ->execute()
            ->fetchAll();

        foreach ($records as $record) {
            $names[] = $record['name'];
        }

        return $names;
    }
721
722
    /**
     * Check if a user has access to an item
     * (e.g. get the group list of the current logged in user from $GLOBALS['TSFE']->gr_list)
     *
     * @param string $groupList Comma-separated list of (fe_)group UIDs from a user
     * @param string $accessList Comma-separated list of (fe_)group UIDs of the item to access
     * @return bool TRUE if at least one of the users group UIDs is in the access list or the access list is empty
     * @see \TYPO3\CMS\Frontend\Page\PageRepository::getMultipleGroupsWhereClause()
     */
    public function hasGroupAccess($groupList, $accessList)
    {
        // An empty access list means the item is not restricted at all.
        if (empty($accessList)) {
            return true;
        }

        foreach (GeneralUtility::intExplode(',', $groupList) as $groupUid) {
            // intExplode() yields integers while inList() is a string operation. This file
            // declares strict_types=1, so cast explicitly instead of relying on the callee
            // accepting an int (the sibling inList() call in this class uses strval() too).
            if (GeneralUtility::inList((string) $accessList, (string) $groupUid)) {
                return true;
            }
        }

        return false;
    }
743
744
    /**
     * Will expand the parameters configuration to individual values. This follows a certain syntax of the value of each parameter.
     * Syntax of values:
     * - Basically: If the value is wrapped in [...] it will be expanded according to the following syntax, otherwise the value is taken literally
     * - Configuration is split by "|" and the parts are processed individually and finally added together
     * - For each configuration part:
     *         - "[int]-[int]" = Integer range, will be expanded to all values in between, values included, starting from low to high (max. 1000). Example "1-34" or "-40--30"
     *         - "_TABLE:[TCA table name];[_PID:[optional page id, default is current page]];[_ENABLELANG:1]" = Look up of table records from PID, filtering out deleted records. Example "_TABLE:tt_content; _PID:123"
     *           Further optional sub-parameters of a _TABLE part:
     *             _RECURSIVE:[depth]  also look up records on sub pages of the PID, down to the given depth
     *             _PIDFIELD:[field]   field that is compared against the page id(s), default "pid"
     *             _FIELD:[field]      field whose values are returned, default "uid"; must be "uid" or a TCA column of the table
     *             _WHERE:[sql]        additional condition that is added to the query as given
     *             _ADDTABLE:[sql]     additional "from" part that is added to the query as given
     *             _ENABLELANG:1       picks only original records without their language overlays
     *         - Default: Literal value
     *
     * After each part has been processed, the functions registered in
     * $GLOBALS['TYPO3_CONF_VARS']['SC_OPTIONS']['crawler/class.tx_crawler_lib.php']['expandParameters']
     * are called and may modify the parameter array by reference.
     *
     * @param array $paramArray Array with key (GET var name) and values (value of GET var which is configuration for expansion)
     * @param int $pid Current page ID
     * @return array Same keys as the input, each holding an array of expanded values
     *
     * TODO: Write Functional Tests
     */
    public function expandParameters($paramArray, $pid)
    {
        // Traverse parameter names:
        foreach ($paramArray as $p => $v) {
            $v = trim($v);

            // Only values encapsulated in square brackets are expanded, everything else is literal.
            if (strpos($v, '[') !== 0 || substr($v, -1) !== ']') {
                // Set the literal value as only value in array:
                $paramArray[$p] = [$v];
                continue;
            }

            // Find the value inside brackets and reset the paramArray value as an array.
            $v = substr($v, 1, -1);
            $paramArray[$p] = [];

            // Explode parts and traverse them:
            foreach (explode('|', $v) as $pV) {
                if (preg_match('/^(-?[0-9]+)\s*-\s*(-?[0-9]+)$/', trim($pV), $reg)) {
                    // Integer range: (fx. 1-34 or -40--30 // reads minus 40 to minus 30)
                    $reg = $this->swapIfFirstIsLargerThanSecond($reg);

                    // Traverse range, add values. Limit to size of range!
                    $runAwayBrake = 1000;
                    for ($a = $reg[1]; $a <= $reg[2]; $a++) {
                        $paramArray[$p][] = $a;
                        $runAwayBrake--;
                        if ($runAwayBrake <= 0) {
                            break;
                        }
                    }
                } elseif (strpos(trim($pV), '_TABLE:') === 0) {
                    // Record look up; NULL means the table or field is unknown and nothing is added.
                    $lookedUpValues = $this->expandTableLookup($pV, $pid);
                    if ($lookedUpValues !== null) {
                        $paramArray[$p] = array_merge($paramArray[$p], $lookedUpValues);
                    }
                } else {
                    // Just add value:
                    $paramArray[$p][] = $pV;
                }

                // Hook for processing own expandParameters place holder.
                // "?? null" avoids an undefined index notice when no hook is registered.
                $expandHooks = $GLOBALS['TYPO3_CONF_VARS']['SC_OPTIONS']['crawler/class.tx_crawler_lib.php']['expandParameters'] ?? null;
                if (is_array($expandHooks)) {
                    $_params = [
                        'pObj' => &$this,
                        'paramArray' => &$paramArray,
                        'currentKey' => $p,
                        'currentValue' => $pV,
                        'pid' => $pid,
                    ];
                    foreach ($expandHooks as $_funcRef) {
                        GeneralUtility::callUserFunction($_funcRef, $_params, $this);
                    }
                }
            }

            // Make unique set of values and sort array by key:
            $paramArray[$p] = array_unique($paramArray[$p]);
            ksort($paramArray);
        }

        return $paramArray;
    }

    /**
     * Resolve one "_TABLE:..." configuration part of expandParameters() to the list of field values found in the database.
     *
     * @param string $configurationPart The raw part, e.g. "_TABLE:tt_content; _PID:123"
     * @param int $pid Current page ID, used when the part contains no _PID
     * @return array|null Distinct field values, or NULL if the table is not in TCA or the requested field is not allowed
     */
    private function expandTableLookup(string $configurationPart, $pid): ?array
    {
        // Parse the ";"-separated "key:value" sub-parameters.
        $subpartParams = [];
        foreach (GeneralUtility::trimExplode(';', $configurationPart) as $spV) {
            // Split at the first colon only so that values containing ":" (e.g. in _WHERE) stay intact;
            // pad with NULL so that a sub-parameter without a colon counts as "not set".
            [$pKey, $pVal] = array_pad(GeneralUtility::trimExplode(':', $spV, false, 2), 2, null);
            $subpartParams[$pKey] = $pVal;
        }

        // Table must exist:
        $table = $subpartParams['_TABLE'] ?? '';
        if (! isset($GLOBALS['TCA'][$table])) {
            return null;
        }

        $lookUpPid = isset($subpartParams['_PID']) ? intval($subpartParams['_PID']) : intval($pid);
        $recursiveDepth = isset($subpartParams['_RECURSIVE']) ? intval($subpartParams['_RECURSIVE']) : 0;
        $pidField = isset($subpartParams['_PIDFIELD']) ? trim($subpartParams['_PIDFIELD']) : 'pid';
        $where = $subpartParams['_WHERE'] ?? '';
        $addTable = $subpartParams['_ADDTABLE'] ?? '';

        // Only "uid" or a field configured in TCA may be selected.
        $fieldName = ! empty($subpartParams['_FIELD']) ? $subpartParams['_FIELD'] : 'uid';
        if ($fieldName !== 'uid' && empty($GLOBALS['TCA'][$table]['columns'][$fieldName])) {
            return null;
        }

        if ($recursiveDepth > 0) {
            /** @var \TYPO3\CMS\Core\Database\QueryGenerator $queryGenerator */
            $queryGenerator = GeneralUtility::makeInstance(\TYPO3\CMS\Core\Database\QueryGenerator::class);
            $pidList = $queryGenerator->getTreeList($lookUpPid, $recursiveDepth, 0, 1);
            $pidArray = GeneralUtility::intExplode(',', $pidList);
        } else {
            $pidArray = [(string) $lookUpPid];
        }

        $queryBuilder = $this->getQueryBuilder($table);

        // Ignore all default restrictions except for deleted records.
        $queryBuilder->getRestrictions()
            ->removeAll()
            ->add(GeneralUtility::makeInstance(DeletedRestriction::class));

        $queryBuilder
            ->select($fieldName)
            ->from($table)
            ->where(
                $queryBuilder->expr()->in($pidField, $queryBuilder->createNamedParameter($pidArray, Connection::PARAM_INT_ARRAY)),
                $where
            );

        if (! empty($addTable)) {
            // TODO: Check if this works as intended!
            $queryBuilder->add('from', $addTable);
        }

        // _ENABLELANG restricts the result to records that do not point to a translation origin.
        $transOrigPointerField = $GLOBALS['TCA'][$table]['ctrl']['transOrigPointerField'] ?? '';
        if (! empty($subpartParams['_ENABLELANG']) && $transOrigPointerField) {
            $queryBuilder->andWhere(
                $queryBuilder->expr()->lte(
                    $transOrigPointerField,
                    0
                )
            );
        }

        $statement = $queryBuilder->execute();

        // Using the value as array key removes duplicates.
        $values = [];
        while ($row = $statement->fetch()) {
            $values[$row[$fieldName]] = true;
        }

        return array_keys($values);
    }
891
892
    /**
     * Compiling URLs from parameter array (output of expandParameters())
     * The number of URLs will be the multiplication of the number of parameter values for each key,
     * capped by $this->maximumUrlsToCompile per recursion level.
     *
     * @param array $paramArray Output of expandParameters(): Array with keys (GET var names) and for each an array of values
     * @param array $urls URLs accumulated in this array (for recursion)
     * @return array
     */
    public function compileUrls($paramArray, array $urls)
    {
        // Recursion ends when every parameter has been applied.
        if (empty($paramArray)) {
            return $urls;
        }

        // shift first off stack:
        reset($paramArray);
        // PHP turns numeric-string array keys into integers; rawurlencode() needs a string (strict_types).
        $varName = (string) key($paramArray);
        $valueSet = array_shift($paramArray);

        // Combine every URL compiled so far with every value of the current parameter:
        $newUrls = [];
        foreach ($urls as $url) {
            foreach ($valueSet as $val) {
                // Empty values do not add the parameter to the URL at all.
                $newUrls[] = $url . (strcmp((string) $val, '') ? '&' . rawurlencode($varName) . '=' . rawurlencode((string) $val) : '');

                if (count($newUrls) > $this->maximumUrlsToCompile) {
                    // Leave BOTH loops: a plain "break" would only end the inner loop and the
                    // outer loop would keep adding URLs beyond the limit.
                    break 2;
                }
            }
        }

        return $this->compileUrls($paramArray, $newUrls);
    }
923
924
    /************************************
925
     *
926
     * Crawler log
927
     *
928
     ************************************/
929
930
    /**
     * Return array of records from crawler queue for input page ID
     *
     * @param int $id Page ID for which to look up log entries.
     * @param QueueFilter $queueFilter "pending" selects entries with exec_time = 0, "finished" entries with exec_time > 0; any other value applies no exec_time restriction
     * @param bool $doFlush If TRUE, the queue repository is asked to flush the queue for $queueFilter before the entries are read
     * @param bool $doFullFlush Currently without effect: both values lead to the same flushQueue($queueFilter) call
     * @param int $itemsPerPage Limit the amount of entries per page default is 10; values of 0 or less disable the limit
     * @return array
     */
    public function getLogEntriesForPageId($id, QueueFilter $queueFilter, $doFlush = false, $doFullFlush = false, $itemsPerPage = 10)
    {
        $queryBuilder = GeneralUtility::makeInstance(ConnectionPool::class)->getQueryBuilderForTable($this->tableName);
        $queryBuilder
            ->select('*')
            ->from($this->tableName)
            ->where(
                $queryBuilder->expr()->eq('page_id', $queryBuilder->createNamedParameter($id, \PDO::PARAM_INT))
            )
            ->orderBy('scheduled', 'DESC');

        // The filter object is compared loosely against the filter names.
        switch ($queueFilter) {
            case 'pending':
                $queryBuilder->andWhere($queryBuilder->expr()->eq('exec_time', 0));
                break;
            case 'finished':
                $queryBuilder->andWhere($queryBuilder->expr()->gt('exec_time', 0));
                break;
        }

        if ($doFlush) {
            // NOTE(review): the flush is not restricted to page $id here - confirm this is intended.
            $this->queueRepository->flushQueue($queueFilter);
        }

        if ($itemsPerPage > 0) {
            $queryBuilder
                ->setMaxResults((int) $itemsPerPage);
        }

        return $queryBuilder->execute()->fetchAll();
    }
980
981
    /**
     * Return array of records from crawler queue for input set ID
     *
     * @param int $set_id Set ID for which to look up log entries.
     * @param string $filter Filter: "all" => all entries, "pending" => all that is not yet run, "finished" => all complete ones
     * @param bool $doFlush If TRUE, the matching entries are flushed instead of selected and an empty array is returned
     * @param bool $doFullFlush If TRUE (together with $doFlush), the queue is flushed without any condition
     * @param int $itemsPerPage Limit the amount of entries per page default is 10
     * @return array
     *
     * @deprecated
     */
    public function getLogEntriesForSetId(int $set_id, string $filter = '', bool $doFlush = false, bool $doFullFlush = false, int $itemsPerPage = 10)
    {
        $queryBuilder = GeneralUtility::makeInstance(ConnectionPool::class)->getQueryBuilderForTable($this->tableName);
        $queryBuilder
            ->select('*')
            ->from($this->tableName)
            ->where(
                $queryBuilder->expr()->eq('set_id', $queryBuilder->createNamedParameter($set_id, \PDO::PARAM_INT))
            )
            ->orderBy('scheduled', 'DESC');

        $expressionBuilder = GeneralUtility::makeInstance(ConnectionPool::class)
            ->getConnectionForTable($this->tableName)
            ->getExpressionBuilder();

        // $query collects the conditions for an optional flush, combined with AND.
        // It mirrors the exec_time restriction that is applied to the select query.
        $query = $expressionBuilder->andX();
        switch ($filter) {
            case 'pending':
                $queryBuilder->andWhere($queryBuilder->expr()->eq('exec_time', 0));
                $query->add($expressionBuilder->eq('exec_time', 0));
                break;
            case 'finished':
                $queryBuilder->andWhere($queryBuilder->expr()->gt('exec_time', 0));
                $query->add($expressionBuilder->gt('exec_time', 0));
                break;
        }

        if ($doFlush) {
            // Restrict the flush to this set unless a full flush was requested.
            $addWhere = $query->add($expressionBuilder->eq('set_id', $set_id));
            $this->flushQueue($doFullFlush ? '' : $addWhere);
            return [];
        }

        if ($itemsPerPage > 0) {
            $queryBuilder
                ->setMaxResults((int) $itemsPerPage);
        }

        return $queryBuilder->execute()->fetchAll();
    }
1033
1034
    /**
     * Add a call back entry to the crawler queue (typically called from hooks, see indexed search class "class.crawler.php")
     *
     * The call back reference is stored in the entry's parameters under the key "_CALLBACKOBJ".
     *
     * @param int $setId Set ID
     * @param array $params Parameters to pass to call back function; anything that is not an array is replaced by an empty array
     * @param string $callBack Call back object reference, eg. 'EXT:indexed_search/class.crawler.php:&tx_indexedsearch_crawler'
     * @param int $page_id Page ID to attach it to
     * @param int $schedule Time at which to activate; 0 means the current time
     */
    public function addQueueEntry_callBack($setId, $params, $callBack, $page_id = 0, $schedule = 0): void
    {
        $parameters = is_array($params) ? $params : [];
        $parameters['_CALLBACKOBJ'] = $callBack;

        // Fall back to "now" when no schedule time was given.
        $scheduledTime = (int) $schedule;
        if (! $scheduledTime) {
            $scheduledTime = $this->getCurrentTime();
        }

        $queueRecord = [
            'page_id' => (int) $page_id,
            'parameters' => json_encode($parameters),
            'scheduled' => $scheduledTime,
            'exec_time' => 0,
            'set_id' => (int) $setId,
            'result_data' => '',
        ];

        $connection = GeneralUtility::makeInstance(ConnectionPool::class)->getConnectionForTable('tx_crawler_queue');
        $connection->insert('tx_crawler_queue', $queueRecord);
    }
1063
1064
    /************************************
1065
     *
1066
     * URL setting
1067
     *
1068
     ************************************/
1069
1070
    /**
     * Setting a URL for crawling:
     *
     * Builds the queue record for the URL. The record is either only registered in
     * $this->queueEntries (if $this->registerQueueEntriesInternallyOnly is set) or inserted
     * into tx_crawler_queue, unless the duplication check finds an equal entry.
     * A signal is emitted for both the "added" and the "duplicate" case.
     *
     * @param int $id Page ID
     * @param string $url Complete URL
     * @param array $subCfg Sub configuration array (from TS config)
     * @param int $tstamp Scheduled-time
     * @param string $configurationHash (optional) configuration hash
     * @param bool $skipInnerDuplicationCheck (optional) skip inner duplication check
     * @return bool TRUE only if a new record was inserted into the database queue
     */
    public function addUrl(
        $id,
        $url,
        array $subCfg,
        $tstamp,
        $configurationHash = '',
        $skipInnerDuplicationCheck = false
    ) {
        $urlAdded = false;
        $rows = [];

        // Creating parameters:
        $parameters = [
            'url' => $url,
        ];

        // fe user group simulation:
        // (the sub configuration keys are optional, so every read is guarded against a missing index)
        $uGs = implode(',', array_unique(GeneralUtility::intExplode(',', $subCfg['userGroups'] ?? '', true)));
        if ($uGs) {
            $parameters['feUserGroupList'] = $uGs;
        }

        // Setting processing instructions
        $parameters['procInstructions'] = GeneralUtility::trimExplode(',', $subCfg['procInstrFilter'] ?? '');
        if (is_array($subCfg['procInstrParams.'] ?? null)) {
            $parameters['procInstrParams'] = $subCfg['procInstrParams.'];
        }

        // Compile value array:
        $parameters_serialized = json_encode($parameters);
        $fieldArray = [
            'page_id' => (int) $id,
            'parameters' => $parameters_serialized,
            'parameters_hash' => GeneralUtility::shortMD5($parameters_serialized),
            'configuration_hash' => $configurationHash,
            'scheduled' => $tstamp,
            'exec_time' => 0,
            'set_id' => (int) $this->setID,
            'result_data' => '',
            'configuration' => $subCfg['key'] ?? '',
        ];

        if ($this->registerQueueEntriesInternallyOnly) {
            //the entries will only be registered and not stored to the database
            $this->queueEntries[] = $fieldArray;
            return $urlAdded;
        }

        if (! $skipInnerDuplicationCheck) {
            // check if there is already an equal entry
            $rows = $this->queueRepository->getDuplicateQueueItemsIfExists(
                (bool) $this->extensionSettings['enableTimeslot'],
                $tstamp,
                $this->getCurrentTime(),
                $fieldArray['page_id'],
                $fieldArray['parameters_hash']
            );
        }

        if (empty($rows)) {
            $connectionForCrawlerQueue = GeneralUtility::makeInstance(ConnectionPool::class)->getConnectionForTable('tx_crawler_queue');
            $connectionForCrawlerQueue->insert(
                'tx_crawler_queue',
                $fieldArray
            );
            $uid = $connectionForCrawlerQueue->lastInsertId('tx_crawler_queue', 'qid');
            $urlAdded = true;

            $signalPayload = ['uid' => $uid, 'fieldArray' => $fieldArray];
            SignalSlotUtility::emitSignal(
                self::class,
                SignalSlotUtility::SIGNAL_URL_ADDED_TO_QUEUE,
                $signalPayload
            );
        } else {
            $signalPayload = ['rows' => $rows, 'fieldArray' => $fieldArray];
            SignalSlotUtility::emitSignal(
                self::class,
                SignalSlotUtility::SIGNAL_DUPLICATE_URL_IN_QUEUE,
                $signalPayload
            );
        }

        return $urlAdded;
    }
1166
1167
    /**
     * Returns the current system time as a Unix timestamp.
     *
     * All time lookups of this class go through this method.
     *
     * @return int
     */
    public function getCurrentTime()
    {
        $currentTimestamp = time();

        return $currentTimestamp;
    }
1176
1177
    /************************************
1178
     *
1179
     * URL reading
1180
     *
1181
     ************************************/
1182
1183
    /**
     * Read URL for single queue entry
     *
     * Marks the entry as executed (sets exec_time), runs it through the queue executor and
     * stores the JSON encoded result in the entry's result_data field.
     *
     * @param int $queueId
     * @param bool $force If set, will process even if exec_time has been set!
     * @return int|null Bit mask (self::CLI_STATUS_POLLABLE_PROCESSED is set if a registered pollable
     *                  instruction reported success); NULL if no matching queue entry was found
     */
    public function readUrl($queueId, $force = false)
    {
        $queryBuilder = GeneralUtility::makeInstance(ConnectionPool::class)->getQueryBuilderForTable($this->tableName);
        $ret = 0;
        $this->logger->debug('crawler-readurl start ' . microtime(true));

        // Get entry:
        $queryBuilder
            ->select('*')
            ->from('tx_crawler_queue')
            ->where(
                $queryBuilder->expr()->eq('qid', $queryBuilder->createNamedParameter($queueId, \PDO::PARAM_INT))
            );
        if (! $force) {
            // Without $force only entries that were not executed yet and are assigned to a process are read.
            $queryBuilder
                ->andWhere('exec_time = 0')
                ->andWhere('process_scheduled > 0');
        }
        $queueRec = $queryBuilder->execute()->fetch();

        if (! is_array($queueRec)) {
            return;
        }

        SignalSlotUtility::emitSignal(
            self::class,
            SignalSlotUtility::SIGNAL_QUEUEITEM_PREPROCESS,
            [$queueId, &$queueRec]
        );

        // Set exec_time to lock record:
        $field_array = ['exec_time' => $this->getCurrentTime()];

        if (isset($this->processID)) {
            //if multiprocessing is used we need to store the id of the process which has handled this entry
            $field_array['process_id_completed'] = $this->processID;
        }

        GeneralUtility::makeInstance(ConnectionPool::class)->getConnectionForTable('tx_crawler_queue')
            ->update(
                'tx_crawler_queue',
                $field_array,
                ['qid' => (int) $queueId]
            );

        $result = $this->queueExecutor->executeQueueItem($queueRec, $this);

        // Decode the result content. Without content there is nothing to poll, so $resultData
        // stays NULL instead of becoming a string that would be indexed like an array below.
        $resultData = null;
        if (($result['content'] ?? null) !== null) {
            /** @var JsonCompatibilityConverter $jsonCompatibilityConverter */
            $jsonCompatibilityConverter = GeneralUtility::makeInstance(JsonCompatibilityConverter::class);
            $resultData = $jsonCompatibilityConverter->convert($result['content']);
        }

        //atm there's no need to point to specific pollable extensions
        $pollables = $GLOBALS['TYPO3_CONF_VARS']['EXTCONF']['crawler']['pollSuccess'] ?? null;
        $procInstructions = is_array($resultData) ? ($resultData['parameters']['procInstructions'] ?? null) : null;
        if (is_array($pollables) && is_array($procInstructions)) {
            foreach ($pollables as $pollable) {
                // only check the success value if the instruction is running
                // it is important to name the pollSuccess key same as the procInstructions key
                if (in_array($pollable, $procInstructions, true) && ! empty($resultData['success'][$pollable])) {
                    $ret |= self::CLI_STATUS_POLLABLE_PROCESSED;
                }
            }
        }

        // Set result in log which also denotes the end of the processing of this entry.
        $field_array = ['result_data' => json_encode($result)];

        SignalSlotUtility::emitSignal(
            self::class,
            SignalSlotUtility::SIGNAL_QUEUEITEM_POSTPROCESS,
            [$queueId, &$field_array]
        );

        GeneralUtility::makeInstance(ConnectionPool::class)->getConnectionForTable('tx_crawler_queue')
            ->update(
                'tx_crawler_queue',
                $field_array,
                ['qid' => (int) $queueId]
            );

        $this->logger->debug('crawler-readurl stop ' . microtime(true));
        return $ret;
    }
1280
1281
    /**
     * Read URL for not-yet-inserted log-entry
     *
     * The entry is inserted into the queue table with exec_time set, executed right away,
     * and afterwards updated with the JSON encoded result.
     *
     * @param array $field_array Queue field array,
     *
     * @return array|bool|mixed|string Result as returned by the queue executor
     */
    public function readUrlFromArray($field_array)
    {
        $connection = GeneralUtility::makeInstance(ConnectionPool::class)->getConnectionForTable($this->tableName);

        // Set exec_time to lock record, then store it to obtain its queue id:
        $field_array['exec_time'] = $this->getCurrentTime();
        $connection->insert($this->tableName, $field_array);
        $queueId = $connection->lastInsertId($this->tableName, 'qid');

        $result = $this->queueExecutor->executeQueueItem($field_array, $this);

        // Set result in log which also denotes the end of the processing of this entry.
        // Slots of the post-process signal receive the fields by reference and may change them.
        $field_array = ['result_data' => json_encode($result)];
        SignalSlotUtility::emitSignal(
            self::class,
            SignalSlotUtility::SIGNAL_QUEUEITEM_POSTPROCESS,
            [$queueId, &$field_array]
        );

        $connection->update($this->tableName, $field_array, ['qid' => $queueId]);

        return $result;
    }
1317
1318
    /*****************************
1319
     *
1320
     * Compiling URLs to crawl - tools
1321
     *
1322
     *****************************/
1323
1324
    /**
     * Walks the page tree below a root page and collects the rendered URL rows
     * of every page in it. Mount point pages additionally contribute the rows of
     * the pages they mount.
     *
     * The crawl settings are stored on the controller first because
     * drawURLs_addRowsForPage() reads them from there.
     *
     * @param int $id Root page id to start from.
     * @param int $depth Depth of tree, 0=only id-page, 1= on sublevel, 99 = infinite
     * @param int $scheduledTime Unix Time when the URL is timed to be visited when put in queue
     * @param int $reqMinute Number of requests per minute (creates the interleave between requests)
     * @param bool $submitCrawlUrls If set, submits the URLs to queue in database (real crawling)
     * @param bool $downloadCrawlUrls If set (and submitcrawlUrls is false) will fill $downloadUrls with entries)
     * @param array $incomingProcInstructions Array of processing instructions
     * @param array $configurationSelection Array of configuration keys
     * @return string Concatenated output of drawURLs_addRowsForPage() for all visited pages
     */
    public function getPageTreeAndUrls(
        $id,
        $depth,
        $scheduledTime,
        $reqMinute,
        $submitCrawlUrls,
        $downloadCrawlUrls,
        array $incomingProcInstructions,
        array $configurationSelection
    ) {
        $this->scheduledTime = $scheduledTime;
        $this->reqMinute = $reqMinute;
        $this->submitCrawlUrls = $submitCrawlUrls;
        $this->downloadCrawlUrls = $downloadCrawlUrls;
        $this->incomingProcInstructions = $incomingProcInstructions;
        $this->incomingConfigurationSelection = $configurationSelection;

        $this->duplicateTrack = [];
        $this->downloadUrls = [];

        // Drawing tree:
        /** @var PageTreeView $tree */
        $tree = GeneralUtility::makeInstance(PageTreeView::class);
        $perms_clause = $this->getBackendUser()->getPagePermsClause(Permission::PAGE_SHOW);
        $tree->init('AND ' . $perms_clause);

        $pageInfo = BackendUtility::readPageAccess($id, $perms_clause);
        if (is_array($pageInfo)) {
            // Set root row:
            $tree->tree[] = [
                'row' => $pageInfo,
                'HTML' => $this->iconFactory->getIconForRecord('pages', $pageInfo, Icon::SIZE_SMALL),
            ];
        }

        // Get branch beneath:
        if ($depth) {
            $tree->getTree($id, $depth, '');
        }

        // Traverse page tree:
        $code = '';

        foreach ($tree->tree as $data) {
            $this->MP = false;

            // recognize mount points
            // The doktype is cast because the comparison is strict and, depending on the
            // database driver, the row may carry the value as a numeric string.
            if ((int) $data['row']['doktype'] === PageRepository::DOKTYPE_MOUNTPOINT) {
                // NOTE(review): the result of getPage() is indexed with [0] below - confirm
                // that it returns a list of rows here and not a single page row.
                $mountpage = $this->pageRepository->getPage($data['row']['uid']);

                // fetch mounted pages
                $this->MP = $mountpage[0]['mount_pid'] . '-' . $data['row']['uid'];

                $mountTree = GeneralUtility::makeInstance(PageTreeView::class);
                $mountTree->init('AND ' . $perms_clause);
                $mountTree->getTree($mountpage[0]['mount_pid'], $depth);

                foreach ($mountTree->tree as $mountData) {
                    $code .= $this->drawURLs_addRowsForPage(
                        $mountData['row'],
                        $mountData['HTML'] . BackendUtility::getRecordTitle('pages', $mountData['row'], true)
                    );
                }

                // replace page when mount_pid_ol is enabled
                if ($mountpage[0]['mount_pid_ol']) {
                    $data['row']['uid'] = $mountpage[0]['mount_pid'];
                } else {
                    // if the mount_pid_ol is not set the MP must not be used for the mountpoint page
                    $this->MP = false;
                }
            }

            $code .= $this->drawURLs_addRowsForPage(
                $data['row'],
                $data['HTML'] . BackendUtility::getRecordTitle('pages', $data['row'], true)
            );
        }

        return $code;
    }
1416
1417
    /**
     * Expands an exclude string into the list of page ids it covers.
     *
     * The string is a comma separated list of parts of the form "<pid>" or
     * "<pid>+<depth>". A part without "+" covers only the page itself; a part
     * with "+" but without a (non-zero) depth covers the branch with depth 99.
     * Results are kept in static caches for the rest of the request.
     *
     * @param string $excludeString Exclude string
     * @return int[] Unique page ids (keys are not re-indexed)
     */
    public function expandExcludeString($excludeString)
    {
        // internal static caches;
        static $expandedExcludeStringCache;
        static $treeCache;

        if (empty($expandedExcludeStringCache[$excludeString])) {
            $pidList = [];

            if (! empty($excludeString)) {
                /** @var PageTreeView $tree */
                $tree = GeneralUtility::makeInstance(PageTreeView::class);
                $tree->init('AND ' . $this->getBackendUser()->getPagePermsClause(Permission::PAGE_SHOW));

                $excludeParts = GeneralUtility::trimExplode(',', $excludeString);

                foreach ($excludeParts as $excludePart) {
                    // A part without "+" yields a single element, so the depth has to be
                    // read defensively instead of being destructured.
                    $partSegments = GeneralUtility::trimExplode('+', $excludePart);
                    $pid = (int) ($partSegments[0] ?? 0);
                    $depth = (int) ($partSegments[1] ?? 0);

                    // default is "page only" = "depth=0"; a "+" without depth means the whole branch
                    if ($depth === 0 && strpos($excludePart, '+') !== false) {
                        $depth = 99;
                    }

                    $pidList[] = $pid;

                    if ($depth > 0) {
                        if (empty($treeCache[$pid][$depth])) {
                            $tree->reset();
                            $tree->getTree($pid, $depth);
                            $treeCache[$pid][$depth] = $tree->tree;
                        }

                        foreach ($treeCache[$pid][$depth] as $data) {
                            $pidList[] = (int) $data['row']['uid'];
                        }
                    }
                }
            }

            $expandedExcludeStringCache[$excludeString] = array_unique($pidList);
        }

        return $expandedExcludeStringCache[$excludeString];
    }
1468
1469
    /**
     * Create the rows for display of the page tree.
     * For each page a number of rows are shown displaying GET variable configuration.
     *
     * @param array $pageRow Page record; its "uid" is matched against the exclude list of each configuration
     * @param string $pageTitle HTML for the title cell, inserted into the output without further escaping
     * @return string HTML table rows
     */
    public function drawURLs_addRowsForPage(array $pageRow, string $pageTitle): string
    {
        $skipMessage = '';

        // Get list of configurations
        // NOTE(review): $skipMessage is presumably filled by reference in getUrlsForPageRow() - confirm.
        $configurations = $this->getUrlsForPageRow($pageRow, $skipMessage);

        if (! empty($this->incomingConfigurationSelection)) {
            // remove configuration that does not match the current selection
            foreach ($configurations as $confKey => $confArray) {
                if (! in_array($confKey, $this->incomingConfigurationSelection, true)) {
                    unset($configurations[$confKey]);
                }
            }
        }

        if (empty($configurations)) {
            $message = ! empty($skipMessage) ? ' (' . $skipMessage . ')' : '';

            // Compile row:
            return '
                <tr>
                    <td>' . $pageTitle . '</td>
                    <td colspan="6"><em>No entries</em>' . $message . '</td>
                </tr>';
        }

        // expandExcludeString() returns integer page ids and the lookup below is strict,
        // so the uid of the row is normalised to an integer as well.
        $pageUid = (int) $pageRow['uid'];

        // Traverse parameter combinations:
        $content = '';
        $isFirstRow = true;
        foreach ($configurations as $confKey => $confArray) {
            // The sub configuration keys are optional; read them without raising notices.
            $subCfg = $confArray['subCfg'] ?? [];

            // Title column: only the first row carries the (row spanning) page title.
            $titleClm = $isFirstRow
                ? '<td rowspan="' . count($configurations) . '">' . $pageTitle . '</td>'
                : '';
            $isFirstRow = false;

            if (in_array($pageUid, $this->expandExcludeString($subCfg['exclude'] ?? ''), true)) {
                $content .= '<tr>
                            ' . $titleClm . '
                            <td>' . htmlspecialchars((string) $confKey) . '</td>
                            <td colspan="5"><em>No entries</em> (Page is excluded in this configuration)</td>
                        </tr>';
                continue;
            }

            // URL list:
            $urlList = $this->urlListFromUrlArray(
                $confArray,
                $pageRow,
                $this->scheduledTime,
                $this->reqMinute,
                $this->submitCrawlUrls,
                $this->downloadCrawlUrls,
                $this->duplicateTrack,
                $this->downloadUrls,
                // if empty the urls won't be filtered by processing instructions
                $this->incomingProcInstructions
            );

            // Expanded parameters:
            $paramExpanded = '';
            $calcAccu = [];
            $calcRes = 1;
            foreach ($confArray['paramExpanded'] as $gVar => $gVal) {
                $valueCount = count($gVal);
                $paramExpanded .= '
                            <tr>
                                <td>' . htmlspecialchars('&' . $gVar . '=') . '<br/>' .
                    '(' . $valueCount . ')' .
                    '</td>
                                <td nowrap="nowrap">' . nl2br(htmlspecialchars(implode(chr(10), $gVal))) . '</td>
                            </tr>
                        ';
                $calcRes *= $valueCount;
                $calcAccu[] = $valueCount;
            }
            $paramExpanded = '<table>' . $paramExpanded . '</table>';
            $paramExpanded .= 'Comb: ' . implode('*', $calcAccu) . '=' . $calcRes;

            // Options
            $optionValues = '';
            if (! empty($subCfg['userGroups'])) {
                $optionValues .= 'User Groups: ' . $subCfg['userGroups'] . '<br/>';
            }
            if (! empty($subCfg['procInstrFilter'])) {
                $optionValues .= 'ProcInstr: ' . $subCfg['procInstrFilter'] . '<br/>';
            }

            // Compile row:
            $content .= '
                        <tr>
                            ' . $titleClm . '
                            <td>' . htmlspecialchars((string) $confKey) . '</td>
                            <td>' . nl2br(htmlspecialchars(rawurldecode(trim(str_replace('&', chr(10) . '&', GeneralUtility::implodeArrayForUrl('', $confArray['paramParsed'])))))) . '</td>
                            <td>' . $paramExpanded . '</td>
                            <td nowrap="nowrap">' . $urlList . '</td>
                            <td nowrap="nowrap">' . $optionValues . '</td>
                            <td nowrap="nowrap">' . DebugUtility::viewArray($subCfg['procInstrParams.'] ?? null) . '</td>
                        </tr>';
        }

        return $content;
    }
1580
1581
    /*****************************
1582
     *
1583
     * CLI functions
1584
     *
1585
     *****************************/
1586
1587
    /**
     * Running the functionality of the CLI (crawling URLs from queue).
     *
     * @param int $countInARun Passed to the queue repository when fetching the records to crawl
     * @param int $sleepTime Passed to usleep() after every processed entry
     * @param int $sleepAfterFinish Passed to sleep() once the run is over
     * @return int Bit mask built from the CLI_STATUS_* constants and the results of readUrl()
     */
    public function CLI_run(int $countInARun, int $sleepTime, int $sleepAfterFinish): int
    {
        $result = 0;
        $counter = 0;

        // First, run hooks:
        $this->CLI_runHooks();

        // Clean up the queue
        $this->queueRepository->cleanupQueue();

        // Select entries:
        $rows = $this->queueRepository->fetchRecordsToBeCrawled($countInARun);

        if (empty($rows)) {
            $this->CLI_debug('Nothing within queue which needs to be processed (' . $this->CLI_buildProcessId() . ')');

            return $result;
        }

        $quidList = [];
        foreach ($rows as $row) {
            $quidList[] = $row['qid'];
        }

        $processId = $this->CLI_buildProcessId();

        //save the number of assigned queue entries to determine how many have been processed later
        $numberOfAffectedRows = $this->queueRepository->updateProcessIdAndSchedulerForQueueIds($quidList, $processId);
        $this->processRepository->updateProcessAssignItemsCount($numberOfAffectedRows, $processId);

        // Not every entry could be assigned to this process, so nothing is crawled at all.
        if ($numberOfAffectedRows !== count($quidList)) {
            $this->CLI_debug('Nothing processed due to multi-process collision (' . $this->CLI_buildProcessId() . ')');

            return $result | self::CLI_STATUS_ABORTED;
        }

        foreach ($rows as $row) {
            $result |= $this->readUrl($row['qid']);
            $counter++;

            // Just to relax the system
            usleep($sleepTime);

            // if during the start and the current read url the cli has been disable we need to return from the function
            // mark the process NOT as ended.
            if ($this->crawler->isDisabled()) {
                return $result | self::CLI_STATUS_ABORTED;
            }

            if (! $this->processRepository->isProcessActive($this->CLI_buildProcessId())) {
                $this->CLI_debug('conflict / timeout (' . $this->CLI_buildProcessId() . ')');
                $result |= self::CLI_STATUS_ABORTED;
                //possible timeout
                break;
            }
        }

        sleep($sleepAfterFinish);

        $this->CLI_debug('Rows: ' . $counter . ' (' . $this->CLI_buildProcessId() . ')');

        if ($counter > 0) {
            $result |= self::CLI_STATUS_PROCESSED;
        }

        return $result;
    }
1657
1658
    /**
     * Activate hooks: instantiates every class registered under
     * $GLOBALS['TYPO3_CONF_VARS']['EXTCONF']['crawler']['cli_hooks'] and calls
     * its crawler_init() method with this controller.
     */
    public function CLI_runHooks(): void
    {
        $registeredHooks = $GLOBALS['TYPO3_CONF_VARS']['EXTCONF']['crawler']['cli_hooks'] ?? [];

        foreach ($registeredHooks as $hookReference) {
            $hook = GeneralUtility::makeInstance($hookReference);
            if (! is_object($hook)) {
                continue;
            }

            $hook->crawler_init($this);
        }
    }
1670
1671
    /**
     * Try to acquire a new process with the given id
     * also performs some auto-cleanup for orphan processes
     *
     * A process record is inserted when fewer active, non-expired processes exist
     * than the "processLimit" extension setting allows. Active processes whose ttl
     * lies in the past are treated as orphans and released.
     *
     * @param string $id identification string for the process
     * @return boolean true if the process record was added, false otherwise
     * @todo preemption might not be the most elegant way to clean up
     */
    public function CLI_checkAndAcquireNewProcess($id)
    {
        $systemProcessId = getmypid();
        if (! $systemProcessId) {
            return false;
        }

        $activeProcesses = $this->processRepository->findAllActive();
        $currentTime = $this->getCurrentTime();

        $runningProcesses = 0;
        $orphanProcesses = [];

        /** @var Process $process */
        foreach ($activeProcesses as $process) {
            if ($process->getTtl() < $currentTime) {
                $orphanProcesses[] = $process->getProcessId();
                continue;
            }

            $runningProcesses++;
        }

        $processLimit = (int) $this->extensionSettings['processLimit'];

        // if there are less than allowed active processes then add a new one
        $acquired = $runningProcesses < $processLimit;
        if ($acquired) {
            $this->CLI_debug('add process ' . $this->CLI_buildProcessId() . ' (' . ($runningProcesses + 1) . '/' . $processLimit . ')');

            $processRecord = [
                'process_id' => $id,
                'active' => 1,
                'ttl' => $currentTime + (int) $this->extensionSettings['processMaxRunTime'],
                'system_process_id' => $systemProcessId,
            ];
            GeneralUtility::makeInstance(ConnectionPool::class)
                ->getConnectionForTable('tx_crawler_process')
                ->insert('tx_crawler_process', $processRecord);
        } else {
            $this->CLI_debug('Processlimit reached (' . $runningProcesses . '/' . $processLimit . ')');
        }

        // The cleanup runs regardless of whether a process could be acquired.
        $this->processRepository->deleteProcessesMarkedAsDeleted();
        $this->CLI_releaseProcesses($orphanProcesses);

        return $acquired;
    }
1725
1726
    /**
     * Release a process and the required resources
     *
     * @param mixed $releaseIds string with a single process-id or array with multiple process-ids
     * @return boolean false if there was nothing to release, true otherwise
     */
    public function CLI_releaseProcesses($releaseIds)
    {
        $processIds = is_array($releaseIds) ? $releaseIds : [$releaseIds];

        if (empty($processIds)) {
            //nothing to release
            return false;
        }

        $queryBuilder = GeneralUtility::makeInstance(ConnectionPool::class)->getQueryBuilderForTable($this->tableName);

        // some kind of 2nd chance algo - this way you need at least 2 processes to have a real cleanup
        // this ensures that a single process can't mess up the entire process table

        // Detach the queue entries that are still assigned to processes which are not active.
        $queryBuilder->update($this->tableName, 'q');
        $queryBuilder->where(
            'q.process_id IN(SELECT p.process_id FROM tx_crawler_process as p WHERE p.active = 0)'
        );
        $queryBuilder->set('q.process_scheduled', 0);
        $queryBuilder->set('q.process_id', '');
        $queryBuilder->execute();

        // FIXME: Not entirely sure that this is equivalent to the previous version
        // The same builder is reused for the next statement, hence the "set" part is cleared first.
        $queryBuilder->resetQueryPart('set');

        // Reset the system process id of inactive processes that are referenced by
        // queue entries with exec_time = 0.
        $queryBuilder->update('tx_crawler_process');
        $queryBuilder->where(
            $queryBuilder->expr()->eq('active', 0),
            'process_id IN(SELECT q.process_id FROM tx_crawler_queue as q WHERE q.exec_time = 0)'
        );
        $queryBuilder->set('system_process_id', 0);
        $queryBuilder->execute();

        $this->processRepository->markRequestedProcessesAsNotActive($processIds);
        $this->queueRepository->unsetProcessScheduledAndProcessIdForQueueEntries($processIds);

        return true;
    }
1776
1777
    /**
     * Create a unique Id for the current process
     *
     * The id is derived from the current microtime on first use and then kept
     * in $this->processID, so later calls return the stored value.
     *
     * @return string the ID
     */
    public function CLI_buildProcessId()
    {
        if ($this->processID) {
            return $this->processID;
        }

        $this->processID = GeneralUtility::shortMD5(microtime(true));

        return $this->processID;
    }
1789
1790
    /**
     * Prints a message to the stdout (only if debug-mode is enabled)
     *
     * Debug mode is read from the "processDebug" extension setting.
     *
     * @param string $msg the message
     * @deprecated
     * @codeCoverageIgnore
     */
    public function CLI_debug($msg): void
    {
        $debugEnabled = (int) $this->extensionSettings['processDebug'];
        if (! $debugEnabled) {
            return;
        }

        echo $msg . "\n";
        flush();
    }
1804
1805
    /**
     * Cleans up entries that stayed for too long in the queue. These are:
     * - processed entries older than the "cleanUpProcessedAge" extension setting (in days)
     * - scheduled entries older than the "cleanUpScheduledAge" extension setting (in days)
     *
     * @deprecated
     */
    public function cleanUpOldQueueEntries(): void
    {
        // 24*60*60 Seconds in 24 hours
        $secondsPerDay = 86400;

        // The settings may hold fractional days, so they are coerced to float before scaling.
        $processedAgeInSeconds = (float) $this->extensionSettings['cleanUpProcessedAge'] * $secondsPerDay;
        $scheduledAgeInSeconds = (float) $this->extensionSettings['cleanUpScheduledAge'] * $secondsPerDay;

        $now = time();

        // The thresholds are concatenated into a raw SQL condition; casting them to int
        // guarantees that only integer literals end up in the statement.
        $processedBefore = (int) ($now - $processedAgeInSeconds);
        $scheduledBefore = (int) ($now - $scheduledAgeInSeconds);

        $condition = '(exec_time<>0 AND exec_time<' . $processedBefore . ') OR scheduled<=' . $scheduledBefore;
        $this->flushQueue($condition);
    }
1822
1823
    /**
     * Removes queue entries
     *
     * Before the entries are deleted, one SIGNAL_QUEUE_ENTRY_FLUSH signal is emitted
     * per distinct set_id, carrying the matching qid/set_id pairs as "subSet".
     *
     * @param string $where SQL related filter for the entries which should be removed;
     *                      an empty filter matches every entry
     *
     * @deprecated
     */
    protected function flushQueue($where = ''): void
    {
        $realWhere = (string) $where !== '' ? $where : '1=1';

        // One builder instance is used for all statements below, in this order.
        $queryBuilder = $this->getQueryBuilder($this->tableName);

        $queryBuilder->selectLiteral('DISTINCT set_id');
        $queryBuilder->from($this->tableName);
        $queryBuilder->where($realWhere);
        $groups = $queryBuilder->execute()->fetchAll();

        foreach (is_array($groups) ? $groups : [] as $group) {
            $queryBuilder->select('qid', 'set_id');
            $queryBuilder->from($this->tableName);
            $queryBuilder->where(
                $realWhere,
                $queryBuilder->expr()->eq('set_id', $group['set_id'])
            );
            $subSet = $queryBuilder->execute()->fetchAll();

            SignalSlotUtility::emitSignal(
                self::class,
                SignalSlotUtility::SIGNAL_QUEUE_ENTRY_FLUSH,
                ['subSet' => $subSet]
            );
        }

        $queryBuilder->delete($this->tableName);
        $queryBuilder->where($realWhere);
        $queryBuilder->execute();
    }
1868
1869
    /**
     * This method determines duplicates for a queue entry with the same parameters and this timestamp.
     * If the timestamp is in the past, it will check if there is any unprocessed queue entry in the past.
     * If the timestamp is in the future it will check, if the queued entry has exactly the same timestamp
     *
     * Only entries with the same page_id and parameters_hash that match the
     * "NOT exec_time" and "NOT process_id" conditions are considered.
     *
     * @param int $tstamp Scheduled time of the entry to check
     * @param array $fieldArray Entry to check; "page_id" and "parameters_hash" are read
     *
     * @return array qids of the duplicate entries
     * @deprecated
     */
    protected function getDuplicateRowsIfExist($tstamp, $fieldArray)
    {
        $currentTime = $this->getCurrentTime();

        $queryBuilder = GeneralUtility::makeInstance(ConnectionPool::class)->getQueryBuilderForTable($this->tableName);
        $queryBuilder->select('qid');
        $queryBuilder->from('tx_crawler_queue');

        if ($tstamp > $currentTime) {
            //entry with a timestamp in the future need to have the same schedule time
            $queryBuilder->where($queryBuilder->expr()->eq('scheduled', $tstamp));
        } elseif ($tstamp <= $currentTime) {
            //if this entry is scheduled with "now"
            if ($this->extensionSettings['enableTimeslot']) {
                $timeBegin = $currentTime - 100;
                $timeEnd = $currentTime + 100;
                $queryBuilder->where('scheduled BETWEEN ' . $timeBegin . ' AND ' . $timeEnd);
                $queryBuilder->orWhere($queryBuilder->expr()->lte('scheduled', $currentTime));
            } else {
                $queryBuilder->where($queryBuilder->expr()->lte('scheduled', $currentTime));
            }
        }

        $pageIdParameter = $queryBuilder->createNamedParameter($fieldArray['page_id'], \PDO::PARAM_INT);
        $hashParameter = $queryBuilder->createNamedParameter($fieldArray['parameters_hash'], \PDO::PARAM_STR);

        $queryBuilder->andWhere('NOT exec_time');
        $queryBuilder->andWhere('NOT process_id');
        $queryBuilder->andWhere($queryBuilder->expr()->eq('page_id', $pageIdParameter));
        $queryBuilder->andWhere($queryBuilder->expr()->eq('parameters_hash', $hashParameter));

        $statement = $queryBuilder->execute();

        $duplicateQueueIds = [];
        while ($duplicate = $statement->fetch()) {
            $duplicateQueueIds[] = $duplicate['qid'];
        }

        return $duplicateQueueIds;
    }
1930
1931
    /**
     * Returns a md5 hash generated from a serialized configuration array.
     *
     * The "paramExpanded" and "URLs" entries are left out of the hash.
     *
     * @return string
     */
    protected function getConfigurationHash(array $configuration)
    {
        unset($configuration['paramExpanded'], $configuration['URLs']);

        $serializedConfiguration = serialize($configuration);

        return md5($serializedConfiguration);
    }
1942
1943
    /**
     * Build a URL from a Page and the Query String. If the page has a Site configuration, it can be built by using
     * the Site instance.
     *
     * This method only forwards its arguments to a new UrlService instance.
     *
     * @param int $httpsOrHttp see tx_crawler_configuration.force_ssl
     * @throws \TYPO3\CMS\Core\Exception\SiteNotFoundException
     * @throws \TYPO3\CMS\Core\Routing\InvalidRouteArgumentsException
     *
     * @deprecated Using CrawlerController::getUrlFromPageAndQueryParameters() is deprecated since 9.1.1 and will be removed in v11.x, please use UrlService->getUrlFromPageAndQueryParameters() instead.
     * @codeCoverageIgnore
     */
    protected function getUrlFromPageAndQueryParameters(int $pageId, string $queryString, ?string $alternativeBaseUrl, int $httpsOrHttp): UriInterface
    {
        return (new UrlService())->getUrlFromPageAndQueryParameters(
            $pageId,
            $queryString,
            $alternativeBaseUrl,
            $httpsOrHttp
        );
    }
1959
1960 1
    /**
     * Orders the values stored under the keys 1 and 2 of the given array:
     * they are exchanged when the value at key 1 is larger than the one at key 2.
     * All other entries are left untouched.
     */
    protected function swapIfFirstIsLargerThanSecond(array $reg): array
    {
        if ($reg[1] > $reg[2]) {
            [$reg[1], $reg[2]] = [$reg[2], $reg[1]];
        }

        return $reg;
    }
1971
1972
    /**
     * Returns the backend user, taking it from $GLOBALS['BE_USER'] on first use
     * and keeping it in $this->backendUser afterwards.
     *
     * @return BackendUserAuthentication
     */
    private function getBackendUser()
    {
        // Make sure the _cli_ user is loaded
        Bootstrap::initializeBackendAuthentication();

        $this->backendUser = $this->backendUser ?? $GLOBALS['BE_USER'];

        return $this->backendUser;
    }
1984
1985
    /**
     * Get querybuilder for given table
     *
     * @param string $table Name of the table the query builder is requested for
     * @return \TYPO3\CMS\Core\Database\Query\QueryBuilder
     */
    private function getQueryBuilder(string $table)
    {
        $connectionPool = GeneralUtility::makeInstance(ConnectionPool::class);

        return $connectionPool->getQueryBuilderForTable($table);
    }
1994
}
1995