Passed
Push — test/hardning ( 60f42e )
by Tomas Norre
05:52
created

CrawlerController::setMaximumUrlsToCompile()   A

Complexity

Conditions 1
Paths 1

Size

Total Lines 3
Code Lines 1

Duplication

Lines 0
Ratio 0 %

Code Coverage

Tests 2
CRAP Score 1

Importance

Changes 0
Metric Value
cc 1
eloc 1
nc 1
nop 1
dl 0
loc 3
ccs 2
cts 2
cp 1
crap 1
rs 10
c 0
b 0
f 0
1
<?php
2
3
declare(strict_types=1);
4
5
namespace AOE\Crawler\Controller;
6
7
/***************************************************************
8
 *  Copyright notice
9
 *
10
 *  (c) 2020 AOE GmbH <[email protected]>
11
 *
12
 *  All rights reserved
13
 *
14
 *  This script is part of the TYPO3 project. The TYPO3 project is
15
 *  free software; you can redistribute it and/or modify
16
 *  it under the terms of the GNU General Public License as published by
17
 *  the Free Software Foundation; either version 3 of the License, or
18
 *  (at your option) any later version.
19
 *
20
 *  The GNU General Public License can be found at
21
 *  http://www.gnu.org/copyleft/gpl.html.
22
 *
23
 *  This script is distributed in the hope that it will be useful,
24
 *  but WITHOUT ANY WARRANTY; without even the implied warranty of
25
 *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
26
 *  GNU General Public License for more details.
27
 *
28
 *  This copyright notice MUST APPEAR in all copies of the script!
29
 ***************************************************************/
30
31
use AOE\Crawler\Configuration\ExtensionConfigurationProvider;
32
use AOE\Crawler\Converter\JsonCompatibilityConverter;
33
use AOE\Crawler\Crawler;
34
use AOE\Crawler\CrawlStrategy\CrawlStrategyFactory;
35
use AOE\Crawler\Domain\Model\Process;
36
use AOE\Crawler\Domain\Repository\ConfigurationRepository;
37
use AOE\Crawler\Domain\Repository\ProcessRepository;
38
use AOE\Crawler\Domain\Repository\QueueRepository;
39
use AOE\Crawler\QueueExecutor;
40
use AOE\Crawler\Service\ConfigurationService;
41
use AOE\Crawler\Service\UrlService;
42
use AOE\Crawler\Utility\SignalSlotUtility;
43
use AOE\Crawler\Value\QueueFilter;
44
use PDO;
45
use Psr\Http\Message\UriInterface;
46
use Psr\Log\LoggerAwareInterface;
47
use Psr\Log\LoggerAwareTrait;
48
use TYPO3\CMS\Backend\Tree\View\PageTreeView;
49
use TYPO3\CMS\Backend\Utility\BackendUtility;
50
use TYPO3\CMS\Core\Authentication\BackendUserAuthentication;
51
use TYPO3\CMS\Core\Compatibility\PublicMethodDeprecationTrait;
52
use TYPO3\CMS\Core\Compatibility\PublicPropertyDeprecationTrait;
53
use TYPO3\CMS\Core\Core\Bootstrap;
54
use TYPO3\CMS\Core\Core\Environment;
55
use TYPO3\CMS\Core\Database\Connection;
56
use TYPO3\CMS\Core\Database\ConnectionPool;
57
use TYPO3\CMS\Core\Database\Query\QueryBuilder;
58
use TYPO3\CMS\Core\Database\Query\Restriction\DeletedRestriction;
59
use TYPO3\CMS\Core\Database\QueryGenerator;
60
use TYPO3\CMS\Core\Domain\Repository\PageRepository;
61
use TYPO3\CMS\Core\Exception\SiteNotFoundException;
62
use TYPO3\CMS\Core\Imaging\Icon;
63
use TYPO3\CMS\Core\Imaging\IconFactory;
64
use TYPO3\CMS\Core\Routing\InvalidRouteArgumentsException;
65
use TYPO3\CMS\Core\Site\Entity\Site;
66
use TYPO3\CMS\Core\Type\Bitmask\Permission;
67
use TYPO3\CMS\Core\TypoScript\Parser\TypoScriptParser;
68
use TYPO3\CMS\Core\Utility\DebugUtility;
69
use TYPO3\CMS\Core\Utility\GeneralUtility;
70
use TYPO3\CMS\Core\Utility\MathUtility;
71
use TYPO3\CMS\Extbase\Object\ObjectManager;
72
73
/**
74
 * Class CrawlerController
75
 *
76
 * @package AOE\Crawler\Controller
77
 */
78
class CrawlerController implements LoggerAwareInterface
79
{
80
    use LoggerAwareTrait;
81
    use PublicMethodDeprecationTrait;
82
    use PublicPropertyDeprecationTrait;
83
84
    public const CLI_STATUS_NOTHING_PROCCESSED = 0;
85
86
    //queue not empty
87
    public const CLI_STATUS_REMAIN = 1;
88
89
    //(some) queue items were processed
90
    public const CLI_STATUS_PROCESSED = 2;
91
92
    //instance didn't finish
93
    public const CLI_STATUS_ABORTED = 4;
94
95
    public const CLI_STATUS_POLLABLE_PROCESSED = 8;
96
97
    /**
98
     * @var integer
99
     */
100
    public $setID = 0;
101
102
    /**
103
     * @var string
104
     */
105
    public $processID = '';
106
107
    /**
108
     * @var array
109
     */
110
    public $duplicateTrack = [];
111
112
    /**
113
     * @var array
114
     */
115
    public $downloadUrls = [];
116
117
    /**
118
     * @var array
119
     */
120
    public $incomingProcInstructions = [];
121
122
    /**
123
     * @var array
124
     */
125
    public $incomingConfigurationSelection = [];
126
127
    /**
128
     * @var bool
129
     */
130
    public $registerQueueEntriesInternallyOnly = false;
131
132
    /**
133
     * @var array
134
     */
135
    public $queueEntries = [];
136
137
    /**
138
     * @var array
139
     */
140
    public $urlList = [];
141
142
    /**
143
     * @var array
144
     */
145
    public $extensionSettings = [];
146
147
    /**
148
     * Mount Point
149
     *
150
     * @var bool
151
     * Todo: Check what this is used for and adjust the type hint or code, as bool doesn't match the current code.
152
     */
153
    public $MP = false;
154
155
    /**
156
     * @var string
157
     * @deprecated
158
     */
159
    protected $processFilename;
160
161
    /**
162
     * Holds the internal access mode can be 'gui','cli' or 'cli_im'
163
     *
164
     * @var string
165
     * @deprecated
166
     */
167
    protected $accessMode;
168
169
    /**
170
     * @var QueueRepository
171
     */
172
    protected $queueRepository;
173
174
    /**
175
     * @var ProcessRepository
176
     */
177
    protected $processRepository;
178
179
    /**
180
     * @var ConfigurationRepository
181
     */
182
    protected $configurationRepository;
183
184
    /**
185
     * @var string
186
     */
187
    protected $tableName = 'tx_crawler_queue';
188
189
    /**
190
     * @var QueueExecutor
191
     */
192
    protected $queueExecutor;
193
194
    /**
195
     * @var int
196
     */
197
    protected $maximumUrlsToCompile = 10000;
198
199
    /**
200
     * @var IconFactory
201
     */
202
    protected $iconFactory;
203
204
    /**
205
     * @var string[]
206
     */
207
    private $deprecatedPublicMethods = [
0 ignored issues
show
introduced by
The private property $deprecatedPublicMethods is not used, and could be removed.
Loading history...
208
        'cleanUpOldQueueEntries' => 'Using CrawlerController::cleanUpOldQueueEntries() is deprecated since 9.0.1 and will be removed in v11.x, please use QueueRepository->cleanUpOldQueueEntries() instead.',
209
        'CLI_debug' => 'Using CrawlerController->CLI_debug() is deprecated since 9.1.3 and will be removed in v11.x',
210
        'CLI_runHooks' => 'Using CrawlerController->CLI_runHooks() is deprecated since 9.1.5 and will be removed in v11.x',
211
        'getAccessMode' => 'Using CrawlerController->getAccessMode() is deprecated since 9.1.3 and will be removed in v11.x',
212
        'getLogEntriesForPageId' => 'Using CrawlerController->getLogEntriesForPageId() is deprecated since 9.1.5 and will be remove in v11.x',
213
        'getLogEntriesForSetId' => 'Using crawlerController::getLogEntriesForSetId() is deprecated since 9.0.1 and will be removed in v11.x',
214
        'flushQueue' => 'Using CrawlerController::flushQueue() is deprecated since 9.0.1 and will be removed in v11.x, please use QueueRepository->flushQueue() instead.',
215
        'setAccessMode' => 'Using CrawlerController->setAccessMode() is deprecated since 9.1.3 and will be removed in v11.x',
216
        'getDisabled' => 'Using CrawlerController->getDisabled() is deprecated since 9.1.3 and will be removed in v11.x, please use Crawler->isDisabled() instead',
217
        'setDisabled' => 'Using CrawlerController->setDisabled() is deprecated since 9.1.3 and will be removed in v11.x, please use Crawler->setDisabled() instead',
218
        'getProcessFilename' => 'Using CrawlerController->getProcessFilename() is deprecated since 9.1.3 and will be removed in v11.x',
219
        'setProcessFilename' => 'Using CrawlerController->setProcessFilename() is deprecated since 9.1.3 and will be removed in v11.x',
220
        'getDuplicateRowsIfExist' => 'Using CrawlerController->getDuplicateRowsIfExist() is deprecated since 9.1.4 and will be remove in v11.x, please use QueueRepository->getDuplicateQueueItemsIfExists() instead',
221
    ];
222
223
    /**
     * Deprecation notices for formerly public properties, keyed by property name.
     *
     * NOTE(review): nothing in this class reads the array directly; it is presumably
     * consumed by PublicPropertyDeprecationTrait - confirm before removing it.
     *
     * @var string[]
     */
    private $deprecatedPublicProperties = [
        'accessMode' => 'Using CrawlerController->accessMode is deprecated since 9.1.3 and will be removed in v11.x',
        'processFilename' => 'Using CrawlerController->processFilename is deprecated since 9.1.3 and will be removed in v11.x',
    ];
230
231
    /**
232
     * @var BackendUserAuthentication|null
233
     */
234
    private $backendUser;
235
236
    /**
237
     * @var integer
238
     */
239
    private $scheduledTime = 0;
240
241
    /**
242
     * @var integer
243
     */
244
    private $reqMinute = 0;
245
246
    /**
247
     * @var bool
248
     */
249
    private $submitCrawlUrls = false;
250
251
    /**
252
     * @var bool
253
     */
254
    private $downloadCrawlUrls = false;
255
256
    /**
257
     * @var PageRepository
258
     */
259
    private $pageRepository;
260
261
    /**
262
     * @var Crawler
263
     */
264
    private $crawler;
265
266
    /************************************
267
     *
268
     * Getting URLs based on Page TSconfig
269
     *
270
     ************************************/
271
272 36
    /**
     * Wires up repositories and services and normalises the extension settings.
     *
     * After construction "countInARun" is set to 100 when it is missing or not a positive
     * integer, "processLimit" is forced into 1..99 (default 1) and the maximum number of
     * URLs to compile is forced into 1..1000000000 (default 10000).
     */
    public function __construct()
    {
        $objectManager = GeneralUtility::makeInstance(ObjectManager::class);
        $crawlStrategyFactory = GeneralUtility::makeInstance(CrawlStrategyFactory::class);
        $this->queueRepository = $objectManager->get(QueueRepository::class);
        $this->processRepository = $objectManager->get(ProcessRepository::class);
        $this->configurationRepository = $objectManager->get(ConfigurationRepository::class);
        $this->pageRepository = GeneralUtility::makeInstance(PageRepository::class);
        $this->queueExecutor = GeneralUtility::makeInstance(QueueExecutor::class, $crawlStrategyFactory);
        $this->iconFactory = GeneralUtility::makeInstance(IconFactory::class);
        $this->crawler = GeneralUtility::makeInstance(Crawler::class);

        // Deprecated property, still initialised for the deprecated accessors that read it.
        $this->processFilename = Environment::getVarPath() . '/lock/tx_crawler.proc';

        /** @var ExtensionConfigurationProvider $configurationProvider */
        $configurationProvider = GeneralUtility::makeInstance(ExtensionConfigurationProvider::class);
        $settings = $configurationProvider->getExtensionConfiguration();
        $this->extensionSettings = is_array($settings) ? $settings : [];

        // set defaults; "?? 0" keeps a missing key from raising an undefined-index warning
        // and lets the fallback value of each helper apply.
        if (MathUtility::convertToPositiveInteger($this->extensionSettings['countInARun'] ?? 0) === 0) {
            $this->extensionSettings['countInARun'] = 100;
        }

        $this->extensionSettings['processLimit'] = MathUtility::forceIntegerInRange(
            $this->extensionSettings['processLimit'] ?? 0,
            1,
            99,
            1
        );
        $this->setMaximumUrlsToCompile(
            MathUtility::forceIntegerInRange($this->extensionSettings['maxCompileUrls'] ?? 0, 1, 1000000000, 10000)
        );
    }
299
300 40
    /**
     * Overrides the upper bound for the number of URLs to compile.
     *
     * The value is stored as given; the constructor is the only place in view that
     * clamps it to a range before calling this setter.
     */
    public function setMaximumUrlsToCompile(int $maximumUrlsToCompile): void
    {
        $this->maximumUrlsToCompile = $maximumUrlsToCompile;
    }
304
305
    /**
     * Returns the internal access mode, which can be 'gui', 'cli' or 'cli_im'.
     *
     * @return string
     * @deprecated
     */
    public function getAccessMode()
    {
        return $this->accessMode;
    }
315
316
    /**
     * Stores the internal access mode, which can be 'gui', 'cli' or 'cli_im'.
     *
     * @param string $accessMode
     * @deprecated
     */
    public function setAccessMode($accessMode): void
    {
        $this->accessMode = $accessMode;
    }
324
325
    /**
     * Set disabled status to prevent processes from being processed.
     *
     * Disabling writes the marker file at $this->processFilename; enabling removes
     * that file when it exists.
     *
     * @deprecated please use Crawler->setDisabled() instead
     */
    public function setDisabled(?bool $disabled = true): void
    {
        $markerFile = $this->processFilename;

        if ($disabled) {
            GeneralUtility::writeFile($markerFile, 'disabled');
            return;
        }

        if (is_file($markerFile)) {
            unlink($markerFile);
        }
    }
337
338
    /**
     * Get disable status: true while the marker file at $this->processFilename exists.
     *
     * @deprecated please use Crawler->isDisabled() instead
     */
    public function getDisabled(): bool
    {
        $markerFile = $this->processFilename;

        return is_file($markerFile);
    }
346
347
    /**
     * Replaces the path of the marker file used by setDisabled() and getDisabled().
     *
     * @param string $filenameWithPath
     * @deprecated
     */
    public function setProcessFilename($filenameWithPath): void
    {
        $this->processFilename = $filenameWithPath;
    }
355
356
    /**
     * Returns the path of the marker file used by setDisabled() and getDisabled().
     *
     * @return string
     * @deprecated
     */
    public function getProcessFilename()
    {
        return $this->processFilename;
    }
364
365
    /**
     * Sets the extensions settings (unserialized pendant of $TYPO3_CONF_VARS['EXT']['extConf']['crawler']).
     *
     * The array replaces the current settings as a whole; the defaults applied by the
     * constructor are not re-applied here.
     */
    public function setExtensionSettings(array $extensionSettings): void
    {
        $this->extensionSettings = $extensionSettings;
    }
372
373
    /**
     * Check if the given page should be crawled.
     *
     * Checks run in this order: hidden pages (unless the "crawlHiddenPages" setting is truthy),
     * the fixed doktype list 3,4,199,254,255, the doktype lists registered under
     * $GLOBALS['TYPO3_CONF_VARS']['EXTCONF']['crawler']['excludeDoktype'] and finally the veto
     * hooks registered under $GLOBALS['TYPO3_CONF_VARS']['EXTCONF']['crawler']['pageVeto'].
     *
     * @param array $pageRow Page record; "doktype" is read, "hidden" is treated as not set when missing
     * @return false|string false if the page should be crawled (not excluded), otherwise the skip message
     */
    public function checkIfPageShouldBeSkipped(array $pageRow)
    {
        // if page is hidden; empty() keeps a missing setting or column from raising a warning
        if (empty($this->extensionSettings['crawlHiddenPages']) && ! empty($pageRow['hidden'])) {
            return 'Because page is hidden';
        }

        if (GeneralUtility::inList('3,4,199,254,255', $pageRow['doktype'])) {
            return 'Because doktype is not allowed';
        }

        foreach ($GLOBALS['TYPO3_CONF_VARS']['EXTCONF']['crawler']['excludeDoktype'] ?? [] as $key => $doktypeList) {
            if (GeneralUtility::inList($doktypeList, $pageRow['doktype'])) {
                return 'Doktype was excluded by "' . $key . '"';
            }
        }

        // veto hook
        foreach ($GLOBALS['TYPO3_CONF_VARS']['EXTCONF']['crawler']['pageVeto'] ?? [] as $key => $func) {
            $params = [
                'pageRow' => $pageRow,
            ];
            // expects "false" if page is ok and "true" or a skipMessage if this page should _not_ be crawled
            $veto = GeneralUtility::callUserFunction($func, $params, $this);
            if ($veto === false) {
                continue;
            }
            if (is_string($veto)) {
                return $veto;
            }
            // Hooks may be registered with numeric keys; htmlspecialchars() rejects
            // an int under strict_types, hence the explicit cast.
            return 'Veto from hook "' . htmlspecialchars((string) $key) . '"';
        }

        return false;
    }
412
413
    /**
     * Wrapper method for getUrlsForPageId()
     * It returns an array of configurations and no urls!
     *
     * An empty array is returned, with $skipMessage holding the reason, when the uid is
     * not an integer or when checkIfPageShouldBeSkipped() vetoes the page.
     *
     * @param array $pageRow Page record with at least dok-type and uid columns.
     * @param string $skipMessage Receives the skip reason, or '' when the page is processed
     * @return array
     * @see getUrlsForPageId()
     */
    public function getUrlsForPageRow(array $pageRow, &$skipMessage = '')
    {
        if (! is_int($pageRow['uid'])) {
            $skipMessage = 'PageUid ' . $pageRow['uid'] . ' was not an integer';
            return [];
        }

        $veto = $this->checkIfPageShouldBeSkipped($pageRow);
        if ($veto !== false) {
            $skipMessage = $veto;
            return [];
        }

        $configurations = $this->getUrlsForPageId($pageRow['uid']);
        $skipMessage = '';

        return $configurations;
    }
440
441
    /**
     * Creates a list of URLs from input array (and submits them to queue if asked for)
     * See Web > Info module script + "indexed_search"'s crawler hook-client using this!
     *
     * @param array $vv Information about URLs from pageRow to crawl.
     * @param array $pageRow Page row
     * @param int $scheduledTime Unix time to schedule indexing to, typically time()
     * @param int $reqMinute Number of requests per minute (creates the interleave between requests);
     *                       a value of 0 or less disables the interleave instead of dividing by zero
     * @param bool $submitCrawlUrls If set, submits the URLs to queue
     * @param bool $downloadCrawlUrls If set (and $submitCrawlUrls is false) will fill $downloadUrls with entries
     * @param array $duplicateTrack Array which is passed by reference and contains an id per url to ensure we will not crawl duplicates
     * @param array $downloadUrls Array which will be filled with URLs for download if flag is set.
     * @param array $incomingProcInstructions Array of processing instructions
     * @return string List of URLs (meant for display in backend module)
     */
    public function urlListFromUrlArray(
        array $vv,
        array $pageRow,
        $scheduledTime,
        $reqMinute,
        $submitCrawlUrls,
        $downloadCrawlUrls,
        array &$duplicateTrack,
        array &$downloadUrls,
        array $incomingProcInstructions
    ) {
        if (! is_array($vv['URLs'] ?? null)) {
            return 'ERROR - no URL generated';
        }
        $urlLog = [];
        $pageId = (int) $pageRow['uid'];
        $configurationHash = $this->getConfigurationHash($vv);
        $skipInnerCheck = $this->queueRepository->noUnprocessedQueueEntriesForPageWithConfigurationHashExist($pageId, $configurationHash);

        // Optional sub configuration values; configurations coming from page TSconfig
        // do not necessarily define them.
        $procInstrFilter = $vv['subCfg']['procInstrFilter'] ?? '';
        $userGroups = $vv['subCfg']['userGroups'] ?? '';

        // The filter result is the same for every URL of this configuration, so it is
        // evaluated once; a mismatch means no URL is listed at all.
        if (! $this->drawURLs_PIfilter($procInstrFilter, $incomingProcInstructions)) {
            return '';
        }

        // Seconds between two consecutive requests.
        $secondsPerRequest = $reqMinute > 0 ? 60 / $reqMinute : 0;

        $urlService = new UrlService();

        foreach ($vv['URLs'] as $urlQuery) {
            $url = (string) $urlService->getUrlFromPageAndQueryParameters(
                $pageId,
                $urlQuery,
                $vv['subCfg']['baseUrl'] ?? null,
                $vv['subCfg']['force_ssl'] ?? 0
            );

            // Create key by which to determine unique-ness:
            $uKey = $url . '|' . $userGroups . '|' . $procInstrFilter;

            if (isset($duplicateTrack[$uKey])) {
                // if the url key is registered just display it and do not resubmit it
                $urlLog[] = '<em><span class="text-muted">' . htmlspecialchars($url) . '</span></em>';
            } else {
                // Scheduled time, rounded down to the full minute:
                $schTime = $scheduledTime + round(count($duplicateTrack) * $secondsPerRequest);
                $schTime = intval($schTime / 60) * 60;
                $formattedDate = BackendUtility::datetime($schTime);
                $this->urlList[] = '[' . $formattedDate . '] ' . $url;
                $urlList = '[' . $formattedDate . '] ' . htmlspecialchars($url);

                // Submit for crawling!
                if ($submitCrawlUrls) {
                    // NOTE(review): the un-interleaved $scheduledTime is passed here, not $schTime,
                    // so the interleave only shows up in the displayed list - confirm this is intended.
                    $added = $this->addUrl(
                        $pageId,
                        $url,
                        $vv['subCfg'],
                        $scheduledTime,
                        $configurationHash,
                        $skipInnerCheck
                    );
                    if ($added === false) {
                        $urlList .= ' (URL already existed)';
                    }
                } elseif ($downloadCrawlUrls) {
                    $downloadUrls[$url] = $url;
                }
                $urlLog[] = $urlList;
            }
            $duplicateTrack[$uKey] = true;
        }

        return implode('<br>', $urlLog);
    }
525
526
    /**
     * Tells whether a processing instruction list matches the requested instructions.
     *
     * With no requested instructions everything matches; otherwise at least one of
     * them has to be an item of the comma-separated $piString.
     *
     * @param string $piString PI to test
     * @param array $incomingProcInstructions Processing instructions
     * @return bool
     */
    public function drawURLs_PIfilter($piString, array $incomingProcInstructions)
    {
        if ($incomingProcInstructions === []) {
            return true;
        }

        foreach ($incomingProcInstructions as $requestedInstruction) {
            if (GeneralUtility::inList($piString, $requestedInstruction)) {
                return true;
            }
        }

        return false;
    }
546
547 5
    /**
     * Loads the page TSconfig for a page and lets registered hooks alter it.
     *
     * When $this->MP is set, the TSconfig is read for the id taken from the part of
     * $this->MP after the first "-" instead of $id.
     * NOTE(review): presumably $this->MP holds a "<a>-<b>" mount point string; the declared
     * type is bool - confirm the expected format.
     *
     * Hooks registered under $GLOBALS['TYPO3_CONF_VARS']['EXTCONF']['crawler']['getPageTSconfigForId']
     * receive "pageId" and, by reference, "pageTSConfig".
     *
     * @param int $id Page id
     */
    public function getPageTSconfigForId($id): array
    {
        if (! $this->MP) {
            $pageTSconfig = BackendUtility::getPagesTSconfig($id);
        } else {
            // The string cast keeps explode() from failing under strict_types when MP is
            // not a string; a missing second segment falls back to 0.
            $mountPointParts = explode('-', (string) $this->MP);
            $mountPointId = (int) ($mountPointParts[1] ?? 0);
            $pageTSconfig = BackendUtility::getPagesTSconfig($mountPointId);
        }

        // Call a hook to alter configuration
        $hooks = $GLOBALS['TYPO3_CONF_VARS']['EXTCONF']['crawler']['getPageTSconfigForId'] ?? [];
        if (is_array($hooks) && $hooks !== []) {
            $params = [
                'pageId' => $id,
                'pageTSConfig' => &$pageTSconfig,
            ];
            foreach ($hooks as $userFunc) {
                GeneralUtility::callUserFunction($userFunc, $params, $this);
            }
        }

        return $pageTSconfig;
    }
569
570
    /**
     * This method returns an array of configurations.
     * Adds no urls!
     *
     * Configurations are read from page TSconfig (tx_crawler.crawlerCfg.paramSets) first and
     * then from the configuration records returned by the configuration repository for the
     * rootline; a record never overwrites a paramSet with the same name. Finally the hooks
     * registered under $GLOBALS['TYPO3_CONF_VARS']['EXTCONF']['crawler']['processUrls'] may
     * alter the result by reference.
     *
     * @return array Configurations keyed by name, each with "subCfg", "paramParsed", "paramExpanded", "URLs" and "origin"
     */
    public function getUrlsForPageId(int $pageId): array
    {
        // Get page TSconfig for page ID
        $pageTSconfig = $this->getPageTSconfigForId($pageId);

        $res = [];

        // Fetch Crawler Configuration from pageTSconfig
        $crawlerCfg = $pageTSconfig['tx_crawler.']['crawlerCfg.']['paramSets.'] ?? [];
        foreach ($crawlerCfg as $key => $values) {
            if (! is_array($values)) {
                continue;
            }
            $key = str_replace('.', '', (string) $key);
            // Sub configuration for a single configuration string:
            $subCfg = (array) $crawlerCfg[$key . '.'];
            $subCfg['key'] = $key;

            $procInstrFilter = (string) ($subCfg['procInstrFilter'] ?? '');
            if ($procInstrFilter !== '') {
                $subCfg['procInstrFilter'] = implode(',', GeneralUtility::trimExplode(',', $procInstrFilter));
            }

            // "pidsOnly" is optional; a missing value means the configuration is not page-specific.
            $pidsOnly = (string) ($subCfg['pidsOnly'] ?? '');
            $pidOnlyList = implode(',', GeneralUtility::trimExplode(',', $pidsOnly, true));

            // process configuration if it is not page-specific or if the specific page is the current page:
            // TODO: Check if $pidOnlyList can be kept as Array instead of imploded
            if ($pidsOnly !== '' && ! GeneralUtility::inList($pidOnlyList, (string) $pageId)) {
                continue;
            }

            // Explode, process etc.:
            $paramParsed = GeneralUtility::explodeUrl2Array($crawlerCfg[$key] ?? '');
            $paramExpanded = $this->expandParameters($paramParsed, $pageId);

            // recognize MP value
            $urlBase = '?id=' . $pageId;
            if ($this->MP) {
                $urlBase .= '&MP=' . $this->MP;
            }
            $urls = $this->compileUrls($paramExpanded, [$urlBase]);

            $res[$key] = [
                'subCfg' => $subCfg,
                'paramParsed' => $paramParsed,
                'paramExpanded' => $paramExpanded,
                'origin' => 'pagets',
                'URLs' => $urls,
            ];
        }

        // Get configuration from tx_crawler_configuration records up the rootline
        $crawlerConfigurations = $this->configurationRepository->getCrawlerConfigurationRecordsFromRootLine($pageId);
        foreach ($crawlerConfigurations as $configurationRecord) {
            // check access to the configuration record
            $hasAccess = empty($configurationRecord['begroups'])
                || $this->getBackendUser()->isAdmin()
                || $this->hasGroupAccess($this->getBackendUser()->user['usergroup_cached_list'], $configurationRecord['begroups']);
            if (! $hasAccess) {
                continue;
            }

            // Cast as in the TSconfig branch above: strcmp()-style comparisons need a string under strict_types.
            $pidsOnly = (string) ($configurationRecord['pidsonly'] ?? '');
            $pidOnlyList = implode(',', GeneralUtility::trimExplode(',', $pidsOnly, true));

            // process configuration if it is not page-specific or if the specific page is the current page:
            // TODO: Check if $pidOnlyList can be kept as Array instead of imploded
            if ($pidsOnly !== '' && ! GeneralUtility::inList($pidOnlyList, (string) $pageId)) {
                continue;
            }

            $key = $configurationRecord['name'];

            // don't overwrite previously defined paramSets
            if (isset($res[$key])) {
                continue;
            }

            /* @var $TSparserObject TypoScriptParser */
            $TSparserObject = GeneralUtility::makeInstance(TypoScriptParser::class);
            $TSparserObject->parse($configurationRecord['processing_instruction_parameters_ts']);

            $subCfg = [
                'procInstrFilter' => $configurationRecord['processing_instruction_filter'],
                'procInstrParams.' => $TSparserObject->setup,
                'baseUrl' => $configurationRecord['base_url'],
                'force_ssl' => (int) $configurationRecord['force_ssl'],
                'userGroups' => $configurationRecord['fegroups'],
                'exclude' => $configurationRecord['exclude'],
                'key' => $key,
            ];

            if (in_array($pageId, $this->expandExcludeString($subCfg['exclude']), true)) {
                continue;
            }

            $paramParsed = GeneralUtility::explodeUrl2Array($configurationRecord['configuration']);
            $paramExpanded = $this->expandParameters($paramParsed, $pageId);
            $urls = $this->compileUrls($paramExpanded, ['?id=' . $pageId]);

            $res[$key] = [
                'subCfg' => $subCfg,
                'paramParsed' => $paramParsed,
                'paramExpanded' => $paramExpanded,
                'URLs' => $urls,
                'origin' => 'tx_crawler_configuration_' . $configurationRecord['uid'],
            ];
        }

        foreach ($GLOBALS['TYPO3_CONF_VARS']['EXTCONF']['crawler']['processUrls'] ?? [] as $func) {
            $params = [
                'res' => &$res,
            ];
            GeneralUtility::callUserFunction($func, $params, $this);
        }

        return $res;
    }
668
669
    /**
     * Collects the names of all crawler configurations that apply to a page branch.
     *
     * The result contains the keys of the "tx_crawler.crawlerCfg.paramSets." page TSconfig
     * sub-arrays of the root page (trailing dot stripped), followed by the names of all
     * tx_crawler_configuration records stored on the root line of $rootid or on pages of
     * the tree below it that the backend user may see.
     *
     * TODO: Write Functional Tests
     *
     * @param int $rootid Page uid the branch starts at
     * @param int $depth Number of tree levels handed to PageTreeView::getTree()
     * @return array List of configuration names (may contain duplicates)
     */
    public function getConfigurationsForBranch(int $rootid, int $depth): array
    {
        $names = [];

        // Configurations defined in page TSconfig: only array values are parameter sets.
        $tsConfig = $this->getPageTSconfigForId($rootid);
        $paramSets = $tsConfig['tx_crawler.']['crawlerCfg.']['paramSets.'] ?? [];
        foreach ($paramSets as $setKey => $setValue) {
            if (is_array($setValue)) {
                $names[] = substr($setKey, -1) === '.' ? substr($setKey, 0, -1) : $setKey;
            }
        }

        // Page uids of the root line ...
        $pageUids = [];
        foreach (BackendUtility::BEgetRootLine($rootid) as $rootLinePage) {
            $pageUids[] = $rootLinePage['uid'];
        }

        // ... and of the visible sub tree.
        /* @var PageTreeView $pageTree */
        $pageTree = GeneralUtility::makeInstance(PageTreeView::class);
        $permissionClause = $this->getBackendUser()->getPagePermsClause(Permission::PAGE_SHOW);
        $treeClause = '';
        if (! empty($permissionClause)) {
            $treeClause = 'AND ' . $permissionClause;
        }
        $pageTree->init($treeClause);
        $pageTree->getTree($rootid, $depth, '');
        foreach ($pageTree->tree as $treeNode) {
            $pageUids[] = $treeNode['row']['uid'];
        }

        // Configuration records stored on any of the collected pages.
        $queryBuilder = $this->getQueryBuilder('tx_crawler_configuration');
        $pidConstraint = $queryBuilder->expr()->in(
            'pid',
            $queryBuilder->createNamedParameter($pageUids, Connection::PARAM_INT_ARRAY)
        );
        $result = $queryBuilder
            ->select('name')
            ->from('tx_crawler_configuration')
            ->where($pidConstraint)
            ->execute();

        foreach ($result->fetchAll() as $record) {
            $names[] = $record['name'];
        }

        return $names;
    }
712
713
    /**
     * Checks whether a user may access an item based on group membership
     * (e.g. pass the group list of the current logged in user from $GLOBALS['TSFE']->gr_list).
     *
     * @param string $groupList Comma-separated list of (fe_)group UIDs from a user
     * @param string $accessList Comma-separated list of (fe_)group UIDs of the item to access
     * @return bool TRUE if the access list is empty or at least one of the user's group UIDs is in it
     * @see \TYPO3\CMS\Frontend\Page\PageRepository::getMultipleGroupsWhereClause()
     */
    public function hasGroupAccess($groupList, $accessList)
    {
        // An item without access restriction is available to everybody.
        if (! empty($accessList)) {
            $userGroupUids = GeneralUtility::intExplode(',', $groupList);
            foreach ($userGroupUids as $userGroupUid) {
                if (GeneralUtility::inList($accessList, $userGroupUid)) {
                    return true;
                }
            }

            return false;
        }

        return true;
    }
734
735
    /**
     * Will expand the parameters configuration to individual values. This follows a certain syntax of the value of each parameter.
     * Syntax of values:
     * - Basically: If the value is wrapped in [...] it will be expanded according to the following syntax, otherwise the value is taken literally
     * - Configuration is split by "|" and the parts are processed individually and finally added together
     * - For each configuration part:
     *         - "[int]-[int]" = Integer range, will be expanded to all values in between, values included, starting from low to high (max. 1000). Example "1-34" or "-40--30"
     *         - "_TABLE:[TCA table name];[_PID:[optional page id, default is current page]];[_ENABLELANG:1]" = Look up of table records from PID, filtering out deleted records. Example "_TABLE:tt_content; _PID:123"
     *        _ENABLELANG:1 picks only original records without their language overlays
     *        Further optional sub parts read by the implementation: _RECURSIVE (tree depth for the pid lookup),
     *        _PIDFIELD (field compared against the pid list, default "pid"), _FIELD (field whose values are used, default "uid"),
     *        _WHERE (additional where clause) and _ADDTABLE (additional "from" part)
     *         - Default: Literal value
     *
     * @param array $paramArray Array with key (GET var name) and values (value of GET var which is configuration for expansion)
     * @param int $pid Current page ID
     * @return array The input array where every value has been replaced by an array of expanded values
     *
     * TODO: Write Functional Tests
     */
    public function expandParameters($paramArray, $pid)
    {
        // Resolve the hook registration once; a missing registration must not raise an undefined-index notice.
        $expandParametersHooks = $GLOBALS['TYPO3_CONF_VARS']['SC_OPTIONS']['crawler/class.tx_crawler_lib.php']['expandParameters'] ?? null;

        // Traverse parameter names:
        foreach ($paramArray as $p => $v) {
            $v = trim($v);

            // If value is encapsulated in square brackets it means there are some ranges of values to find, otherwise the value is literal
            if (strpos($v, '[') === 0 && substr($v, -1) === ']') {
                // So, find the value inside brackets and reset the paramArray value as an array.
                $v = substr($v, 1, -1);
                $paramArray[$p] = [];

                // Explode parts and traverse them:
                $parts = explode('|', $v);
                foreach ($parts as $pV) {
                    // Look for integer range: (fx. 1-34 or -40--30 // reads minus 40 to minus 30)
                    if (preg_match('/^(-?[0-9]+)\s*-\s*(-?[0-9]+)$/', trim($pV), $reg)) {
                        $reg = $this->swapIfFirstIsLargerThanSecond($reg);

                        // Traverse range, add values:
                        // Limit to size of range!
                        $runAwayBrake = 1000;
                        for ($a = $reg[1]; $a <= $reg[2]; $a++) {
                            $paramArray[$p][] = $a;
                            $runAwayBrake--;
                            if ($runAwayBrake <= 0) {
                                break;
                            }
                        }
                    } elseif (strpos(trim($pV), '_TABLE:') === 0) {
                        // Parse parameters:
                        $subparts = GeneralUtility::trimExplode(';', $pV);
                        $subpartParams = [];
                        foreach ($subparts as $spV) {
                            // Pad to two elements so a sub part without ":" yields a null value
                            // instead of an undefined-offset notice.
                            [$pKey, $pVal] = array_pad(GeneralUtility::trimExplode(':', $spV), 2, null);
                            $subpartParams[$pKey] = $pVal;
                        }

                        // Table exists:
                        if (isset($GLOBALS['TCA'][$subpartParams['_TABLE']])) {
                            $tableName = $subpartParams['_TABLE'];
                            $lookUpPid = isset($subpartParams['_PID']) ? intval($subpartParams['_PID']) : intval($pid);
                            $recursiveDepth = isset($subpartParams['_RECURSIVE']) ? intval($subpartParams['_RECURSIVE']) : 0;
                            $pidField = isset($subpartParams['_PIDFIELD']) ? trim($subpartParams['_PIDFIELD']) : 'pid';
                            $where = $subpartParams['_WHERE'] ?? '';
                            $addTable = $subpartParams['_ADDTABLE'] ?? '';

                            // _FIELD is optional: fall back to "uid" when it is missing or empty.
                            $fieldName = ! empty($subpartParams['_FIELD']) ? $subpartParams['_FIELD'] : 'uid';
                            if ($fieldName === 'uid' || ! empty($GLOBALS['TCA'][$tableName]['columns'][$fieldName])) {
                                $queryBuilder = $this->getQueryBuilder($tableName);

                                if ($recursiveDepth > 0) {
                                    /** @var QueryGenerator $queryGenerator */
                                    $queryGenerator = GeneralUtility::makeInstance(QueryGenerator::class);
                                    $pidList = $queryGenerator->getTreeList($lookUpPid, $recursiveDepth, 0, 1);
                                    $pidArray = GeneralUtility::intExplode(',', $pidList);
                                } else {
                                    $pidArray = [(string) $lookUpPid];
                                }

                                // Only the "deleted" restriction is applied to the lookup.
                                $queryBuilder->getRestrictions()
                                    ->removeAll()
                                    ->add(GeneralUtility::makeInstance(DeletedRestriction::class));

                                // NOTE(review): $where is taken verbatim from the crawler configuration and
                                // added as raw SQL - it must only come from trusted configuration.
                                $queryBuilder
                                    ->select($fieldName)
                                    ->from($tableName)
                                    ->where(
                                        $queryBuilder->expr()->in($pidField, $queryBuilder->createNamedParameter($pidArray, Connection::PARAM_INT_ARRAY)),
                                        $where
                                    );

                                if (! empty($addTable)) {
                                    // TODO: Check if this works as intended!
                                    $queryBuilder->add('from', $addTable);
                                }

                                // Tables without language support have no transOrigPointerField.
                                $transOrigPointerField = $GLOBALS['TCA'][$tableName]['ctrl']['transOrigPointerField'] ?? null;

                                if (! empty($subpartParams['_ENABLELANG']) && $transOrigPointerField) {
                                    $queryBuilder->andWhere(
                                        $queryBuilder->expr()->lte(
                                            $transOrigPointerField,
                                            0
                                        )
                                    );
                                }

                                $statement = $queryBuilder->execute();

                                // Index by field value so each value is only collected once.
                                $rows = [];
                                while ($row = $statement->fetch()) {
                                    $rows[$row[$fieldName]] = $row;
                                }

                                $paramArray[$p] = array_merge($paramArray[$p], array_keys($rows));
                            }
                        }
                    } else {
                        // Just add value:
                        $paramArray[$p][] = $pV;
                    }

                    // Hook for processing own expandParameters place holder
                    if (is_array($expandParametersHooks)) {
                        $_params = [
                            'pObj' => &$this,
                            'paramArray' => &$paramArray,
                            'currentKey' => $p,
                            'currentValue' => $pV,
                            'pid' => $pid,
                        ];
                        foreach ($expandParametersHooks as $_funcRef) {
                            GeneralUtility::callUserFunction($_funcRef, $_params, $this);
                        }
                    }
                }

                // Make unique set of values and sort array by key:
                $paramArray[$p] = array_unique($paramArray[$p]);
                ksort($paramArray);
            } else {
                // Set the literal value as only value in array:
                $paramArray[$p] = [$v];
            }
        }

        return $paramArray;
    }
882
883
    /**
     * Compiling URLs from parameter array (output of expandParameters())
     * The number of URLs will be the multiplication of the number of parameter values for each key,
     * capped per recursion level at getMaximumUrlsToCompile().
     *
     * @param array $paramArray Output of expandParameters(): Array with keys (GET var names) and for each an array of values
     * @param array $urls URLs accumulated in this array (for recursion)
     * @return array
     */
    public function compileUrls($paramArray, array $urls)
    {
        if (empty($paramArray)) {
            return $urls;
        }

        // Take the first parameter off the array. reset() guarantees that key() refers to the
        // first element, and unset() is used instead of array_shift() because array_shift()
        // renumbers integer keys, which would rename numeric GET variables in the recursion.
        $valueSet = reset($paramArray);
        $varName = key($paramArray);
        unset($paramArray[$varName]);

        // Numeric GET var names are integer array keys; rawurlencode() needs a string under strict_types.
        $encodedVarName = rawurlencode((string) $varName);
        $maximumUrls = $this->getMaximumUrlsToCompile();

        // Traverse value set:
        $newUrls = [];
        foreach ($urls as $url) {
            foreach ($valueSet as $val) {
                if (count($newUrls) >= $maximumUrls) {
                    // Limit reached: no further combination can be added on this level.
                    break 2;
                }
                $value = (string) $val;
                // Empty values do not add a GET parameter, the URL is kept as it is.
                $newUrls[] = $url . ($value !== '' ? '&' . $encodedVarName . '=' . rawurlencode($value) : '');
            }
        }

        return $this->compileUrls($paramArray, $newUrls);
    }
910
911
    /************************************
912
     *
913
     * Crawler log
914
     *
915
     ************************************/
916
917
    /**
     * Return array of records from crawler queue for input page ID
     *
     * @param int $id Page ID for which to look up log entries.
     * @param QueueFilter $queueFilter Filter compared against "pending" (exec_time = 0) and "finished" (exec_time > 0); any other value selects all entries
     * @param bool $doFlush If TRUE, the queue is flushed through the queue repository (using $queueFilter) before the entries are selected
     * @param bool $doFullFlush Not used by this method, kept for backward compatibility
     * @param int $itemsPerPage Limit the amount of entries per page, default is 10; values of 0 or less disable the limit
     * @return array
     *
     * @deprecated
     */
    public function getLogEntriesForPageId($id, QueueFilter $queueFilter, $doFlush = false, $doFullFlush = false, $itemsPerPage = 10)
    {
        $queryBuilder = GeneralUtility::makeInstance(ConnectionPool::class)->getQueryBuilderForTable($this->tableName);
        $queryBuilder
            ->select('*')
            ->from($this->tableName)
            ->where(
                $queryBuilder->expr()->eq('page_id', $queryBuilder->createNamedParameter($id, PDO::PARAM_INT))
            )
            ->orderBy('scheduled', 'DESC');

        // Narrow the selection down according to the requested filter.
        switch ($queueFilter) {
            case 'pending':
                $queryBuilder->andWhere($queryBuilder->expr()->eq('exec_time', 0));
                break;
            case 'finished':
                $queryBuilder->andWhere($queryBuilder->expr()->gt('exec_time', 0));
                break;
        }

        if ($doFlush) {
            $this->queueRepository->flushQueue($queueFilter);
        }
        if ($itemsPerPage > 0) {
            $queryBuilder
                ->setMaxResults((int) $itemsPerPage);
        }

        return $queryBuilder->execute()->fetchAll();
    }
965
966
    /**
     * Return array of records from crawler queue for input set ID
     *
     * @param int $set_id Set ID for which to look up log entries.
     * @param string $filter Filter: "all" => all entries, "pending" => all that is not yet run, "finished" => all complete ones
     * @param bool $doFlush If TRUE, the matching entries are flushed instead of selected and an empty array is returned
     * @param bool $doFullFlush If TRUE (together with $doFlush), the flush is done without any constraint
     * @param int $itemsPerPage Limit the amount of entries per page default is 10
     * @return array
     *
     * @deprecated
     */
    public function getLogEntriesForSetId(int $set_id, string $filter = '', bool $doFlush = false, bool $doFullFlush = false, int $itemsPerPage = 10)
    {
        // Query used when the entries are selected.
        $selectQuery = GeneralUtility::makeInstance(ConnectionPool::class)->getQueryBuilderForTable($this->tableName);
        $selectQuery
            ->select('*')
            ->from($this->tableName)
            ->where(
                $selectQuery->expr()->eq('set_id', $selectQuery->createNamedParameter($set_id, PDO::PARAM_INT))
            )
            ->orderBy('scheduled', 'DESC');

        // AND-combined constraints used when the entries are flushed; the composite
        // expression collects every constraint added below.
        $expressionBuilder = GeneralUtility::makeInstance(ConnectionPool::class)
            ->getConnectionForTable($this->tableName)
            ->getExpressionBuilder();
        $flushConstraints = $expressionBuilder->andX();

        if ($filter === 'pending') {
            $selectQuery->andWhere($selectQuery->expr()->eq('exec_time', 0));
            $flushConstraints->add($expressionBuilder->eq('exec_time', 0));
        } elseif ($filter === 'finished') {
            $selectQuery->andWhere($selectQuery->expr()->gt('exec_time', 0));
            $flushConstraints->add($expressionBuilder->gt('exec_time', 0));
        }

        if ($doFlush) {
            $addWhere = $flushConstraints->add($expressionBuilder->eq('set_id', (int) $set_id));
            $this->flushQueue($doFullFlush ? '' : $addWhere);

            return [];
        }

        if ($itemsPerPage > 0) {
            $selectQuery->setMaxResults((int) $itemsPerPage);
        }

        return $selectQuery->execute()->fetchAll();
    }
1018
1019
    /**
     * Adds a call back entry to the crawler queue (typically called from hooks,
     * see indexed search class "class.crawler.php").
     *
     * @param int $setId Set ID
     * @param array $params Parameters to pass to call back function; anything but an array is replaced by an empty array
     * @param string $callBack Call back object reference, eg. 'EXT:indexed_search/class.crawler.php:&tx_indexedsearch_crawler'
     * @param int $page_id Page ID to attach it to
     * @param int $schedule Time at which to activate; the current time is used when this is 0
     */
    public function addQueueEntry_callBack($setId, $params, $callBack, $page_id = 0, $schedule = 0): void
    {
        $callBackParameters = is_array($params) ? $params : [];
        $callBackParameters['_CALLBACKOBJ'] = $callBack;

        $queueConnection = GeneralUtility::makeInstance(ConnectionPool::class)
            ->getConnectionForTable('tx_crawler_queue');

        $queueRecord = [
            'page_id' => (int) $page_id,
            'parameters' => json_encode($callBackParameters),
            'scheduled' => (int) $schedule ?: $this->getCurrentTime(),
            'exec_time' => 0,
            'set_id' => (int) $setId,
            'result_data' => '',
        ];

        $queueConnection->insert('tx_crawler_queue', $queueRecord);
    }
1048
1049
    /************************************
1050
     *
1051
     * URL setting
1052
     *
1053
     ************************************/
1054
1055
    /**
     * Puts a URL into the crawler queue.
     *
     * When queue entries are only registered internally the entry is collected in
     * $this->queueEntries and nothing is written to the database. Otherwise the entry is
     * inserted unless an equal entry already exists; a signal is emitted in both cases.
     *
     * @param int $id Page ID
     * @param string $url Complete URL
     * @param array $subCfg Sub configuration array (from TS config)
     * @param int $tstamp Scheduled-time
     * @param string $configurationHash (optional) configuration hash
     * @param bool $skipInnerDuplicationCheck (optional) skip inner duplication check
     * @return bool TRUE only if a new row has been inserted into the queue table
     */
    public function addUrl(
        $id,
        $url,
        array $subCfg,
        $tstamp,
        $configurationHash = '',
        $skipInnerDuplicationCheck = false
    ) {
        // Creating parameters:
        $parameters = ['url' => $url];

        // fe user group simulation:
        $feUserGroupList = implode(',', array_unique(GeneralUtility::intExplode(',', $subCfg['userGroups'], true)));
        if ($feUserGroupList) {
            $parameters['feUserGroupList'] = $feUserGroupList;
        }

        // Setting processing instructions
        $parameters['procInstructions'] = GeneralUtility::trimExplode(',', $subCfg['procInstrFilter']);
        if (is_array($subCfg['procInstrParams.'])) {
            $parameters['procInstrParams'] = $subCfg['procInstrParams.'];
        }

        // Compile value array:
        $serializedParameters = json_encode($parameters);
        $fieldArray = [
            'page_id' => (int) $id,
            'parameters' => $serializedParameters,
            'parameters_hash' => GeneralUtility::shortMD5($serializedParameters),
            'configuration_hash' => $configurationHash,
            'scheduled' => $tstamp,
            'exec_time' => 0,
            'set_id' => (int) $this->setID,
            'result_data' => '',
            'configuration' => $subCfg['key'],
        ];

        if ($this->registerQueueEntriesInternallyOnly) {
            // the entries will only be registered and not stored to the database
            $this->queueEntries[] = $fieldArray;

            return false;
        }

        $duplicates = [];
        if (! $skipInnerDuplicationCheck) {
            // check if there is already an equal entry
            $duplicates = $this->queueRepository->getDuplicateQueueItemsIfExists(
                (bool) $this->extensionSettings['enableTimeslot'],
                $tstamp,
                $this->getCurrentTime(),
                $fieldArray['page_id'],
                $fieldArray['parameters_hash']
            );
        }

        if (! empty($duplicates)) {
            // An equal entry exists: only announce the duplicate.
            SignalSlotUtility::emitSignal(
                self::class,
                SignalSlotUtility::SIGNAL_DUPLICATE_URL_IN_QUEUE,
                ['rows' => $duplicates, 'fieldArray' => $fieldArray]
            );

            return false;
        }

        $queueConnection = GeneralUtility::makeInstance(ConnectionPool::class)->getConnectionForTable('tx_crawler_queue');
        $queueConnection->insert('tx_crawler_queue', $fieldArray);
        $uid = $queueConnection->lastInsertId('tx_crawler_queue', 'qid');

        SignalSlotUtility::emitSignal(
            self::class,
            SignalSlotUtility::SIGNAL_URL_ADDED_TO_QUEUE,
            ['uid' => $uid, 'fieldArray' => $fieldArray]
        );

        return true;
    }
1151
1152
    /**
     * Provides the current system time as Unix timestamp.
     *
     * @return int
     */
    public function getCurrentTime()
    {
        $currentTimestamp = time();

        return $currentTimestamp;
    }
1161
1162
    /************************************
1163
     *
1164
     * URL reading
1165
     *
1166
     ************************************/
1167
1168
    /**
     * Read URL for single queue entry
     *
     * The entry is locked by setting its exec_time, executed through the queue executor and
     * finally updated with the JSON encoded result. Signals are emitted before and after processing.
     *
     * @param int $queueId
     * @param bool $force If set, will process even if exec_time has been set!
     * @return int|null Bit mask, CLI_STATUS_POLLABLE_PROCESSED is set when a registered pollable
     *                  instruction reported success; NULL when no matching queue entry was found
     */
    public function readUrl($queueId, $force = false)
    {
        $queryBuilder = GeneralUtility::makeInstance(ConnectionPool::class)->getQueryBuilderForTable($this->tableName);
        $ret = 0;
        $this->logger->debug('crawler-readurl start ' . microtime(true));

        // Get entry:
        $queryBuilder
            ->select('*')
            ->from('tx_crawler_queue')
            ->where(
                $queryBuilder->expr()->eq('qid', $queryBuilder->createNamedParameter($queueId, PDO::PARAM_INT))
            );
        if (! $force) {
            // Only entries that are scheduled for a process and have not been executed yet.
            $queryBuilder
                ->andWhere('exec_time = 0')
                ->andWhere('process_scheduled > 0');
        }
        $queueRec = $queryBuilder->execute()->fetch();

        if (! is_array($queueRec)) {
            // Nothing to process. NULL is returned here (not 0), callers may rely on that.
            return;
        }

        SignalSlotUtility::emitSignal(
            self::class,
            SignalSlotUtility::SIGNAL_QUEUEITEM_PREPROCESS,
            [$queueId, &$queueRec]
        );

        // Set exec_time to lock record:
        $field_array = ['exec_time' => $this->getCurrentTime()];

        if (isset($this->processID)) {
            //if multiprocessing is used we need to store the id of the process which has handled this entry
            $field_array['process_id_completed'] = $this->processID;
        }

        GeneralUtility::makeInstance(ConnectionPool::class)->getConnectionForTable('tx_crawler_queue')
            ->update(
                'tx_crawler_queue',
                $field_array,
                ['qid' => (int) $queueId]
            );

        $result = $this->queueExecutor->executeQueueItem($queueRec, $this);
        if (($result['content'] ?? null) === null) {
            $resultData = 'An errors happened';
        } else {
            /** @var JsonCompatibilityConverter $jsonCompatibilityConverter */
            $jsonCompatibilityConverter = GeneralUtility::makeInstance(JsonCompatibilityConverter::class);
            $resultData = $jsonCompatibilityConverter->convert($result['content']);
        }

        //atm there's no need to point to specific pollable extensions
        $pollables = $GLOBALS['TYPO3_CONF_VARS']['EXTCONF']['crawler']['pollSuccess'] ?? null;
        // $resultData is a plain string in the error case above, so it must not be indexed like an array.
        $procInstructions = is_array($resultData) ? ($resultData['parameters']['procInstructions'] ?? null) : null;
        if (is_array($pollables) && is_array($procInstructions)) {
            foreach ($pollables as $pollable) {
                // only check the success value if the instruction is running
                // it is important to name the pollSuccess key same as the procInstructions key
                if (in_array($pollable, $procInstructions, true) && ! empty($resultData['success'][$pollable])) {
                    $ret |= self::CLI_STATUS_POLLABLE_PROCESSED;
                }
            }
        }

        // Set result in log which also denotes the end of the processing of this entry.
        $field_array = ['result_data' => json_encode($result)];

        SignalSlotUtility::emitSignal(
            self::class,
            SignalSlotUtility::SIGNAL_QUEUEITEM_POSTPROCESS,
            [$queueId, &$field_array]
        );

        GeneralUtility::makeInstance(ConnectionPool::class)->getConnectionForTable('tx_crawler_queue')
            ->update(
                'tx_crawler_queue',
                $field_array,
                ['qid' => (int) $queueId]
            );

        $this->logger->debug('crawler-readurl stop ' . microtime(true));
        return $ret;
    }
1265
1266
    /**
     * Executes a queue entry that has not been stored yet: the entry is inserted with its
     * exec_time set, executed through the queue executor and updated with the JSON encoded result.
     *
     * @param array $field_array Queue field array,
     *
     * @return array|bool|mixed|string Result as returned by the queue executor
     */
    public function readUrlFromArray($field_array)
    {
        // Set exec_time to lock record:
        $field_array['exec_time'] = $this->getCurrentTime();

        $queueConnection = GeneralUtility::makeInstance(ConnectionPool::class)->getConnectionForTable($this->tableName);
        $queueConnection->insert($this->tableName, $field_array);
        $queueId = $queueConnection->lastInsertId($this->tableName, 'qid');

        $result = $this->queueExecutor->executeQueueItem($field_array, $this);

        // Set result in log which also denotes the end of the processing of this entry.
        // The array is handed over by reference so that slots may adjust what gets stored.
        $resultFields = ['result_data' => json_encode($result)];

        SignalSlotUtility::emitSignal(
            self::class,
            SignalSlotUtility::SIGNAL_QUEUEITEM_POSTPROCESS,
            [$queueId, &$resultFields]
        );

        $queueConnection->update($this->tableName, $resultFields, ['qid' => $queueId]);

        return $result;
    }
1302
1303
    /*****************************
1304
     *
1305
     * Compiling URLs to crawl - tools
1306
     *
1307
     *****************************/
1308
1309
    /**
     * Walks the page tree starting at $id and compiles the URL rows for every page found.
     *
     * The arguments are stored on the controller first, because
     * drawURLs_addRowsForPage() reads them from there while the rows are built.
     * Pages of type "mount point" additionally contribute the rows of the mounted tree.
     *
     * @param int $id Root page id to start from.
     * @param int $depth Depth of tree, 0=only id-page, 1= on sublevel, 99 = infinite
     * @param int $scheduledTime Unix Time when the URL is timed to be visited when put in queue
     * @param int $reqMinute Number of requests per minute (creates the interleave between requests)
     * @param bool $submitCrawlUrls If set, submits the URLs to queue in database (real crawling)
     * @param bool $downloadCrawlUrls If set (and submitcrawlUrls is false) will fill $downloadUrls with entries)
     * @param array $incomingProcInstructions Array of processing instructions
     * @param array $configurationSelection Array of configuration keys
     * @return string Concatenated HTML table rows as produced by drawURLs_addRowsForPage()
     */
    public function getPageTreeAndUrls(
        $id,
        $depth,
        $scheduledTime,
        $reqMinute,
        $submitCrawlUrls,
        $downloadCrawlUrls,
        array $incomingProcInstructions,
        array $configurationSelection
    ) {
        $this->scheduledTime = $scheduledTime;
        $this->reqMinute = $reqMinute;
        $this->submitCrawlUrls = $submitCrawlUrls;
        $this->downloadCrawlUrls = $downloadCrawlUrls;
        $this->incomingProcInstructions = $incomingProcInstructions;
        $this->incomingConfigurationSelection = $configurationSelection;

        // Reset the per-run accumulators.
        $this->duplicateTrack = [];
        $this->downloadUrls = [];

        // Drawing tree:
        /* @var PageTreeView $tree */
        $tree = GeneralUtility::makeInstance(PageTreeView::class);
        $perms_clause = $this->getBackendUser()->getPagePermsClause(Permission::PAGE_SHOW);
        $tree->init('AND ' . $perms_clause);

        $pageInfo = BackendUtility::readPageAccess($id, $perms_clause);
        if (is_array($pageInfo)) {
            // Set root row:
            $tree->tree[] = [
                'row' => $pageInfo,
                'HTML' => $this->iconFactory->getIconForRecord('pages', $pageInfo, Icon::SIZE_SMALL),
            ];
        }

        // Get branch beneath:
        if ($depth) {
            $tree->getTree($id, $depth, '');
        }

        // Traverse page tree:
        $code = '';

        foreach ($tree->tree as $data) {
            $this->MP = false;

            // Recognize mount points. The doktype is cast because database drivers may
            // deliver numeric columns as strings, which would never be identical (===)
            // to the integer constant.
            if ((int) ($data['row']['doktype'] ?? 0) === PageRepository::DOKTYPE_MOUNTPOINT) {
                // NOTE(review): the result is accessed as $mountpage[0][...] below, i.e. as a
                // list of rows - confirm that getPage() really returns that shape.
                $mountpage = $this->pageRepository->getPage($data['row']['uid']);

                // fetch mounted pages
                $this->MP = $mountpage[0]['mount_pid'] . '-' . $data['row']['uid'];

                $mountTree = GeneralUtility::makeInstance(PageTreeView::class);
                $mountTree->init('AND ' . $perms_clause);
                $mountTree->getTree($mountpage[0]['mount_pid'], $depth);

                foreach ($mountTree->tree as $mountData) {
                    $code .= $this->drawURLs_addRowsForPage(
                        $mountData['row'],
                        $mountData['HTML'] . BackendUtility::getRecordTitle('pages', $mountData['row'], true)
                    );
                }

                // replace page when mount_pid_ol is enabled
                if ($mountpage[0]['mount_pid_ol']) {
                    $data['row']['uid'] = $mountpage[0]['mount_pid'];
                } else {
                    // if the mount_pid_ol is not set the MP must not be used for the mountpoint page
                    $this->MP = false;
                }
            }

            $code .= $this->drawURLs_addRowsForPage(
                $data['row'],
                $data['HTML'] . BackendUtility::getRecordTitle('pages', $data['row'], true)
            );
        }

        return $code;
    }
1401
1402
    /**
     * Expands an exclude string into the list of page ids it covers.
     *
     * The string is a comma separated list of entries of the form "pid", "pid+" or
     * "pid+depth": a plain pid excludes only that page, "pid+" uses a depth of 99 and
     * "pid+depth" includes the sub pages down to the given depth.
     * Results (and the page trees fetched for them) are memoised in static caches.
     *
     * @param string $excludeString Exclude string
     * @return array Unique page ids (integers)
     */
    public function expandExcludeString($excludeString)
    {
        // internal static caches;
        static $expandedExcludeStringCache;
        static $treeCache;

        if (empty($expandedExcludeStringCache[$excludeString])) {
            $pidList = [];

            if (! empty($excludeString)) {
                /** @var PageTreeView $tree */
                $tree = GeneralUtility::makeInstance(PageTreeView::class);
                $tree->init('AND ' . $this->getBackendUser()->getPagePermsClause(Permission::PAGE_SHOW));

                $excludeParts = GeneralUtility::trimExplode(',', $excludeString);

                foreach ($excludeParts as $excludePart) {
                    // An entry without "+" yields a single part only, so the depth part
                    // must not be read unconditionally.
                    $parts = GeneralUtility::trimExplode('+', $excludePart);
                    $pid = (int) ($parts[0] ?? 0);
                    $rawDepth = $parts[1] ?? '';

                    if (empty($rawDepth)) {
                        // default is "page only" = "depth=0"; a trailing "+" means "all levels"
                        $depth = strpos($excludePart, '+') !== false ? 99 : 0;
                    } else {
                        $depth = (int) $rawDepth;
                    }

                    $pidList[] = $pid;

                    if ($depth > 0) {
                        if (empty($treeCache[$pid][$depth])) {
                            $tree->reset();
                            $tree->getTree($pid, $depth);
                            $treeCache[$pid][$depth] = $tree->tree;
                        }

                        foreach ($treeCache[$pid][$depth] as $data) {
                            $pidList[] = (int) $data['row']['uid'];
                        }
                    }
                }
            }

            $expandedExcludeStringCache[$excludeString] = array_unique($pidList);
        }

        return $expandedExcludeStringCache[$excludeString];
    }
1453
1454
    /**
     * Create the rows for display of the page tree.
     *
     * For each page one table row per crawler configuration is produced, showing the
     * parsed and expanded GET parameters, the resulting URLs and the configuration
     * options. Pages without any configuration, or excluded by a configuration, get a
     * "No entries" row instead.
     *
     * @param array $pageRow Page record the rows are built for
     * @param string $pageTitle Markup for the title column; inserted as-is, without escaping
     * @return string HTML table rows
     */
    public function drawURLs_addRowsForPage(array $pageRow, string $pageTitle): string
    {
        $skipMessage = '';

        // Get list of configurations
        $configurations = $this->getUrlsForPageRow($pageRow, $skipMessage);
        $configurations = ConfigurationService::removeDisallowedConfigurations($this->incomingConfigurationSelection, $configurations);

        // Traverse parameter combinations:
        $isFirstRow = true;
        $content = '';
        if (! empty($configurations)) {
            foreach ($configurations as $confKey => $confArray) {
                // Title column: only the first row carries it, spanning all configurations.
                $titleClm = $isFirstRow
                    ? '<td rowspan="' . count($configurations) . '">' . $pageTitle . '</td>'
                    : '';

                // expandExcludeString() returns integers, so the uid has to be an integer
                // as well for the strict comparison to be able to match.
                $excludedPageIds = $this->expandExcludeString($confArray['subCfg']['exclude'] ?? '');
                if (! in_array((int) $pageRow['uid'], $excludedPageIds, true)) {
                    // URL list:
                    $urlList = $this->urlListFromUrlArray(
                        $confArray,
                        $pageRow,
                        $this->scheduledTime,
                        $this->reqMinute,
                        $this->submitCrawlUrls,
                        $this->downloadCrawlUrls,
                        $this->duplicateTrack,
                        $this->downloadUrls,
                        // if empty the urls won't be filtered by processing instructions
                        $this->incomingProcInstructions
                    );

                    // Expanded parameters:
                    $paramExpanded = '';
                    $calcAccu = [];
                    $calcRes = 1;
                    foreach ($confArray['paramExpanded'] as $gVar => $gVal) {
                        $valueCount = count($gVal);
                        $paramExpanded .= '
                            <tr>
                                <td>' . htmlspecialchars('&' . $gVar . '=') . '<br/>' .
                            '(' . $valueCount . ')' .
                            '</td>
                                <td nowrap="nowrap">' . nl2br(htmlspecialchars(implode(chr(10), $gVal))) . '</td>
                            </tr>
                        ';
                        $calcRes *= $valueCount;
                        $calcAccu[] = $valueCount;
                    }
                    $paramExpanded = '<table>' . $paramExpanded . '</table>';
                    $paramExpanded .= 'Comb: ' . implode('*', $calcAccu) . '=' . $calcRes;

                    // Options (both keys are optional in the sub configuration)
                    $optionValues = '';
                    if (! empty($confArray['subCfg']['userGroups'])) {
                        $optionValues .= 'User Groups: ' . $confArray['subCfg']['userGroups'] . '<br/>';
                    }
                    if (! empty($confArray['subCfg']['procInstrFilter'])) {
                        $optionValues .= 'ProcInstr: ' . $confArray['subCfg']['procInstrFilter'] . '<br/>';
                    }

                    // Compile row:
                    $content .= '
                        <tr>
                            ' . $titleClm . '
                            <td>' . htmlspecialchars($confKey) . '</td>
                            <td>' . nl2br(htmlspecialchars(rawurldecode(trim(str_replace('&', chr(10) . '&', GeneralUtility::implodeArrayForUrl('', $confArray['paramParsed'])))))) . '</td>
                            <td>' . $paramExpanded . '</td>
                            <td nowrap="nowrap">' . $urlList . '</td>
                            <td nowrap="nowrap">' . $optionValues . '</td>
                            <td nowrap="nowrap">' . DebugUtility::viewArray($confArray['subCfg']['procInstrParams.'] ?? null) . '</td>
                        </tr>';
                } else {
                    $content .= '<tr>
                            ' . $titleClm . '
                            <td>' . htmlspecialchars($confKey) . '</td>
                            <td colspan="5"><em>No entries</em> (Page is excluded in this configuration)</td>
                        </tr>';
                }

                $isFirstRow = false;
            }
        } else {
            $message = ! empty($skipMessage) ? ' (' . $skipMessage . ')' : '';

            // Compile row:
            $content .= '
                <tr>
                    <td>' . $pageTitle . '</td>
                    <td colspan="6"><em>No entries</em>' . $message . '</td>
                </tr>';
        }

        return $content;
    }
1557
1558
    /*****************************
1559
     *
1560
     * CLI functions
1561
     *
1562
     *****************************/
1563
1564
    /**
     * Running the functionality of the CLI (crawling URLs from queue).
     *
     * Fetches up to $countInARun queue entries, assigns them to the current process
     * and reads them one after the other.
     *
     * @param int $countInARun Maximum number of queue entries to fetch for this run
     * @param int $sleepTime Pause after each processed entry, passed to usleep()
     * @param int $sleepAfterFinish Pause after the whole batch, passed to sleep()
     * @return int Bitmask built from the readUrl() results and the CLI_STATUS_* constants
     */
    public function CLI_run(int $countInARun, int $sleepTime, int $sleepAfterFinish): int
    {
        $result = 0;
        $counter = 0;

        // First, run hooks:
        $hookReferences = $GLOBALS['TYPO3_CONF_VARS']['EXTCONF']['crawler']['cli_hooks'] ?? [];
        foreach ($hookReferences as $objRef) {
            trigger_error(
                'This hook (crawler/cli_hooks) is deprecated since 9.1.5 and will be removed when dropping support for TYPO3 9LTS and 10LTS',
                E_USER_DEPRECATED
            );
            $hookObj = GeneralUtility::makeInstance($objRef);
            if (is_object($hookObj)) {
                $hookObj->crawler_init($this);
            }
        }

        // Clean up the queue
        $this->queueRepository->cleanupQueue();

        // Select entries:
        $rows = $this->queueRepository->fetchRecordsToBeCrawled($countInARun);

        if (empty($rows)) {
            $this->CLI_debug('Nothing within queue which needs to be processed (' . $this->CLI_buildProcessId() . ')');

            // Nothing was read, so no status flag is added.
            return $result;
        }

        $quidList = [];
        foreach ($rows as $row) {
            $quidList[] = $row['qid'];
        }

        $processId = $this->CLI_buildProcessId();

        // save the number of assigned queue entries to determine how many have been processed later
        $numberOfAffectedRows = $this->queueRepository->updateProcessIdAndSchedulerForQueueIds($quidList, $processId);
        $this->processRepository->updateProcessAssignItemsCount($numberOfAffectedRows, $processId);

        // Fewer rows updated than selected: treated as a collision with another process.
        if ($numberOfAffectedRows !== count($quidList)) {
            $this->CLI_debug('Nothing processed due to multi-process collision (' . $this->CLI_buildProcessId() . ')');
            return ($result | self::CLI_STATUS_ABORTED);
        }

        foreach ($rows as $row) {
            $result |= $this->readUrl($row['qid']);
            $counter++;

            // Just to relax the system
            usleep((int) $sleepTime);

            // if during the start and the current read url the cli has been disable we need to return from the function
            // mark the process NOT as ended.
            if ($this->crawler->isDisabled()) {
                return ($result | self::CLI_STATUS_ABORTED);
            }

            if (! $this->processRepository->isProcessActive($this->CLI_buildProcessId())) {
                $this->CLI_debug('conflict / timeout (' . $this->CLI_buildProcessId() . ')');
                $result |= self::CLI_STATUS_ABORTED;
                // possible timeout
                break;
            }
        }

        sleep((int) $sleepAfterFinish);

        $msg = 'Rows: ' . $counter;
        $this->CLI_debug($msg . ' (' . $this->CLI_buildProcessId() . ')');

        if ($counter > 0) {
            $result |= self::CLI_STATUS_PROCESSED;
        }

        return $result;
    }
1643
1644
    /**
     * Instantiates every class registered under the "crawler/cli_hooks" configuration
     * key and calls crawler_init() on it, passing this controller.
     *
     * @deprecated
     */
    public function CLI_runHooks(): void
    {
        $hookReferences = $GLOBALS['TYPO3_CONF_VARS']['EXTCONF']['crawler']['cli_hooks'] ?? [];

        foreach ($hookReferences as $objRef) {
            $hookObj = GeneralUtility::makeInstance($objRef);
            if (! is_object($hookObj)) {
                continue;
            }

            $hookObj->crawler_init($this);
        }
    }
1657
1658
    /**
     * Try to acquire a new process with the given id.
     *
     * A process row is only inserted while the number of active, not yet expired
     * processes is below the configured "processLimit". Independently of the outcome,
     * processes marked as deleted are removed and active processes whose ttl has
     * passed (orphans) are released.
     *
     * @param string $id identification string for the process
     * @return bool true when the process was registered, false when the limit is
     *              reached or the system process id could not be determined
     * @todo preemption might not be the most elegant way to clean up
     */
    public function CLI_checkAndAcquireNewProcess($id)
    {
        $systemProcessId = getmypid();
        if (! $systemProcessId) {
            return false;
        }

        $activeProcesses = $this->processRepository->findAllActive();
        $currentTime = $this->getCurrentTime();

        // Split the active processes into still running ones (counted) and expired ones (collected).
        $runningCount = 0;
        $expiredProcessIds = [];

        /** @var Process $process */
        foreach ($activeProcesses as $process) {
            if ($process->getTtl() < $currentTime) {
                $expiredProcessIds[] = $process->getProcessId();
                continue;
            }

            $runningCount++;
        }

        $processLimit = (int) $this->extensionSettings['processLimit'];
        $acquired = $runningCount < $processLimit;

        if ($acquired) {
            // if there are less than allowed active processes then add a new one
            $this->CLI_debug('add process ' . $this->CLI_buildProcessId() . ' (' . ($runningCount + 1) . '/' . $processLimit . ')');

            GeneralUtility::makeInstance(ConnectionPool::class)
                ->getConnectionForTable('tx_crawler_process')
                ->insert(
                    'tx_crawler_process',
                    [
                        'process_id' => $id,
                        'active' => 1,
                        'ttl' => $currentTime + (int) $this->extensionSettings['processMaxRunTime'],
                        'system_process_id' => $systemProcessId,
                    ]
                );
        } else {
            $this->CLI_debug('Processlimit reached (' . $runningCount . '/' . $processLimit . ')');
        }

        $this->processRepository->deleteProcessesMarkedAsDeleted();
        $this->CLI_releaseProcesses($expiredProcessIds);

        return $acquired;
    }
1712
1713
    /**
     * Release a process and the required resources.
     *
     * @param mixed $releaseIds string with a single process-id or array with multiple process-ids
     * @return bool false when there was nothing to release, true otherwise
     */
    public function CLI_releaseProcesses($releaseIds)
    {
        $releaseIds = is_array($releaseIds) ? $releaseIds : [$releaseIds];

        if (empty($releaseIds)) {
            // nothing to release
            return false;
        }

        $queryBuilder = GeneralUtility::makeInstance(ConnectionPool::class)->getQueryBuilderForTable($this->tableName);

        // some kind of 2nd chance algo - this way you need at least 2 processes to have a real cleanup
        // this ensures that a single process can't mess up the entire process table

        // Step 1: detach all queue entries that are still assigned to a process which is not active.
        $queryBuilder
            ->update($this->tableName, 'q')
            ->where(
                'q.process_id IN(SELECT p.process_id FROM tx_crawler_process as p WHERE p.active = 0)'
            )
            ->set('q.process_scheduled', 0)
            ->set('q.process_id', '')
            ->execute();

        // FIXME: Not entirely sure that this is equivalent to the previous version
        // The same builder is reused below, so the SET part of step 1 is dropped first.
        $queryBuilder->resetQueryPart('set');

        // Step 2: reset system_process_id of inactive processes that are referenced by
        // queue entries with exec_time = 0.
        $queryBuilder
            ->update('tx_crawler_process')
            ->where(
                $queryBuilder->expr()->eq('active', 0),
                'process_id IN(SELECT q.process_id FROM tx_crawler_queue as q WHERE q.exec_time = 0)'
            )
            ->set('system_process_id', 0)
            ->execute();

        // Step 3: handle the explicitly requested processes and their queue entries.
        $this->processRepository->markRequestedProcessesAsNotActive($releaseIds);
        $this->queueRepository->unsetProcessScheduledAndProcessIdForQueueEntries($releaseIds);

        return true;
    }
1763
1764
    /**
     * Returns the id of the current process.
     *
     * The id is generated on first use (short MD5 of the current microtime) and kept
     * in $this->processID, so later calls return the same value.
     *
     * @return string the ID
     */
    public function CLI_buildProcessId()
    {
        if ($this->processID) {
            return $this->processID;
        }

        $this->processID = GeneralUtility::shortMD5(microtime(true));

        return $this->processID;
    }
1776
1777
    /**
     * Prints a message, followed by a newline, to the output and flushes it.
     * Nothing is printed unless the "processDebug" extension setting is enabled.
     *
     * @param string $msg the message
     * @deprecated
     * @codeCoverageIgnore
     */
    public function CLI_debug($msg): void
    {
        $debugEnabled = (int) $this->extensionSettings['processDebug'];
        if (! $debugEnabled) {
            return;
        }

        echo $msg . "\n";
        flush();
    }
1791
1792
    /**
     * Cleans up entries that stayed for too long in the queue. These are:
     * - processed entries (exec_time set) older than "cleanUpProcessedAge" days
     * - entries scheduled at least "cleanUpScheduledAge" days ago
     *
     * Both ages are read from the extension settings.
     *
     * @deprecated
     */
    public function cleanUpOldQueueEntries(): void
    {
        // 24*60*60 Seconds in 24 hours
        $secondsPerDay = 86400;
        $maxProcessedAge = $this->extensionSettings['cleanUpProcessedAge'] * $secondsPerDay;
        $maxScheduledAge = $this->extensionSettings['cleanUpScheduledAge'] * $secondsPerDay;

        $now = time();
        $processedBefore = $now - $maxProcessedAge;
        $scheduledUntil = $now - $maxScheduledAge;

        $this->flushQueue(
            '(exec_time<>0 AND exec_time<' . $processedBefore . ') OR scheduled<=' . $scheduledUntil
        );
    }
1809
1810
    /**
     * Removes queue entries.
     *
     * Before the entries are deleted, SIGNAL_QUEUE_ENTRY_FLUSH is emitted once per
     * affected set_id with the matching entries (qid, set_id) as payload.
     *
     * Every statement gets its own query builder, so that no query parts or bound
     * parameters leak from one statement into the next.
     *
     * @param string $where SQL related filter for the entries which should be removed;
     *                      it is used as raw SQL, an empty value matches all entries
     *
     * @deprecated
     */
    protected function flushQueue($where = ''): void
    {
        $realWhere = strlen((string) $where) > 0 ? $where : '1=1';

        $groups = $this->getQueryBuilder($this->tableName)
            ->selectLiteral('DISTINCT set_id')
            ->from($this->tableName)
            ->where($realWhere)
            ->execute()
            ->fetchAll();

        foreach ($groups as $group) {
            $subSetQuery = $this->getQueryBuilder($this->tableName);
            $subSet = $subSetQuery
                ->select('qid', 'set_id')
                ->from($this->tableName)
                ->where(
                    $realWhere,
                    // bind the value instead of concatenating it into the SQL
                    $subSetQuery->expr()->eq(
                        'set_id',
                        $subSetQuery->createNamedParameter((int) $group['set_id'], \PDO::PARAM_INT)
                    )
                )
                ->execute()
                ->fetchAll();

            $payLoad = ['subSet' => $subSet];
            SignalSlotUtility::emitSignal(
                self::class,
                SignalSlotUtility::SIGNAL_QUEUE_ENTRY_FLUSH,
                $payLoad
            );
        }

        $this->getQueryBuilder($this->tableName)
            ->delete($this->tableName)
            ->where($realWhere)
            ->execute();
    }
1855
1856
    /**
     * This method determines duplicates for a queue entry with the same parameters and this timestamp.
     * If the timestamp is not in the future, it looks for unprocessed entries that are already due
     * (with "enableTimeslot" additionally those scheduled within 100 seconds around the current time).
     * If the timestamp is in the future, the queued entry must have exactly the same timestamp.
     *
     * @param int $tstamp
     * @param array $fieldArray Needs the keys "page_id" and "parameters_hash"
     *
     * @return array qids of the duplicate queue entries
     * @deprecated
     */
    protected function getDuplicateRowsIfExist($tstamp, $fieldArray)
    {
        $currentTime = $this->getCurrentTime();

        $queryBuilder = GeneralUtility::makeInstance(ConnectionPool::class)->getQueryBuilderForTable($this->tableName);
        $queryBuilder->select('qid')->from('tx_crawler_queue');

        if ($tstamp > $currentTime) {
            // entry with a timestamp in the future need to have the same schedule time
            $queryBuilder->where($queryBuilder->expr()->eq('scheduled', $tstamp));
        } elseif ($tstamp <= $currentTime) {
            // this entry is scheduled with "now"
            $alreadyDue = $queryBuilder->expr()->lte('scheduled', $currentTime);

            if ($this->extensionSettings['enableTimeslot']) {
                $timeBegin = $currentTime - 100;
                $timeEnd = $currentTime + 100;
                $queryBuilder
                    ->where('scheduled BETWEEN ' . $timeBegin . ' AND ' . $timeEnd)
                    ->orWhere($alreadyDue);
            } else {
                $queryBuilder->where($alreadyDue);
            }
        }

        // Only entries that are neither executed nor assigned to a process, for the same page and parameters.
        $pageId = $queryBuilder->createNamedParameter($fieldArray['page_id'], PDO::PARAM_INT);
        $parametersHash = $queryBuilder->createNamedParameter($fieldArray['parameters_hash'], PDO::PARAM_STR);
        $queryBuilder
            ->andWhere('NOT exec_time')
            ->andWhere('NOT process_id')
            ->andWhere($queryBuilder->expr()->eq('page_id', $pageId))
            ->andWhere($queryBuilder->expr()->eq('parameters_hash', $parametersHash));

        return array_column($queryBuilder->execute()->fetchAll(), 'qid');
    }
1917
1918
    /**
     * Computes an MD5 hash of the serialized configuration array.
     *
     * The "paramExpanded" and "URLs" entries are stripped before hashing, so two
     * configurations that differ only in those entries produce the same hash.
     *
     * @param array $configuration configuration to hash; the caller's array is not modified
     * @return string
     */
    protected function getConfigurationHash(array $configuration)
    {
        // Drop the entries that must not influence the hash.
        unset($configuration['paramExpanded'], $configuration['URLs']);

        $serializedConfiguration = serialize($configuration);

        return md5($serializedConfiguration);
    }
1929
1930
    /**
     * Builds a URL from a page id and a query string.
     *
     * This method only forwards its arguments to UrlService::getUrlFromPageAndQueryParameters(),
     * which performs the actual URL construction.
     *
     * @param int $httpsOrHttp see tx_crawler_configuration.force_ssl
     * @throws SiteNotFoundException
     * @throws InvalidRouteArgumentsException
     *
     * @deprecated Using CrawlerController::getUrlFromPageAndQueryParameters() is deprecated since 9.1.1 and will be removed in v11.x, please use UrlService->getUrlFromPageAndQueryParameters() instead.
     * @codeCoverageIgnore
     */
    protected function getUrlFromPageAndQueryParameters(int $pageId, string $queryString, ?string $alternativeBaseUrl, int $httpsOrHttp): UriInterface
    {
        return (new UrlService())->getUrlFromPageAndQueryParameters(
            $pageId,
            $queryString,
            $alternativeBaseUrl,
            $httpsOrHttp
        );
    }
1946
1947 1
    /**
     * Puts the elements at index 1 and 2 into ascending order.
     *
     * When $reg[1] is larger than $reg[2] the two values are exchanged; otherwise the
     * array is returned as given. All other elements are left untouched.
     *
     * @param array $reg array holding the two values to compare at index 1 and 2
     * @return array the array with $reg[1] not larger than $reg[2]
     */
    protected function swapIfFirstIsLargerThanSecond(array $reg): array
    {
        $firstIsLarger = $reg[1] > $reg[2];

        if ($firstIsLarger) {
            // Exchange both values in a single destructuring assignment.
            [$reg[1], $reg[2]] = [$reg[2], $reg[1]];
        }

        return $reg;
    }
1958
1959 6
    /**
     * Provides the value currently held in the maximumUrlsToCompile property.
     */
    private function getMaximumUrlsToCompile(): int
    {
        $maximum = $this->maximumUrlsToCompile;

        return $maximum;
    }
1963
1964
    /**
     * Provides the backend user, remembering it in $this->backendUser after the first lookup.
     *
     * Backend authentication is initialised on every call. If no user has been stored yet,
     * the one found in $GLOBALS['BE_USER'] is stored and returned.
     *
     * @return BackendUserAuthentication
     */
    private function getBackendUser()
    {
        // Make sure the _cli_ user is loaded
        Bootstrap::initializeBackendAuthentication();

        if ($this->backendUser !== null) {
            return $this->backendUser;
        }

        $this->backendUser = $GLOBALS['BE_USER'];

        return $this->backendUser;
    }
1976
1977
    /**
     * Obtains a query builder for the given table from the ConnectionPool.
     *
     * @param string $table name of the table the query builder is requested for
     * @return QueryBuilder
     */
    private function getQueryBuilder(string $table)
    {
        $connectionPool = GeneralUtility::makeInstance(ConnectionPool::class);

        return $connectionPool->getQueryBuilderForTable($table);
    }
1986
}
1987