Passed
Push — Cleanup/CrawlerController ( 956077...a99521 )
by Tomas Norre
24:13 queued 06:14
created

CrawlerController::checkIfPageShouldBeSkipped()   A

Complexity

Conditions 1
Paths 1

Size

Total Lines 4
Code Lines 2

Duplication

Lines 0
Ratio 0 %

Code Coverage

Tests 2
CRAP Score 1

Importance

Changes 1
Bugs 0 Features 0
Metric Value
cc 1
eloc 2
c 1
b 0
f 0
nc 1
nop 1
dl 0
loc 4
ccs 2
cts 2
cp 1
crap 1
rs 10
1
<?php
2
3
declare(strict_types=1);
4
5
namespace AOE\Crawler\Controller;
6
7
/***************************************************************
8
 *  Copyright notice
9
 *
10
 *  (c) 2020 AOE GmbH <[email protected]>
11
 *
12
 *  All rights reserved
13
 *
14
 *  This script is part of the TYPO3 project. The TYPO3 project is
15
 *  free software; you can redistribute it and/or modify
16
 *  it under the terms of the GNU General Public License as published by
17
 *  the Free Software Foundation; either version 3 of the License, or
18
 *  (at your option) any later version.
19
 *
20
 *  The GNU General Public License can be found at
21
 *  http://www.gnu.org/copyleft/gpl.html.
22
 *
23
 *  This script is distributed in the hope that it will be useful,
24
 *  but WITHOUT ANY WARRANTY; without even the implied warranty of
25
 *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
26
 *  GNU General Public License for more details.
27
 *
28
 *  This copyright notice MUST APPEAR in all copies of the script!
29
 ***************************************************************/
30
31
use AOE\Crawler\Configuration\ExtensionConfigurationProvider;
32
use AOE\Crawler\Converter\JsonCompatibilityConverter;
33
use AOE\Crawler\Crawler;
34
use AOE\Crawler\CrawlStrategy\CrawlStrategyFactory;
35
use AOE\Crawler\Domain\Model\Process;
36
use AOE\Crawler\Domain\Repository\ConfigurationRepository;
37
use AOE\Crawler\Domain\Repository\ProcessRepository;
38
use AOE\Crawler\Domain\Repository\QueueRepository;
39
use AOE\Crawler\QueueExecutor;
40
use AOE\Crawler\Service\ConfigurationService;
41
use AOE\Crawler\Service\PageService;
42
use AOE\Crawler\Service\UrlService;
43
use AOE\Crawler\Service\UserService;
44
use AOE\Crawler\Utility\SignalSlotUtility;
45
use AOE\Crawler\Value\QueueFilter;
46
use PDO;
47
use Psr\Http\Message\UriInterface;
48
use Psr\Log\LoggerAwareInterface;
49
use Psr\Log\LoggerAwareTrait;
50
use TYPO3\CMS\Backend\Tree\View\PageTreeView;
51
use TYPO3\CMS\Backend\Utility\BackendUtility;
52
use TYPO3\CMS\Core\Authentication\BackendUserAuthentication;
53
use TYPO3\CMS\Core\Compatibility\PublicMethodDeprecationTrait;
54
use TYPO3\CMS\Core\Compatibility\PublicPropertyDeprecationTrait;
55
use TYPO3\CMS\Core\Core\Bootstrap;
56
use TYPO3\CMS\Core\Core\Environment;
57
use TYPO3\CMS\Core\Database\Connection;
58
use TYPO3\CMS\Core\Database\ConnectionPool;
59
use TYPO3\CMS\Core\Database\Query\QueryBuilder;
60
use TYPO3\CMS\Core\Database\Query\Restriction\DeletedRestriction;
61
use TYPO3\CMS\Core\Database\QueryGenerator;
62
use TYPO3\CMS\Core\Domain\Repository\PageRepository;
63
use TYPO3\CMS\Core\Exception\SiteNotFoundException;
64
use TYPO3\CMS\Core\Imaging\Icon;
65
use TYPO3\CMS\Core\Imaging\IconFactory;
66
use TYPO3\CMS\Core\Routing\InvalidRouteArgumentsException;
67
use TYPO3\CMS\Core\Site\Entity\Site;
68
use TYPO3\CMS\Core\Type\Bitmask\Permission;
69
use TYPO3\CMS\Core\TypoScript\Parser\TypoScriptParser;
70
use TYPO3\CMS\Core\Utility\DebugUtility;
71
use TYPO3\CMS\Core\Utility\GeneralUtility;
72
use TYPO3\CMS\Core\Utility\MathUtility;
73
use TYPO3\CMS\Extbase\Object\ObjectManager;
74
75
/**
76
 * Class CrawlerController
77
 *
78
 * @package AOE\Crawler\Controller
79
 */
80
class CrawlerController implements LoggerAwareInterface
81
{
82
    use LoggerAwareTrait;
83
    use PublicMethodDeprecationTrait;
84
    use PublicPropertyDeprecationTrait;
85
86
    public const CLI_STATUS_NOTHING_PROCCESSED = 0;
87
88
    //queue not empty
89
    public const CLI_STATUS_REMAIN = 1;
90
91
    //(some) queue items were processed
92
    public const CLI_STATUS_PROCESSED = 2;
93
94
    //instance didn't finish
95
    public const CLI_STATUS_ABORTED = 4;
96
97
    public const CLI_STATUS_POLLABLE_PROCESSED = 8;
98
99
    /**
100
     * @var integer
101
     */
102
    public $setID = 0;
103
104
    /**
105
     * @var string
106
     */
107
    public $processID = '';
108
109
    /**
110
     * @var array
111
     */
112
    public $duplicateTrack = [];
113
114
    /**
115
     * @var array
116
     */
117
    public $downloadUrls = [];
118
119
    /**
120
     * @var array
121
     */
122
    public $incomingProcInstructions = [];
123
124
    /**
125
     * @var array
126
     */
127
    public $incomingConfigurationSelection = [];
128
129
    /**
130
     * @var bool
131
     */
132
    public $registerQueueEntriesInternallyOnly = false;
133
134
    /**
135
     * @var array
136
     */
137
    public $queueEntries = [];
138
139
    /**
140
     * @var array
141
     */
142
    public $urlList = [];
143
144
    /**
145
     * @var array
146
     */
147
    public $extensionSettings = [];
148
149
    /**
150
     * Mount Point
151
     *
152
     * @var bool
153
     * Todo: Check what this is used for and adjust the type hint or code, as bool doesn't match the current code.
154
     */
155
    public $MP = false;
156
157
    /**
158
     * @var string
159
     * @deprecated
160
     */
161
    protected $processFilename;
162
163
    /**
164
     * Holds the internal access mode can be 'gui','cli' or 'cli_im'
165
     *
166
     * @var string
167
     * @deprecated
168
     */
169
    protected $accessMode;
170
171
    /**
172
     * @var QueueRepository
173
     */
174
    protected $queueRepository;
175
176
    /**
177
     * @var ProcessRepository
178
     */
179
    protected $processRepository;
180
181
    /**
182
     * @var ConfigurationRepository
183
     */
184
    protected $configurationRepository;
185
186
    /**
187
     * @var string
188
     * @deprecated Since v9.2.5 - This will be removed in v10
189
     */
190
    protected $tableName = 'tx_crawler_queue';
191
192
    /**
193
     * @var QueueExecutor
194
     */
195
    protected $queueExecutor;
196
197
    /**
198
     * @var int
199
     */
200
    protected $maximumUrlsToCompile = 1;
201
202
    /**
203
     * @var IconFactory
204
     */
205
    protected $iconFactory;
206
207
    /**
     * Deprecation messages for formerly public methods, keyed by method name.
     *
     * This class uses PublicMethodDeprecationTrait.
     * NOTE(review): the trait presumably reads this property by name, which would explain
     * why static analysis reports it as unused — confirm before removing it.
     *
     * @var string[]
     */
    private $deprecatedPublicMethods = [
        'compileUrls' => 'Using CrawlerController->compileUrls() is deprecated since 9.2.5, and will be removed in v11.x',
        'cleanUpOldQueueEntries' => 'Using CrawlerController::cleanUpOldQueueEntries() is deprecated since 9.0.1 and will be removed in v11.x, please use QueueRepository->cleanUpOldQueueEntries() instead.',
        'CLI_debug' => 'Using CrawlerController->CLI_debug() is deprecated since 9.1.3 and will be removed in v11.x',
        'CLI_releaseProcesses' => 'Using CrawlerController->CLI_releaseProcesses() is deprecated since 9.2.2 and will be removed in v11.x',
        'CLI_runHooks' => 'Using CrawlerController->CLI_runHooks() is deprecated since 9.1.5 and will be removed in v11.x',
        'getAccessMode' => 'Using CrawlerController->getAccessMode() is deprecated since 9.1.3 and will be removed in v11.x',
        'getLogEntriesForPageId' => 'Using CrawlerController->getLogEntriesForPageId() is deprecated since 9.1.5 and will be remove in v11.x',
        'getLogEntriesForSetId' => 'Using crawlerController::getLogEntriesForSetId() is deprecated since 9.0.1 and will be removed in v11.x',
        // The message previously named getLogEntriesForPageId(), a copy-paste slip from the entry above.
        'hasGroupAccess' => 'Using CrawlerController->hasGroupAccess() is deprecated since 9.2.2 and will be remove in v11.x, please use UserService::hasGroupAccess() instead.',
        'flushQueue' => 'Using CrawlerController::flushQueue() is deprecated since 9.0.1 and will be removed in v11.x, please use QueueRepository->flushQueue() instead.',
        'setAccessMode' => 'Using CrawlerController->setAccessMode() is deprecated since 9.1.3 and will be removed in v11.x',
        'getDisabled' => 'Using CrawlerController->getDisabled() is deprecated since 9.1.3 and will be removed in v11.x, please use Crawler->isDisabled() instead',
        'setDisabled' => 'Using CrawlerController->setDisabled() is deprecated since 9.1.3 and will be removed in v11.x, please use Crawler->setDisabled() instead',
        'getProcessFilename' => 'Using CrawlerController->getProcessFilename() is deprecated since 9.1.3 and will be removed in v11.x',
        'setProcessFilename' => 'Using CrawlerController->setProcessFilename() is deprecated since 9.1.3 and will be removed in v11.x',
        'getDuplicateRowsIfExist' => 'Using CrawlerController->getDuplicateRowsIfExist() is deprecated since 9.1.4 and will be remove in v11.x, please use QueueRepository->getDuplicateQueueItemsIfExists() instead',
        'checkIfPageShouldBeSkipped' => 'Using CrawlerController->checkIfPageShouldBeSkipped() is deprecated since 9.2.5 and will be removed in v11.x',
        'swapIfFirstIsLargerThanSecond' => 'Using CrawlerController->swapIfFirstIsLargerThanSecond() is deprecated since 9.2.5, and will be removed in v11.x',
        'expandParameters' => 'Using CrawlerController->expandParameters() is deprecated since 9.2.5, and will be removed in v11.x',
    ];
231 41
232 41
    /**
     * Deprecation messages for formerly public properties, keyed by property name.
     *
     * This class uses PublicPropertyDeprecationTrait.
     * NOTE(review): the trait presumably reads this property by name — confirm.
     *
     * @var string[]
     */
    private $deprecatedPublicProperties = [
        'accessMode' => 'Using CrawlerController->accessMode is deprecated since 9.1.3 and will be removed in v11.x',
        // The message previously named ->accessMode, a copy-paste slip from the entry above.
        'processFilename' => 'Using CrawlerController->processFilename is deprecated since 9.1.3 and will be removed in v11.x',
    ];
239
240
    /**
241 41
     * @var BackendUserAuthentication|null
242 41
     */
243 41
    private $backendUser;
244
245
    /**
246 41
     * @var integer
247
     */
248
    private $scheduledTime = 0;
249
250 41
    /**
251 41
     * @var integer
252 41
     */
253
    private $reqMinute = 0;
254
255
    /**
256
     * @var bool
257
     */
258
    private $submitCrawlUrls = false;
259 1
260
    /**
261 1
     * @var bool
262
     */
263
    private $downloadCrawlUrls = false;
264
265
    /**
266
     * @var PageRepository
267 1
     */
268
    private $pageRepository;
269 1
270 1
    /**
271
     * @var Crawler
272
     */
273
    private $crawler;
274
275
    /**
276
     * @var ConfigurationService
277 2
     */
278
    private $configurationService;
279 2
280 1
    /**
281
     * @var UrlService
282 1
     */
283 1
    private $urlService;
284
285
    /************************************
286 2
     *
287
     * Getting URLs based on Page TSconfig
288
     *
289
     ************************************/
290
291
    /**
     * Instantiates the repositories and services used by the controller and
     * normalises the extension settings.
     *
     * Settings that are missing or zero fall back to defaults:
     * - countInARun: 100
     * - processLimit: 1 (clamped to 1-99)
     * - maxCompileUrls: 10000 (clamped to 1-1000000000)
     */
    public function __construct()
    {
        $objectManager = GeneralUtility::makeInstance(ObjectManager::class);
        $crawlStrategyFactory = GeneralUtility::makeInstance(CrawlStrategyFactory::class);
        $this->queueRepository = $objectManager->get(QueueRepository::class);
        $this->processRepository = $objectManager->get(ProcessRepository::class);
        $this->configurationRepository = $objectManager->get(ConfigurationRepository::class);
        $this->pageRepository = GeneralUtility::makeInstance(PageRepository::class);
        $this->queueExecutor = GeneralUtility::makeInstance(QueueExecutor::class, $crawlStrategyFactory);
        $this->iconFactory = GeneralUtility::makeInstance(IconFactory::class);
        $this->crawler = GeneralUtility::makeInstance(Crawler::class);
        $this->configurationService = GeneralUtility::makeInstance(ConfigurationService::class);
        $this->urlService = GeneralUtility::makeInstance(UrlService::class);

        $this->processFilename = Environment::getVarPath() . '/lock/tx_crawler.proc';

        /** @var ExtensionConfigurationProvider $configurationProvider */
        $configurationProvider = GeneralUtility::makeInstance(ExtensionConfigurationProvider::class);
        $settings = $configurationProvider->getExtensionConfiguration();
        $this->extensionSettings = is_array($settings) ? $settings : [];

        // The settings array may be empty (see the fallback above), so every key is read
        // with a default instead of triggering an undefined-index notice.
        if (MathUtility::convertToPositiveInteger($this->extensionSettings['countInARun'] ?? 0) === 0) {
            $this->extensionSettings['countInARun'] = 100;
        }

        $this->extensionSettings['processLimit'] = MathUtility::forceIntegerInRange(
            $this->extensionSettings['processLimit'] ?? 0,
            1,
            99,
            1
        );
        $this->setMaximumUrlsToCompile(
            MathUtility::forceIntegerInRange($this->extensionSettings['maxCompileUrls'] ?? 0, 1, 1000000000, 10000)
        );
    }
319 12
320 12
    /**
     * Stores the value kept in $maximumUrlsToCompile.
     */
    public function setMaximumUrlsToCompile(int $maximumUrlsToCompile): void
    {
        $this->maximumUrlsToCompile = $maximumUrlsToCompile;
    }
324
325
    /**
     * Returns the internal access mode; per the property documentation this
     * can be 'gui', 'cli' or 'cli_im'.
     *
     * @return string
     * @deprecated
     */
    public function getAccessMode()
    {
        $currentMode = $this->accessMode;

        return $currentMode;
    }
335 1
336 1
    /**
     * Replaces the internal access mode.
     *
     * @param string $accessMode New access mode; the property documentation lists 'gui', 'cli' and 'cli_im'
     * @deprecated
     */
    public function setAccessMode($accessMode): void
    {
        $this->accessMode = $accessMode;
    }
344
345
    /**
     * Toggles the disabled marker file.
     *
     * A truthy $disabled writes the text 'disabled' to the process file; otherwise an
     * existing process file is removed. getDisabled() reports the state by checking
     * whether that file exists.
     *
     * @deprecated
     */
    public function setDisabled(?bool $disabled = true): void
    {
        $markerFile = $this->processFilename;

        if ($disabled) {
            GeneralUtility::writeFile($markerFile, 'disabled');
            return;
        }

        // Nothing to clean up when the marker was never written.
        if (is_file($markerFile)) {
            unlink($markerFile);
        }
    }
357 8
358
    /**
     * Reports the disabled status: true when the process file exists.
     *
     * @deprecated
     */
    public function getDisabled(): bool
    {
        $markerFile = $this->processFilename;

        return is_file($markerFile);
    }
366
367
    /**
     * Overrides the path of the process file used by setDisabled() and getDisabled().
     *
     * @param string $filenameWithPath File name including its path
     * @deprecated
     */
    public function setProcessFilename($filenameWithPath): void
    {
        $this->processFilename = $filenameWithPath;
    }
375
376
    /**
     * Returns the path of the process file.
     *
     * @return string
     * @deprecated
     */
    public function getProcessFilename()
    {
        $markerFile = $this->processFilename;

        return $markerFile;
    }
384
385
    /**
     * Replaces the extension settings as a whole
     * (unserialized pendant of $TYPO3_CONF_VARS['EXT']['extConf']['crawler']).
     *
     * The given array is stored as-is; the defaults applied in the constructor are not re-applied here.
     */
    public function setExtensionSettings(array $extensionSettings): void
    {
        $this->extensionSettings = $extensionSettings;
    }
392 4
393
    /**
     * Delegates to PageService::checkIfPageShouldBeSkipped() to decide whether a page is crawled.
     *
     * @return false|string false if the page should be crawled (not excluded), true / skipMessage if it should be skipped
     * @deprecated
     */
    public function checkIfPageShouldBeSkipped(array $pageRow)
    {
        return GeneralUtility::makeInstance(PageService::class)->checkIfPageShouldBeSkipped($pageRow);
    }
404
405
    /**
     * Wrapper around getUrlsForPageId() that first validates the page row.
     * It returns an array of configurations and no urls!
     *
     * $skipMessage is always written: it holds the reason when the page is rejected
     * and is reset to an empty string when configurations are returned.
     *
     * @param array $pageRow Page record with at least dok-type and uid columns.
     * @param string $skipMessage Receives the skip reason by reference
     * @return array Configurations for the page, or an empty array when the page is skipped
     * @see getUrlsForPageId()
     */
    public function getUrlsForPageRow(array $pageRow, &$skipMessage = '')
    {
        // The uid must be a real integer; numeric strings are rejected as well.
        if (! is_int($pageRow['uid'])) {
            $skipMessage = 'PageUid ' . $pageRow['uid'] . ' was not an integer';
            return [];
        }

        $skipReason = $this->getPageService()->checkIfPageShouldBeSkipped($pageRow);
        if ($skipReason !== false) {
            $skipMessage = $skipReason;
            return [];
        }

        $configurations = $this->getUrlsForPageId($pageRow['uid']);
        $skipMessage = '';

        return $configurations;
    }
432
433
    /**
     * Creates a list of URLs from input array (and submits them to queue if asked for)
     * See Web > Info module script + "indexed_search"'s crawler hook-client using this!
     *
     * Each URL is identified by "url|userGroups|procInstrFilter"; a URL whose key is already
     * present in $duplicateTrack is only listed (muted) and not submitted again.
     *
     * @param array $vv Information about URLs from pageRow to crawl.
     * @param array $pageRow Page row
     * @param int $scheduledTime Unix time to schedule indexing to, typically time()
     * @param int $reqMinute Number of requests per minute (creates the interleave between requests); values <= 0 disable the interleave
     * @param bool $submitCrawlUrls If set, submits the URLs to queue
     * @param bool $downloadCrawlUrls If set (and submitcrawlUrls is false) will fill $downloadUrls with entries)
     * @param array $duplicateTrack Array which is passed by reference and contains an id per url to make sure duplicates are not crawled
     * @param array $downloadUrls Array which will be filled with URLS for download if flag is set.
     * @param array $incomingProcInstructions Array of processing instructions
     * @return string List of URLs (meant for display in backend module)
     */
    public function urlListFromUrlArray(
        array $vv,
        array $pageRow,
        $scheduledTime,
        $reqMinute,
        $submitCrawlUrls,
        $downloadCrawlUrls,
        array &$duplicateTrack,
        array &$downloadUrls,
        array $incomingProcInstructions
    ) {
        if (! is_array($vv['URLs'] ?? null)) {
            return 'ERROR - no URL generated';
        }
        $urlLog = [];
        $pageId = (int) $pageRow['uid'];
        $configurationHash = $this->getConfigurationHash($vv);
        $skipInnerCheck = $this->queueRepository->noUnprocessedQueueEntriesForPageWithConfigurationHashExist($pageId, $configurationHash);

        $procInstrFilter = $vv['subCfg']['procInstrFilter'] ?? '';
        $userGroups = $vv['subCfg']['userGroups'] ?? '';

        // The filter does not depend on the individual URL, so it is evaluated once:
        // if it rejects the configuration, no URL would be listed at all.
        if (! $this->drawURLs_PIfilter($procInstrFilter, $incomingProcInstructions)) {
            return '';
        }

        // Seconds between two requests. A non-positive rate means "no interleave"
        // instead of a division by zero.
        $secondsPerRequest = $reqMinute > 0 ? 60 / $reqMinute : 0;

        $urlService = new UrlService();

        foreach ($vv['URLs'] as $urlQuery) {
            $url = (string) $urlService->getUrlFromPageAndQueryParameters(
                $pageId,
                $urlQuery,
                $vv['subCfg']['baseUrl'] ?? null,
                $vv['subCfg']['force_ssl'] ?? 0
            );

            // Create key by which to determine unique-ness:
            $uKey = $url . '|' . $userGroups . '|' . $procInstrFilter;

            if (isset($duplicateTrack[$uKey])) {
                // If the url key is registered just display it and do not resubmit it
                $urlLog[] = '<em><span class="text-muted">' . htmlspecialchars($url) . '</span></em>';
            } else {
                // Scheduled time, offset by the number of URLs seen so far and rounded down to a full minute:
                $schTime = $scheduledTime + round(count($duplicateTrack) * $secondsPerRequest);
                $schTime = intval($schTime / 60) * 60;
                $formattedDate = BackendUtility::datetime($schTime);
                $this->urlList[] = '[' . $formattedDate . '] ' . $url;
                $urlList = '[' . $formattedDate . '] ' . htmlspecialchars($url);

                // Submit for crawling!
                if ($submitCrawlUrls) {
                    // NOTE(review): the queue entry is registered with $scheduledTime, not with the
                    // interleaved $schTime shown in the listing — confirm this is intended.
                    $added = $this->addUrl(
                        $pageId,
                        $url,
                        $vv['subCfg'],
                        $scheduledTime,
                        $configurationHash,
                        $skipInnerCheck
                    );
                    if ($added === false) {
                        $urlList .= ' (URL already existed)';
                    }
                } elseif ($downloadCrawlUrls) {
                    $downloadUrls[$url] = $url;
                }
                $urlLog[] = $urlList;
            }
            $duplicateTrack[$uKey] = true;
        }

        return implode('<br>', $urlLog);
    }
517
518
    /**
     * Returns true if input processing instruction is among registered ones.
     *
     * An empty $incomingProcInstructions array accepts everything. Otherwise $piString is
     * treated as a comma-separated list and must contain at least one incoming instruction.
     *
     * @param string $piString PI to test
     * @param array $incomingProcInstructions Processing instructions
     * @return boolean
     */
    public function drawURLs_PIfilter($piString, array $incomingProcInstructions)
    {
        // No incoming instructions means nothing is filtered out.
        if ($incomingProcInstructions === []) {
            return true;
        }

        foreach ($incomingProcInstructions as $instruction) {
            $isListed = GeneralUtility::inList($piString, $instruction);
            if ($isListed) {
                return true;
            }
        }

        return false;
    }
538
539 2
    /**
     * Loads the page TSconfig for a page and lets registered hooks alter it.
     *
     * When $this->MP is set, the TSconfig is loaded for the id after the '-' in $this->MP
     * instead of $id. Hooks registered under
     * $GLOBALS['TYPO3_CONF_VARS']['EXTCONF']['crawler']['getPageTSconfigForId'] receive
     * 'pageId' and, by reference, 'pageTSConfig'.
     *
     * @param int $id Page uid
     * @return array Page TSconfig, possibly modified by hooks
     */
    public function getPageTSconfigForId(int $id): array
    {
        if (! $this->MP) {
            $pageTSconfig = BackendUtility::getPagesTSconfig($id);
        } else {
            // $MP is documented as bool but is split like a "<x>-<mountPointId>" string here.
            // The explicit casts keep this working under strict_types; a missing second part
            // resolves to 0 instead of raising an undefined-offset notice.
            // NOTE(review): confirm the real format of $MP.
            $mountPointParts = explode('-', (string) $this->MP);
            $mountPointId = (int) ($mountPointParts[1] ?? 0);
            $pageTSconfig = BackendUtility::getPagesTSconfig($mountPointId);
        }

        // Call a hook to alter configuration; the key is absent when no hook is registered.
        $hooks = $GLOBALS['TYPO3_CONF_VARS']['EXTCONF']['crawler']['getPageTSconfigForId'] ?? null;
        if (is_array($hooks)) {
            $params = [
                'pageId' => $id,
                'pageTSConfig' => &$pageTSconfig,
            ];
            foreach ($hooks as $userFunc) {
                GeneralUtility::callUserFunction($userFunc, $params, $this);
            }
        }
        return $pageTSconfig;
    }
561 1
562
    /**
     * Collects the crawler configurations that apply to a page.
     * Adds no urls to the queue!
     *
     * Configurations are read from the page TSconfig first and then complemented by
     * tx_crawler_configuration records found up the rootline; a record never replaces a
     * configuration key that is already present. Finally the registered 'processUrls'
     * hooks receive the result by reference.
     */
    public function getUrlsForPageId(int $pageId): array
    {
        $pageTSconfig = $this->getPageTSconfigForId($pageId);

        $res = [];
        $res = $this->configurationService->getConfigurationFromPageTS($pageTSconfig, $pageId, $res);

        $records = $this->configurationRepository->getCrawlerConfigurationRecordsFromRootLine($pageId);
        foreach ($records as $record) {
            // Records restricted to backend groups are only used by admins or group members.
            $accessGranted = empty($record['begroups'])
                || $this->getBackendUser()->isAdmin()
                || UserService::hasGroupAccess($this->getBackendUser()->user['usergroup_cached_list'], $record['begroups']);
            if (! $accessGranted) {
                continue;
            }

            $pidOnlyList = implode(',', GeneralUtility::trimExplode(',', $record['pidsonly'], true));

            // Use the record if it is not page-specific or if the current page is listed.
            // TODO: Check if $pidOnlyList can be kept as Array instead of imploded
            $appliesToPage = strcmp($record['pidsonly'], '') === 0
                || GeneralUtility::inList($pidOnlyList, strval($pageId));
            if (! $appliesToPage) {
                continue;
            }

            // Don't overwrite previously defined paramSets
            $key = $record['name'];
            if (isset($res[$key])) {
                continue;
            }

            /* @var $typoScriptParser TypoScriptParser */
            $typoScriptParser = GeneralUtility::makeInstance(TypoScriptParser::class);
            $typoScriptParser->parse($record['processing_instruction_parameters_ts']);

            $subCfg = [
                'procInstrFilter' => $record['processing_instruction_filter'],
                'procInstrParams.' => $typoScriptParser->setup,
                'baseUrl' => $record['base_url'],
                'force_ssl' => (int) $record['force_ssl'],
                'userGroups' => $record['fegroups'],
                'exclude' => $record['exclude'],
                'key' => $key,
            ];

            // Pages named in the record's exclude string get no entry for this configuration.
            if (in_array($pageId, $this->expandExcludeString($subCfg['exclude']), true)) {
                continue;
            }

            $paramParsed = GeneralUtility::explodeUrl2Array($record['configuration']);
            $paramExpanded = $this->configurationService->expandParameters($paramParsed, $pageId);
            $res[$key] = [
                'subCfg' => $subCfg,
                'paramParsed' => $paramParsed,
                'paramExpanded' => $paramExpanded,
                'URLs' => $this->urlService->compileUrls($paramExpanded, ['?id=' . $pageId], $this->getMaximumUrlsToCompile()),
                'origin' => 'tx_crawler_configuration_' . $record['uid'],
            ];
        }

        foreach ($GLOBALS['TYPO3_CONF_VARS']['EXTCONF']['crawler']['processUrls'] ?? [] as $hook) {
            $hookParameters = [
                'res' => &$res,
            ];
            GeneralUtility::callUserFunction($hook, $hookParameters, $this);
        }

        return $res;
    }
627
628 2
    /**
     * Find all configurations of subpages of a page
     *
     * The result lists the names of the array-valued paramSets in the page TSconfig of
     * $rootid (trailing dot removed), followed by the names of the configuration records
     * returned for the rootline and the page tree below $rootid.
     *
     * TODO: Write Functional Tests
     *
     * @param int $rootid Uid of the page the branch starts at
     * @param int $depth Depth passed to the page tree lookup
     * @return array Configuration names; entries are not de-duplicated
     */
    public function getConfigurationsForBranch(int $rootid, int $depth): array
    {
        $names = [];

        $pageTSconfig = $this->getPageTSconfigForId($rootid);
        $paramSets = $pageTSconfig['tx_crawler.']['crawlerCfg.']['paramSets.'] ?? [];
        foreach ($paramSets as $setKey => $setValue) {
            // Only array-valued entries are considered.
            if (is_array($setValue)) {
                $endsWithDot = substr($setKey, -1) === '.';
                $names[] = $endsWithDot ? substr($setKey, 0, -1) : $setKey;
            }
        }

        // Page ids of the rootline ...
        $pageIds = [];
        foreach (BackendUtility::BEgetRootLine($rootid) as $rootLinePage) {
            $pageIds[] = $rootLinePage['uid'];
        }

        // ... plus the pages of the tree below $rootid, limited by the user's show permission.
        /* @var PageTreeView $pageTree */
        $pageTree = GeneralUtility::makeInstance(PageTreeView::class);
        $permsClause = $this->getBackendUser()->getPagePermsClause(Permission::PAGE_SHOW);
        $pageTree->init(empty($permsClause) ? '' : ('AND ' . $permsClause));
        $pageTree->getTree($rootid, $depth, '');
        foreach ($pageTree->tree as $treeNode) {
            $pageIds[] = $treeNode['row']['uid'];
        }

        $records = $this->configurationRepository->getCrawlerConfigurationRecordsFromRootLine($rootid, $pageIds);
        foreach ($records as $record) {
            $names[] = $record['name'];
        }

        return $names;
    }
664 1
665 1
    /**
     * Tells whether a user may access an item, based on group membership
     * (e.g. pass the group list of the current logged in user from $GLOBALS['TSFE']->gr_list)
     *
     * @param string $groupList Comma-separated list of (fe_)group UIDs from a user
     * @param string $accessList Comma-separated list of (fe_)group UIDs of the item to access
     * @return bool TRUE if the access list is empty or at least one of the user's group UIDs is part of it
     * @see \TYPO3\CMS\Frontend\Page\PageRepository::getMultipleGroupsWhereClause()
     * @deprecated
     * @codeCoverageIgnore
     */
    public function hasGroupAccess($groupList, $accessList)
    {
        // An item without access restriction is open to everybody.
        $accessGranted = empty($accessList);

        if (! $accessGranted) {
            $userGroupUids = GeneralUtility::intExplode(',', $groupList);
            foreach ($userGroupUids as $userGroupUid) {
                if (GeneralUtility::inList($accessList, $userGroupUid)) {
                    // One matching group is sufficient.
                    $accessGranted = true;
                    break;
                }
            }
        }

        return $accessGranted;
    }
688
689 2
    /**
     * Expands the parameter configuration into the individual values of each parameter.
     *
     * Syntax of values:
     * - If the value is wrapped in [...] it is expanded according to the following syntax, otherwise the value is taken literally
     * - The configuration is split by "|"; the parts are processed individually and finally added together
     * - For each configuration part:
     *         - "[int]-[int]" = Integer range, expanded to all values in between, both ends included, from low to high (max. 1000 values). Example "1-34" or "-40--30"
     *         - "_TABLE:[TCA table name];[_PID:[optional page id, default is current page]];[_ENABLELANG:1]" = Look up of table records from PID, filtering out deleted records. Example "_TABLE:tt_content; _PID:123"
     *           _ENABLELANG:1 picks only original records without their language overlays
     *           Further optional keys read below: _RECURSIVE, _PIDFIELD, _WHERE, _ADDTABLE, _FIELD
     *         - Default: Literal value
     *
     * After each part the hooks registered in
     * $GLOBALS['TYPO3_CONF_VARS']['SC_OPTIONS']['crawler/class.tx_crawler_lib.php']['expandParameters']
     * are called and may modify the parameter array by reference.
     *
     * @param array $paramArray Array with key (GET var name) and values (value of GET var which is configuration for expansion)
     * @param int $pid Current page ID
     * @return array The input array with every value replaced by the list of its expanded values
     * @deprecated
     *
     * TODO: Write Functional Tests
     */
    public function expandParameters($paramArray, $pid)
    {
        // Traverse parameter names:
        foreach ($paramArray as $p => $v) {
            $v = trim($v);

            // If value is encapsulated in square brackets it means there are some ranges of values to find, otherwise the value is literal
            if (strpos($v, '[') === 0 && substr($v, -1) === ']') {
                // So, find the value inside brackets and reset the paramArray value as an array.
                $v = substr($v, 1, -1);
                $paramArray[$p] = [];

                // Explode parts and traverse them:
                $parts = explode('|', $v);
                foreach ($parts as $pV) {
                    // Look for integer range: (fx. 1-34 or -40--30 // reads minus 40 to minus 30)
                    if (preg_match('/^(-?[0-9]+)\s*-\s*(-?[0-9]+)$/', trim($pV), $reg)) {
                        $reg = $this->swapIfFirstIsLargerThanSecond($reg);

                        // Traverse range, add values:
                        // Limit to size of range!
                        $runAwayBrake = 1000;
                        for ($a = $reg[1]; $a <= $reg[2]; $a++) {
                            $paramArray[$p][] = $a;
                            $runAwayBrake--;
                            if ($runAwayBrake <= 0) {
                                break;
                            }
                        }
                    } elseif (strpos(trim($pV), '_TABLE:') === 0) {
                        // Parse parameters ("key:value" pairs separated by ";"):
                        $subparts = GeneralUtility::trimExplode(';', $pV);
                        $subpartParams = [];
                        foreach ($subparts as $spV) {
                            // A sub part without ":" yields a NULL value instead of an undefined offset.
                            [$pKey, $pVal] = array_pad(GeneralUtility::trimExplode(':', $spV), 2, null);
                            $subpartParams[$pKey] = $pVal;
                        }
                        $table = $subpartParams['_TABLE'];

                        // Table exists:
                        if (isset($GLOBALS['TCA'][$table])) {
                            $lookUpPid = isset($subpartParams['_PID']) ? intval($subpartParams['_PID']) : intval($pid);
                            $recursiveDepth = isset($subpartParams['_RECURSIVE']) ? intval($subpartParams['_RECURSIVE']) : 0;
                            $pidField = isset($subpartParams['_PIDFIELD']) ? trim($subpartParams['_PIDFIELD']) : 'pid';
                            // NOTE(review): _WHERE and _ADDTABLE are handed to the query builder as raw SQL fragments.
                            $where = $subpartParams['_WHERE'] ?? '';
                            $addTable = $subpartParams['_ADDTABLE'] ?? '';

                            // _FIELD is optional; a missing or empty value falls back to the uid column.
                            $fieldName = ! empty($subpartParams['_FIELD']) ? $subpartParams['_FIELD'] : 'uid';
                            if ($fieldName === 'uid' || ! empty($GLOBALS['TCA'][$table]['columns'][$fieldName])) {
                                $queryBuilder = $this->getQueryBuilder($table);

                                if ($recursiveDepth > 0) {
                                    /** @var QueryGenerator $queryGenerator */
                                    $queryGenerator = GeneralUtility::makeInstance(QueryGenerator::class);
                                    $pidList = $queryGenerator->getTreeList($lookUpPid, $recursiveDepth, 0, 1);
                                    $pidArray = GeneralUtility::intExplode(',', $pidList);
                                } else {
                                    $pidArray = [(string) $lookUpPid];
                                }

                                // Only the "deleted" restriction stays active for the lookup.
                                $queryBuilder->getRestrictions()
                                    ->removeAll()
                                    ->add(GeneralUtility::makeInstance(DeletedRestriction::class));

                                $queryBuilder
                                    ->select($fieldName)
                                    ->from($table)
                                    ->where(
                                        $queryBuilder->expr()->in($pidField, $queryBuilder->createNamedParameter($pidArray, Connection::PARAM_INT_ARRAY)),
                                        $where
                                    );

                                if (! empty($addTable)) {
                                    // TODO: Check if this works as intended!
                                    $queryBuilder->add('from', $addTable);
                                }

                                // Tables without translation support have no transOrigPointerField in their ctrl section.
                                $transOrigPointerField = $GLOBALS['TCA'][$table]['ctrl']['transOrigPointerField'] ?? null;

                                if (! empty($subpartParams['_ENABLELANG']) && $transOrigPointerField) {
                                    $queryBuilder->andWhere(
                                        $queryBuilder->expr()->lte(
                                            $transOrigPointerField,
                                            0
                                        )
                                    );
                                }

                                $statement = $queryBuilder->execute();

                                // Index by field value so that duplicate values collapse into one entry.
                                $rows = [];
                                while ($row = $statement->fetch()) {
                                    $rows[$row[$fieldName]] = $row;
                                }

                                $paramArray[$p] = array_merge($paramArray[$p], array_keys($rows));
                            }
                        }
                    } else {
                        // Just add value:
                        $paramArray[$p][] = $pV;
                    }

                    // Hook for processing own expandParameters place holder
                    $expandParametersHooks = $GLOBALS['TYPO3_CONF_VARS']['SC_OPTIONS']['crawler/class.tx_crawler_lib.php']['expandParameters'] ?? null;
                    if (is_array($expandParametersHooks)) {
                        $_params = [
                            'pObj' => &$this,
                            'paramArray' => &$paramArray,
                            'currentKey' => $p,
                            'currentValue' => $pV,
                            'pid' => $pid,
                        ];
                        foreach ($expandParametersHooks as $_funcRef) {
                            GeneralUtility::callUserFunction($_funcRef, $_params, $this);
                        }
                    }
                }

                // Make unique set of values and sort array by key:
                $paramArray[$p] = array_unique($paramArray[$p]);
                ksort($paramArray);
            } else {
                // Set the literal value as only value in array:
                $paramArray[$p] = [$v];
            }
        }

        return $paramArray;
    }
837
838
    /**
     * Builds the list of URLs for a parameter array (output of expandParameters()).
     * The number of URLs will be the multiplication of the number of parameter values for each key.
     *
     * The work is delegated to the URL service, limited by getMaximumUrlsToCompile().
     *
     * @param array $paramArray Output of expandParameters(): Array with keys (GET var names) and for each an array of values
     * @param array $urls URLs accumulated in this array (for recursion)
     * @deprecated
     */
    public function compileUrls(array $paramArray, array $urls): array
    {
        $maximumUrls = $this->getMaximumUrlsToCompile();

        return $this->urlService->compileUrls($paramArray, $urls, $maximumUrls);
    }
850
851 5
    /************************************
852
     *
853 5
     * Crawler log
854 5
     *
855
     ************************************/
856
857 4
    /**
     * Return array of records from crawler queue for input page ID
     *
     * The records are ordered by their "scheduled" value, newest first.
     *
     * @param int $id Page ID for which to look up log entries.
     * @param QueueFilter $queueFilter Filter compared against "pending" (exec_time = 0) and "finished" (exec_time > 0); any other value applies no exec_time condition
     * @param bool $doFlush If TRUE, the queue repository is asked to flush the queue for the given filter before the entries are read
     * @param bool $doFullFlush Not used by this method; kept so existing callers keep working
     * @param int $itemsPerPage Limit the amount of entries per page, default is 10; a value of 0 or less disables the limit
     * @return array
     *
     * @deprecated
     */
    public function getLogEntriesForPageId($id, QueueFilter $queueFilter, $doFlush = false, $doFullFlush = false, $itemsPerPage = 10)
    {
        $queryBuilder = GeneralUtility::makeInstance(ConnectionPool::class)->getQueryBuilderForTable($this->tableName);
        $queryBuilder
            ->select('*')
            ->from($this->tableName)
            ->where(
                $queryBuilder->expr()->eq('page_id', $queryBuilder->createNamedParameter($id, PDO::PARAM_INT))
            )
            ->orderBy('scheduled', 'DESC');

        // The loose comparison of switch is kept on purpose: $queueFilter is an object that is compared to plain strings.
        switch ($queueFilter) {
            case 'pending':
                $queryBuilder->andWhere($queryBuilder->expr()->eq('exec_time', 0));
                break;
            case 'finished':
                $queryBuilder->andWhere($queryBuilder->expr()->gt('exec_time', 0));
                break;
        }

        if ($doFlush) {
            $this->queueRepository->flushQueue($queueFilter);
        }
        if ($itemsPerPage > 0) {
            $queryBuilder
                ->setMaxResults((int) $itemsPerPage);
        }

        return $queryBuilder->execute()->fetchAll();
    }
905 4
906
    /**
     * Return array of records from crawler queue for input set ID
     *
     * The records are ordered by their "scheduled" value, newest first.
     *
     * @param int $set_id Set ID for which to look up log entries.
     * @param string $filter Filter: "all" => all entries, "pending" => all that is not yet run, "finished" => all complete ones
     * @param bool $doFlush If TRUE, the matching entries are flushed from the queue instead of being selected, and an empty array is returned
     * @param bool $doFullFlush Only evaluated together with $doFlush: if TRUE, flushQueue() is called with an empty condition instead of the filter/set condition
     * @param int $itemsPerPage Limit the amount of entries per page, default is 10; a value of 0 or less disables the limit
     * @return array
     *
     * @deprecated
     */
    public function getLogEntriesForSetId(int $set_id, string $filter = '', bool $doFlush = false, bool $doFullFlush = false, int $itemsPerPage = 10)
    {
        $queryBuilder = GeneralUtility::makeInstance(ConnectionPool::class)->getQueryBuilderForTable($this->tableName);
        $queryBuilder
            ->select('*')
            ->from($this->tableName)
            ->where(
                $queryBuilder->expr()->eq('set_id', $queryBuilder->createNamedParameter($set_id, PDO::PARAM_INT))
            )
            ->orderBy('scheduled', 'DESC');

        // $query collects the same conditions as an AND-combined expression; it is only handed to flushQueue() below.
        $expressionBuilder = GeneralUtility::makeInstance(ConnectionPool::class)
            ->getConnectionForTable($this->tableName)
            ->getExpressionBuilder();
        $query = $expressionBuilder->andX();

        switch ($filter) {
            case 'pending':
                $queryBuilder->andWhere($queryBuilder->expr()->eq('exec_time', 0));
                $query->add($expressionBuilder->eq('exec_time', 0));
                break;
            case 'finished':
                $queryBuilder->andWhere($queryBuilder->expr()->gt('exec_time', 0));
                $query->add($expressionBuilder->gt('exec_time', 0));
                break;
        }

        if ($doFlush) {
            $addWhere = $query->add($expressionBuilder->eq('set_id', $set_id));
            $this->flushQueue($doFullFlush ? '' : $addWhere);
            return [];
        }
        if ($itemsPerPage > 0) {
            $queryBuilder
                ->setMaxResults($itemsPerPage);
        }

        return $queryBuilder->execute()->fetchAll();
    }
958 6
959 4
    /**
     * Writes a call back entry into the crawler queue (typically called from hooks, see indexed search class "class.crawler.php")
     *
     * The call back reference is stored in the parameters under the key "_CALLBACKOBJ".
     *
     * @param int $setId Set ID
     * @param array $params Parameters to pass to call back function; anything that is not an array is replaced by an empty array
     * @param string $callBack Call back object reference, eg. 'EXT:indexed_search/class.crawler.php:&tx_indexedsearch_crawler'
     * @param int $page_id Page ID to attach it to
     * @param int $schedule Time at which to activate; 0 means "now" (current time is used)
     */
    public function addQueueEntry_callBack($setId, $params, $callBack, $page_id = 0, $schedule = 0): void
    {
        $callBackParameters = is_array($params) ? $params : [];
        $callBackParameters['_CALLBACKOBJ'] = $callBack;

        $connection = GeneralUtility::makeInstance(ConnectionPool::class)
            ->getConnectionForTable(QueueRepository::TABLE_NAME);

        // Fall back to the current time when no schedule time was given.
        $scheduledAt = (int) $schedule;
        if (! $scheduledAt) {
            $scheduledAt = $this->getCurrentTime();
        }

        $queueRow = [
            'page_id' => (int) $page_id,
            'parameters' => json_encode($callBackParameters),
            'scheduled' => $scheduledAt,
            'exec_time' => 0,
            'set_id' => (int) $setId,
            'result_data' => '',
        ];

        $connection->insert(QueueRepository::TABLE_NAME, $queueRow);
    }
988
989
    /************************************
990
     *
991
     * URL setting
992
     *
993
     ************************************/
994
995
    /**
     * Setting a URL for crawling:
     *
     * Builds the queue row for the URL and either registers it internally only
     * (when registerQueueEntriesInternallyOnly is set) or inserts it into the
     * queue table, unless an equal entry is already queued. A signal is emitted
     * for both the "added" and the "duplicate" case.
     *
     * @param int $id Page ID
     * @param string $url Complete URL
     * @param array $subCfg Sub configuration array (from TS config); the keys "userGroups", "procInstrFilter", "procInstrParams." and "key" are read and are all optional
     * @param int $tstamp Scheduled-time
     * @param string $configurationHash (optional) configuration hash
     * @param bool $skipInnerDuplicationCheck (optional) skip inner duplication check
     * @return bool TRUE only if a new row was inserted into the queue table
     */
    public function addUrl(
        $id,
        $url,
        array $subCfg,
        $tstamp,
        $configurationHash = '',
        $skipInnerDuplicationCheck = false
    ) {
        $urlAdded = false;
        $rows = [];

        // Creating parameters:
        $parameters = [
            'url' => $url,
        ];

        // fe user group simulation:
        $uGs = implode(',', array_unique(GeneralUtility::intExplode(',', $subCfg['userGroups'] ?? '', true)));
        if ($uGs) {
            $parameters['feUserGroupList'] = $uGs;
        }

        // Setting processing instructions
        $parameters['procInstructions'] = GeneralUtility::trimExplode(',', $subCfg['procInstrFilter'] ?? '');
        if (is_array($subCfg['procInstrParams.'] ?? null)) {
            $parameters['procInstrParams'] = $subCfg['procInstrParams.'];
        }

        // Compile value array:
        $parameters_serialized = json_encode($parameters);
        $fieldArray = [
            'page_id' => (int) $id,
            'parameters' => $parameters_serialized,
            'parameters_hash' => GeneralUtility::shortMD5($parameters_serialized),
            'configuration_hash' => $configurationHash,
            'scheduled' => $tstamp,
            'exec_time' => 0,
            'set_id' => (int) $this->setID,
            'result_data' => '',
            // Stays NULL when the sub configuration carries no key, exactly as before.
            'configuration' => $subCfg['key'] ?? null,
        ];

        if ($this->registerQueueEntriesInternallyOnly) {
            //the entries will only be registered and not stored to the database
            $this->queueEntries[] = $fieldArray;
        } else {
            if (! $skipInnerDuplicationCheck) {
                // check if there is already an equal entry
                $rows = $this->queueRepository->getDuplicateQueueItemsIfExists(
                    (bool) ($this->extensionSettings['enableTimeslot'] ?? false),
                    $tstamp,
                    $this->getCurrentTime(),
                    $fieldArray['page_id'],
                    $fieldArray['parameters_hash']
                );
            }

            if (empty($rows)) {
                $connectionForCrawlerQueue = GeneralUtility::makeInstance(ConnectionPool::class)->getConnectionForTable(QueueRepository::TABLE_NAME);
                $connectionForCrawlerQueue->insert(
                    QueueRepository::TABLE_NAME,
                    $fieldArray
                );
                $uid = $connectionForCrawlerQueue->lastInsertId(QueueRepository::TABLE_NAME, 'qid');
                $rows[] = $uid;
                $urlAdded = true;

                $signalPayload = ['uid' => $uid, 'fieldArray' => $fieldArray];
                SignalSlotUtility::emitSignal(
                    self::class,
                    SignalSlotUtility::SIGNAL_URL_ADDED_TO_QUEUE,
                    $signalPayload
                );
            } else {
                $signalPayload = ['rows' => $rows, 'fieldArray' => $fieldArray];
                SignalSlotUtility::emitSignal(
                    self::class,
                    SignalSlotUtility::SIGNAL_DUPLICATE_URL_IN_QUEUE,
                    $signalPayload
                );
            }
        }

        return $urlAdded;
    }
1091
1092
    /**
     * Provides the current system time as a Unix timestamp.
     *
     * @return int
     */
    public function getCurrentTime()
    {
        $now = time();

        return $now;
    }
1101
1102
    /************************************
1103
     *
1104
     * URL reading
1105
     *
1106
     ************************************/
1107
1108
    /**
1109
     * Read URL for single queue entry
1110
     *
1111
     * @param integer $queueId
1112
     * @param boolean $force If set, will process even if exec_time has been set!
1113
     *
1114
     * @return int|null
1115
     */
1116
    public function readUrl($queueId, $force = false)
1117
    {
1118
        $queryBuilder = GeneralUtility::makeInstance(ConnectionPool::class)->getQueryBuilderForTable(QueueRepository::TABLE_NAME);
1119
        $ret = 0;
1120
        $this->logger->debug('crawler-readurl start ' . microtime(true));
0 ignored issues
show
Bug introduced by
The method debug() does not exist on null. ( Ignorable by Annotation )

If this is a false-positive, you can also ignore this issue in your code via the ignore-call  annotation

1120
        $this->logger->/** @scrutinizer ignore-call */ 
1121
                       debug('crawler-readurl start ' . microtime(true));

This check looks for calls to methods that do not seem to exist on a given type. It looks for the method on the type itself as well as in inherited classes or implemented interfaces.

This is most likely a typographical error or the method has been renamed.

Loading history...
1121
1122
        $queryBuilder
1123
            ->select('*')
1124
            ->from(QueueRepository::TABLE_NAME)
1125
            ->where(
1126
                $queryBuilder->expr()->eq('qid', $queryBuilder->createNamedParameter($queueId, PDO::PARAM_INT))
1127
            );
1128
        if (! $force) {
1129
            $queryBuilder
1130
                ->andWhere('exec_time = 0')
1131
                ->andWhere('process_scheduled > 0');
1132
        }
1133
        $queueRec = $queryBuilder->execute()->fetch();
1134
1135
        if (! is_array($queueRec)) {
1136
            return;
1137
        }
1138
1139
        SignalSlotUtility::emitSignal(
0 ignored issues
show
Deprecated Code introduced by
The function AOE\Crawler\Utility\Sign...otUtility::emitSignal() has been deprecated. ( Ignorable by Annotation )

If this is a false-positive, you can also ignore this issue in your code via the ignore-deprecated  annotation

1139
        /** @scrutinizer ignore-deprecated */ SignalSlotUtility::emitSignal(
Loading history...
1140
            self::class,
1141
            SignalSlotUtility::SIGNAL_QUEUEITEM_PREPROCESS,
1142
            [$queueId, &$queueRec]
1143
        );
1144
1145
        // Set exec_time to lock record:
1146
        $field_array = ['exec_time' => $this->getCurrentTime()];
1147
1148
        if (isset($this->processID)) {
1149
            //if mulitprocessing is used we need to store the id of the process which has handled this entry
1150
            $field_array['process_id_completed'] = $this->processID;
1151
        }
1152
1153
        GeneralUtility::makeInstance(ConnectionPool::class)->getConnectionForTable(QueueRepository::TABLE_NAME)
1154
            ->update(
1155
                QueueRepository::TABLE_NAME,
1156
                $field_array,
1157
                ['qid' => (int) $queueId]
1158
            );
1159
1160
        $result = $this->queueExecutor->executeQueueItem($queueRec, $this);
1161
        if ($result['content'] === null) {
1162
            $resultData = 'An errors happened';
0 ignored issues
show
Unused Code introduced by
The assignment to $resultData is dead and can be removed.
Loading history...
1163
        } else {
1164
            /** @var JsonCompatibilityConverter $jsonCompatibilityConverter */
1165
            $jsonCompatibilityConverter = GeneralUtility::makeInstance(JsonCompatibilityConverter::class);
1166
            $resultData = $jsonCompatibilityConverter->convert($result['content']);
1167
1168
            //atm there's no need to point to specific pollable extensions
1169
            if (is_array($resultData) && is_array($GLOBALS['TYPO3_CONF_VARS']['EXTCONF']['crawler']['pollSuccess'])) {
1170
                foreach ($GLOBALS['TYPO3_CONF_VARS']['EXTCONF']['crawler']['pollSuccess'] as $pollable) {
1171
                    // only check the success value if the instruction is runnig
1172
                    // it is important to name the pollSuccess key same as the procInstructions key
1173
                    if (is_array($resultData['parameters']['procInstructions'])
1174
                        && in_array(
1175
                            $pollable,
1176
                            $resultData['parameters']['procInstructions'], true
1177
                        )
1178
                    ) {
1179
                        if (! empty($resultData['success'][$pollable]) && $resultData['success'][$pollable]) {
1180
                            $ret |= self::CLI_STATUS_POLLABLE_PROCESSED;
1181
                        }
1182
                    }
1183
                }
1184
            }
1185
        }
1186
        // Set result in log which also denotes the end of the processing of this entry.
1187
        $field_array = ['result_data' => json_encode($result)];
1188
1189
        SignalSlotUtility::emitSignal(
0 ignored issues
show
Deprecated Code introduced by
The function AOE\Crawler\Utility\Sign...otUtility::emitSignal() has been deprecated. ( Ignorable by Annotation )

If this is a false-positive, you can also ignore this issue in your code via the ignore-deprecated  annotation

1189
        /** @scrutinizer ignore-deprecated */ SignalSlotUtility::emitSignal(
Loading history...
1190
            self::class,
1191
            SignalSlotUtility::SIGNAL_QUEUEITEM_POSTPROCESS,
1192
            [$queueId, &$field_array]
1193
        );
1194
1195
        GeneralUtility::makeInstance(ConnectionPool::class)->getConnectionForTable(QueueRepository::TABLE_NAME)
1196
            ->update(
1197
                QueueRepository::TABLE_NAME,
1198
                $field_array,
1199
                ['qid' => (int) $queueId]
1200
            );
1201
1202
        $this->logger->debug('crawler-readurl stop ' . microtime(true));
1203
        return $ret;
1204
    }
1205
1206
    /**
     * Inserts a not-yet-stored queue entry, executes it immediately and writes the result back.
     *
     * The entry is stamped with the current time as exec_time before it is inserted, so it is
     * stored as locked. After execution the JSON-encoded result is stored in result_data;
     * listeners of the post-process signal may modify the fields written back.
     *
     * @param array $field_array Queue field array
     *
     * @return array|bool|mixed|string Whatever the queue executor returned for this entry
     */
    public function readUrlFromArray($field_array)
    {
        // Set exec_time to lock record:
        $field_array['exec_time'] = $this->getCurrentTime();

        $queueTable = QueueRepository::TABLE_NAME;
        $queueConnection = GeneralUtility::makeInstance(ConnectionPool::class)->getConnectionForTable($queueTable);
        $queueConnection->insert($queueTable, $field_array);

        // The generated qid is needed both by the executor and for the final update
        $insertedQueueId = $queueConnection->lastInsertId($queueTable, 'qid');
        $field_array['qid'] = $insertedQueueId;

        $executionResult = $this->queueExecutor->executeQueueItem($field_array, $this);

        // Set result in log which also denotes the end of the processing of this entry.
        $resultFields = ['result_data' => json_encode($executionResult)];

        // The fields are handed over by reference so signal listeners can adjust them
        SignalSlotUtility::emitSignal(
            self::class,
            SignalSlotUtility::SIGNAL_QUEUEITEM_POSTPROCESS,
            [$insertedQueueId, &$resultFields]
        );

        $queueConnection->update(
            $queueTable,
            $resultFields,
            ['qid' => $insertedQueueId]
        );

        return $executionResult;
    }
1242
1243
    /*****************************
1244
     *
1245
     * Compiling URLs to crawl - tools
1246
     *
1247
     *****************************/
1248
1249
    /**
     * Walks the page tree below a root page and renders the URL rows for every page found.
     *
     * The passed options are stored on the controller so that drawURLs_addRowsForPage() can
     * use them; the duplicate tracking and download URL lists are reset on every call.
     * Mount point pages additionally get the rows of their mounted tree rendered.
     *
     * @param int $id Root page id to start from.
     * @param int $depth Depth of tree, 0=only id-page, 1= on sublevel, 99 = infinite
     * @param int $scheduledTime Unix Time when the URL is timed to be visited when put in queue
     * @param int $reqMinute Number of requests per minute (creates the interleave between requests)
     * @param bool $submitCrawlUrls If set, submits the URLs to queue in database (real crawling)
     * @param bool $downloadCrawlUrls If set (and submitcrawlUrls is false) will fill $downloadUrls with entries)
     * @param array $incomingProcInstructions Array of processing instructions
     * @param array $configurationSelection Array of configuration keys
     * @return string Concatenated HTML table rows of all visited pages
     */
    public function getPageTreeAndUrls(
        $id,
        $depth,
        $scheduledTime,
        $reqMinute,
        $submitCrawlUrls,
        $downloadCrawlUrls,
        array $incomingProcInstructions,
        array $configurationSelection
    ) {
        $this->scheduledTime = $scheduledTime;
        $this->reqMinute = $reqMinute;
        $this->submitCrawlUrls = $submitCrawlUrls;
        $this->downloadCrawlUrls = $downloadCrawlUrls;
        $this->incomingProcInstructions = $incomingProcInstructions;
        $this->incomingConfigurationSelection = $configurationSelection;

        $this->duplicateTrack = [];
        $this->downloadUrls = [];

        // Drawing tree:
        /* @var PageTreeView $tree */
        $tree = GeneralUtility::makeInstance(PageTreeView::class);
        $perms_clause = $this->getBackendUser()->getPagePermsClause(Permission::PAGE_SHOW);
        $tree->init('AND ' . $perms_clause);

        $pageInfo = BackendUtility::readPageAccess($id, $perms_clause);
        if (is_array($pageInfo)) {
            // Set root row:
            $tree->tree[] = [
                'row' => $pageInfo,
                'HTML' => $this->iconFactory->getIconForRecord('pages', $pageInfo, Icon::SIZE_SMALL),
            ];
        }

        // Get branch beneath:
        if ($depth) {
            $tree->getTree($id, $depth, '');
        }

        // Traverse page tree:
        $code = '';

        foreach ($tree->tree as $data) {
            $this->MP = false;

            // recognize mount points
            // The doktype is cast because database rows may deliver it as a numeric string,
            // which would never be identical to the integer constant.
            if ((int) ($data['row']['doktype'] ?? 0) === PageRepository::DOKTYPE_MOUNTPOINT) {
                // NOTE(review): the mount page is read through index 0 of the getPage() result;
                // confirm that getPage() really returns a list of rows and not a single row.
                $mountpage = $this->pageRepository->getPage($data['row']['uid']);

                // fetch mounted pages
                $this->MP = $mountpage[0]['mount_pid'] . '-' . $data['row']['uid'];

                $mountTree = GeneralUtility::makeInstance(PageTreeView::class);
                $mountTree->init('AND ' . $perms_clause);
                $mountTree->getTree($mountpage[0]['mount_pid'], $depth);

                foreach ($mountTree->tree as $mountData) {
                    $code .= $this->drawURLs_addRowsForPage(
                        $mountData['row'],
                        $mountData['HTML'] . BackendUtility::getRecordTitle('pages', $mountData['row'], true)
                    );
                }

                // replace page when mount_pid_ol is enabled
                if ($mountpage[0]['mount_pid_ol']) {
                    $data['row']['uid'] = $mountpage[0]['mount_pid'];
                } else {
                    // if the mount_pid_ol is not set the MP must not be used for the mountpoint page
                    $this->MP = false;
                }
            }

            $code .= $this->drawURLs_addRowsForPage(
                $data['row'],
                $data['HTML'] . BackendUtility::getRecordTitle('pages', $data['row'], true)
            );
        }

        return $code;
    }
1341
1342
    /**
     * Expands an exclude string into the list of page ids it covers.
     *
     * The string is a comma separated list of entries in the form "pid", "pid+" or
     * "pid+depth". A bare "pid" excludes only that page, "pid+" excludes the page and
     * its subtree down to depth 99, "pid+depth" limits the subtree to the given depth.
     * Results (and the page trees read for them) are kept in static caches.
     *
     * @param string $excludeString Exclude string
     * @return array List of unique page ids (integers)
     */
    public function expandExcludeString($excludeString)
    {
        // internal static caches;
        static $expandedExcludeStringCache = [];
        static $treeCache = [];

        $cacheKey = (string) $excludeString;

        // isset() instead of empty(): an empty result is a valid cache entry as well
        if (isset($expandedExcludeStringCache[$cacheKey])) {
            return $expandedExcludeStringCache[$cacheKey];
        }

        $pidList = [];

        if (! empty($excludeString)) {
            /** @var PageTreeView $tree */
            $tree = GeneralUtility::makeInstance(PageTreeView::class);
            $tree->init('AND ' . $this->getBackendUser()->getPagePermsClause(Permission::PAGE_SHOW));

            $excludeParts = GeneralUtility::trimExplode(',', $excludeString);

            foreach ($excludeParts as $excludePart) {
                // An entry without "+" yields a single element, so the depth part is optional
                $parts = GeneralUtility::trimExplode('+', $excludePart);
                $pid = (int) ($parts[0] ?? 0);
                $requestedDepth = $parts[1] ?? '';

                if (empty($requestedDepth)) {
                    // default is "page only" = "depth=0"; a trailing "+" means the whole subtree
                    $depth = strpos($excludePart, '+') !== false ? 99 : 0;
                } else {
                    $depth = (int) $requestedDepth;
                }

                $pidList[] = $pid;

                if ($depth > 0) {
                    if (! isset($treeCache[$pid][$depth])) {
                        $tree->reset();
                        $tree->getTree($pid, $depth);
                        $treeCache[$pid][$depth] = $tree->tree;
                    }

                    foreach ($treeCache[$pid][$depth] as $data) {
                        $pidList[] = (int) $data['row']['uid'];
                    }
                }
            }
        }

        $expandedExcludeStringCache[$cacheKey] = array_unique($pidList);

        return $expandedExcludeStringCache[$cacheKey];
    }
1393
1394
    /**
     * Create the rows for display of the page tree
     * For each page a number of rows are shown displaying GET variable configuration
     *
     * One table row is rendered per crawler configuration that applies to the page. Pages
     * listed in a configuration's exclude setting get a "No entries" row for that
     * configuration; pages without any configuration get a single "No entries" row.
     *
     * @param array $pageRow Page record the rows are rendered for
     * @param string $pageTitle Ready-to-output HTML for the title column (emitted as is)
     * @return string HTML table rows
     */
    public function drawURLs_addRowsForPage(array $pageRow, string $pageTitle): string
    {
        $skipMessage = '';

        // Get list of configurations
        $configurations = $this->getUrlsForPageRow($pageRow, $skipMessage);
        $configurations = ConfigurationService::removeDisallowedConfigurations($this->incomingConfigurationSelection, $configurations);

        if (empty($configurations)) {
            $message = ! empty($skipMessage) ? ' (' . $skipMessage . ')' : '';

            // Compile row:
            return '
                <tr>
                    <td>' . $pageTitle . '</td>
                    <td colspan="6"><em>No entries</em>' . $message . '</td>
                </tr>';
        }

        // expandExcludeString() returns integers and the lookup below is strict,
        // so the uid has to be an integer too (database rows may deliver strings).
        $pageUid = (int) $pageRow['uid'];
        $content = '';
        $isFirstRow = true;

        // Traverse parameter combinations:
        foreach ($configurations as $confKey => $confArray) {
            // Title column: only rendered once and spanning all configuration rows
            $titleClm = $isFirstRow
                ? '<td rowspan="' . count($configurations) . '">' . $pageTitle . '</td>'
                : '';
            $isFirstRow = false;

            // Array keys may be integers; htmlspecialchars() requires a string under strict_types
            $confKeyCell = htmlspecialchars((string) $confKey);

            if (in_array($pageUid, $this->expandExcludeString($confArray['subCfg']['exclude'] ?? ''), true)) {
                $content .= '<tr>
                            ' . $titleClm . '
                            <td>' . $confKeyCell . '</td>
                            <td colspan="5"><em>No entries</em> (Page is excluded in this configuration)</td>
                        </tr>';
                continue;
            }

            // URL list:
            $urlList = $this->urlListFromUrlArray(
                $confArray,
                $pageRow,
                $this->scheduledTime,
                $this->reqMinute,
                $this->submitCrawlUrls,
                $this->downloadCrawlUrls,
                $this->duplicateTrack,
                $this->downloadUrls,
                // if empty the urls won't be filtered by processing instructions
                $this->incomingProcInstructions
            );

            // Expanded parameters:
            $paramExpanded = '';
            $calcAccu = [];
            $calcRes = 1;
            foreach ($confArray['paramExpanded'] as $gVar => $gVal) {
                $valueCount = count($gVal);
                $paramExpanded .= '
                            <tr>
                                <td>' . htmlspecialchars('&' . $gVar . '=') . '<br/>' .
                    '(' . $valueCount . ')' .
                    '</td>
                                <td nowrap="nowrap">' . nl2br(htmlspecialchars(implode(chr(10), $gVal))) . '</td>
                            </tr>
                        ';
                $calcRes *= $valueCount;
                $calcAccu[] = $valueCount;
            }
            $paramExpanded = '<table>' . $paramExpanded . '</table>';
            $paramExpanded .= 'Comb: ' . implode('*', $calcAccu) . '=' . $calcRes;

            // Options (both are optional; escaped because they are emitted into HTML)
            $optionValues = '';
            if (! empty($confArray['subCfg']['userGroups'])) {
                $optionValues .= 'User Groups: ' . htmlspecialchars((string) $confArray['subCfg']['userGroups']) . '<br/>';
            }
            if (! empty($confArray['subCfg']['procInstrFilter'])) {
                $optionValues .= 'ProcInstr: ' . htmlspecialchars((string) $confArray['subCfg']['procInstrFilter']) . '<br/>';
            }

            // Compile row:
            $content .= '
                        <tr>
                            ' . $titleClm . '
                            <td>' . $confKeyCell . '</td>
                            <td>' . nl2br(htmlspecialchars(rawurldecode(trim(str_replace('&', chr(10) . '&', GeneralUtility::implodeArrayForUrl('', $confArray['paramParsed'])))))) . '</td>
                            <td>' . $paramExpanded . '</td>
                            <td nowrap="nowrap">' . $urlList . '</td>
                            <td nowrap="nowrap">' . $optionValues . '</td>
                            <td nowrap="nowrap">' . DebugUtility::viewArray($confArray['subCfg']['procInstrParams.']) . '</td>
                        </tr>';
        }

        return $content;
    }
1497
1498
    /*****************************
1499
     *
1500
     * CLI functions
1501
     *
1502
     *****************************/
1503
1504
    /**
     * Running the functionality of the CLI (crawling URLs from queue)
     *
     * Fetches up to $countInARun queue entries, assigns them to the current process id and
     * reads them one after another. The returned integer is a bit mask built from the
     * readUrl() results and the CLI_STATUS_* constants.
     *
     * @param int $countInARun Maximum number of queue entries fetched for this run
     * @param int $sleepTime Value passed to usleep() after every processed entry
     * @param int $sleepAfterFinish Value passed to sleep() once the run is finished
     */
    public function CLI_run(int $countInARun, int $sleepTime, int $sleepAfterFinish): int
    {
        $status = 0;
        $processedItems = 0;

        // First, run hooks:
        $hookClassNames = $GLOBALS['TYPO3_CONF_VARS']['EXTCONF']['crawler']['cli_hooks'] ?? [];
        foreach ($hookClassNames as $hookClassName) {
            trigger_error(
                'This hook (crawler/cli_hooks) is deprecated since 9.1.5 and will be removed when dropping support for TYPO3 9LTS and 10LTS',
                E_USER_DEPRECATED
            );
            $hook = GeneralUtility::makeInstance($hookClassName);
            if (is_object($hook)) {
                $hook->crawler_init($this);
            }
        }

        // Clean up the queue
        $this->queueRepository->cleanupQueue();

        // Select entries:
        $queueRows = $this->queueRepository->fetchRecordsToBeCrawled($countInARun);

        // Nothing fetched means nothing was processed, so no status bit is set
        if (empty($queueRows)) {
            return $status;
        }

        $queueIds = [];
        foreach ($queueRows as $queueRow) {
            $queueIds[] = $queueRow['qid'];
        }

        $processId = $this->CLI_buildProcessId();

        //save the number of assigned queue entries to determine how many have been processed later
        $assignedCount = $this->queueRepository->updateProcessIdAndSchedulerForQueueIds($queueIds, $processId);
        $this->processRepository->updateProcessAssignItemsCount($assignedCount, $processId);

        // Abort when not every fetched entry could be assigned to this process
        if ($assignedCount !== count($queueIds)) {
            return $status | self::CLI_STATUS_ABORTED;
        }

        foreach ($queueRows as $queueRow) {
            $status |= $this->readUrl($queueRow['qid']);
            $processedItems++;

            // Just to relax the system
            usleep($sleepTime);

            // if during the start and the current read url the cli has been disable we need to return from the function
            // mark the process NOT as ended.
            if ($this->crawler->isDisabled()) {
                return $status | self::CLI_STATUS_ABORTED;
            }

            if ($this->processRepository->isProcessActive($this->CLI_buildProcessId())) {
                continue;
            }

            //possible timeout
            $this->CLI_debug('conflict / timeout (' . $this->CLI_buildProcessId() . ')');
            $status |= self::CLI_STATUS_ABORTED;
            break;
        }

        sleep($sleepAfterFinish);

        if ($processedItems > 0) {
            $status |= self::CLI_STATUS_PROCESSED;
        }

        return $status;
    }
1577
1578
    /**
     * Activate hooks
     *
     * Instantiates every class registered under the crawler "cli_hooks" configuration
     * and calls its crawler_init() method with this controller.
     *
     * @deprecated
     */
    public function CLI_runHooks(): void
    {
        $hookClassNames = $GLOBALS['TYPO3_CONF_VARS']['EXTCONF']['crawler']['cli_hooks'] ?? [];

        foreach ($hookClassNames as $hookClassName) {
            $hook = GeneralUtility::makeInstance($hookClassName);
            if (! is_object($hook)) {
                continue;
            }

            $hook->crawler_init($this);
        }
    }
1591
1592
    /**
     * Try to acquire a new process with the given id
     * also performs some auto-cleanup for orphan processes
     *
     * Active processes whose TTL lies before the current time are treated as orphans: they are
     * marked as not active and their queue entries are unassigned. A new process is only
     * registered while the number of remaining active processes is below the configured
     * "processLimit".
     *
     * @param string $id identification string for the process
     * @return bool true if the process was registered, false otherwise
     * @todo preemption might not be the most elegant way to clean up
     */
    public function CLI_checkAndAcquireNewProcess($id)
    {
        // Without a system process id no process can be registered
        $systemProcessId = getmypid();
        if (! $systemProcessId) {
            return false;
        }

        $activeProcesses = $this->processRepository->findAllActive();
        $now = $this->getCurrentTime();

        $runningProcessCount = 0;
        $orphanProcessIds = [];

        /** @var Process $process */
        foreach ($activeProcesses as $process) {
            if ($process->getTtl() < $now) {
                $orphanProcessIds[] = $process->getProcessId();
                continue;
            }

            $runningProcessCount++;
        }

        // if there are less than allowed active processes then add a new one
        $acquired = $runningProcessCount < (int) $this->extensionSettings['processLimit'];
        if ($acquired) {
            $this->processRepository->addProcess($id, $systemProcessId);
        }

        $this->processRepository->deleteProcessesMarkedAsDeleted();
        $this->processRepository->markRequestedProcessesAsNotActive($orphanProcessIds);
        $this->queueRepository->unsetProcessScheduledAndProcessIdForQueueEntries($orphanProcessIds);

        return $acquired;
    }
1636
1637
    /**
     * Release a process and the required resources
     *
     * Besides releasing the given process ids, queue entries that belong to inactive processes
     * are unassigned and inactive processes with unprocessed queue entries get their system
     * process id reset.
     *
     * @param mixed $releaseIds string with a single process-id or array with multiple process-ids
     * @return bool false if there was nothing to release, true otherwise
     * @deprecated
     */
    public function CLI_releaseProcesses($releaseIds)
    {
        // The emptiness check has to run before a scalar is wrapped into an array,
        // otherwise an empty id would become a non-empty one-element array.
        if (empty($releaseIds)) {
            //nothing to release
            return false;
        }

        if (! is_array($releaseIds)) {
            $releaseIds = [$releaseIds];
        }

        $queryBuilder = GeneralUtility::makeInstance(ConnectionPool::class)->getQueryBuilderForTable($this->tableName);

        // some kind of 2nd chance algo - this way you need at least 2 processes to have a real cleanup
        // this ensures that a single process can't mess up the entire process table

        // mark all processes as deleted which have no "waiting" queue-entires and which are not active

        // ReleaseQueueEntries
        $queryBuilder
            ->update(QueueRepository::TABLE_NAME, 'q')
            ->where(
                'q.process_id IN(SELECT p.process_id FROM tx_crawler_process as p WHERE p.active = 0)'
            )
            ->set('q.process_scheduled', 0)
            ->set('q.process_id', '')
            ->execute();

        // FIXME: Not entirely sure that this is equivalent to the previous version
        // The builder is reused for the next statement, so the SET part of the first one is dropped
        $queryBuilder->resetQueryPart('set');

        // ReleaseProcessEntries
        $queryBuilder
            ->update(ProcessRepository::TABLE_NAME)
            ->where(
                $queryBuilder->expr()->eq('active', 0),
                'process_id IN(SELECT q.process_id FROM tx_crawler_queue as q WHERE q.exec_time = 0)'
            )
            ->set('system_process_id', 0)
            ->execute();

        $this->processRepository->markRequestedProcessesAsNotActive($releaseIds);
        $this->queueRepository->unsetProcessScheduledAndProcessIdForQueueEntries($releaseIds);

        return true;
    }
1690
1691
    /**
     * Create a unique Id for the current process
     *
     * The id is generated once from the current microtime and stored on the controller;
     * subsequent calls return the stored value.
     *
     * @return string the ID
     */
    public function CLI_buildProcessId()
    {
        if ($this->processID) {
            return $this->processID;
        }

        $this->processID = GeneralUtility::shortMD5(microtime(true));

        return $this->processID;
    }
1703
1704
    /**
     * Prints a message to the stdout (only if debug-mode is enabled)
     *
     * Debug mode is controlled by the "processDebug" extension setting.
     *
     * @param string $msg the message
     * @deprecated
     * @codeCoverageIgnore
     */
    public function CLI_debug($msg): void
    {
        $debugEnabled = (bool) (int) $this->extensionSettings['processDebug'];
        if (! $debugEnabled) {
            return;
        }

        echo $msg . "\n";
        flush();
    }
1718 1
1719
    /**
     * Cleans up entries that stayed for too long in the queue. These are:
     * - processed entries older than the "cleanUpProcessedAge" setting (in days)
     * - scheduled entries older than the "cleanUpScheduledAge" setting (in days)
     *
     * @deprecated
     */
    public function cleanUpOldQueueEntries(): void
    {
        // 24*60*60 Seconds in 24 hours
        $secondsPerDay = 86400;

        $processedAgeInSeconds = $this->extensionSettings['cleanUpProcessedAge'] * $secondsPerDay;
        $scheduledAgeInSeconds = $this->extensionSettings['cleanUpScheduledAge'] * $secondsPerDay;

        $now = time();
        $processedBefore = $now - $processedAgeInSeconds;
        $scheduledBefore = $now - $scheduledAgeInSeconds;

        $condition = '(exec_time<>0 AND exec_time<' . $processedBefore . ') OR scheduled<=' . $scheduledBefore;

        $this->flushQueue($condition);
    }
1736
1737
    /**
     * Removes queue entries
     *
     * Before the entries are deleted, a flush signal is emitted once per distinct set_id
     * with the matching entries (qid, set_id) as payload.
     *
     * @param string $where SQL related filter for the entries which should be removed;
     *                      used as raw SQL, an empty value matches all entries
     *
     * @deprecated
     */
    protected function flushQueue($where = ''): void
    {
        $realWhere = (string) $where !== '' ? $where : '1=1';

        // NOTE(review): one builder instance is reused for all three statements below;
        // confirm that the query parts accumulated by repeated from() calls are harmless.
        $queryBuilder = $this->getQueryBuilder($this->tableName);

        $groups = $queryBuilder
            ->selectLiteral('DISTINCT set_id')
            ->from($this->tableName)
            ->where($realWhere)
            ->execute()
            ->fetchAll();

        foreach ($groups as $group) {
            // The value is inlined into the SQL as is, so it is forced to an integer here.
            // A named parameter is avoided on purpose because the builder is reused afterwards.
            $setId = (int) $group['set_id'];

            $subSet = $queryBuilder
                ->select('qid', 'set_id')
                ->from($this->tableName)
                ->where(
                    $realWhere,
                    $queryBuilder->expr()->eq('set_id', $setId)
                )
                ->execute()
                ->fetchAll();

            $payLoad = ['subSet' => $subSet];
            SignalSlotUtility::emitSignal(
                self::class,
                SignalSlotUtility::SIGNAL_QUEUE_ENTRY_FLUSH,
                $payLoad
            );
        }

        $queryBuilder
            ->delete($this->tableName)
            ->where($realWhere)
            ->execute();
    }
1782
1783
    /**
     * Collects the qids of not yet processed queue entries that duplicate the given entry.
     *
     * An entry is treated as a duplicate when it has the same page_id and parameters_hash,
     * has neither exec_time nor process_id set, and matches the schedule condition:
     * - $tstamp is now or in the past: any entry scheduled up to the current time; with the
     *   "enableTimeslot" extension setting enabled, entries scheduled within +/- 100 of the
     *   current time are matched as well.
     * - $tstamp is in the future: only entries scheduled for exactly $tstamp.
     *
     * @param int $tstamp
     * @param array $fieldArray the keys "page_id" and "parameters_hash" are read
     *
     * @return array list of the matching qid values
     * @deprecated
     */
    protected function getDuplicateRowsIfExist($tstamp, $fieldArray)
    {
        $currentTime = $this->getCurrentTime();

        // Resolve the query builder with the same table constant that from() uses,
        // instead of the deprecated $this->tableName property.
        $queryBuilder = GeneralUtility::makeInstance(ConnectionPool::class)
            ->getQueryBuilderForTable(QueueRepository::TABLE_NAME);
        $queryBuilder
            ->select('qid')
            ->from(QueueRepository::TABLE_NAME);

        if ($tstamp <= $currentTime) {
            // This entry is scheduled with "now".
            // !empty() keeps the truthiness check but does not raise a notice for a missing setting.
            if (!empty($this->extensionSettings['enableTimeslot'])) {
                $timeBegin = $currentTime - 100;
                $timeEnd = $currentTime + 100;
                $queryBuilder
                    ->where(
                        'scheduled BETWEEN ' . $timeBegin . ' AND ' . $timeEnd . ''
                    )
                    ->orWhere(
                        $queryBuilder->expr()->lte('scheduled', $currentTime)
                    );
            } else {
                $queryBuilder
                    ->where(
                        $queryBuilder->expr()->lte('scheduled', $currentTime)
                    );
            }
        } else {
            // Entries with a timestamp in the future need to have the same schedule time.
            // A plain else guarantees that a schedule condition is always applied.
            $queryBuilder
                ->where(
                    $queryBuilder->expr()->eq('scheduled', $tstamp)
                );
        }

        $queryBuilder
            ->andWhere('NOT exec_time')
            ->andWhere('NOT process_id')
            ->andWhere($queryBuilder->expr()->eq('page_id', $queryBuilder->createNamedParameter($fieldArray['page_id'], PDO::PARAM_INT)))
            ->andWhere($queryBuilder->expr()->eq('parameters_hash', $queryBuilder->createNamedParameter($fieldArray['parameters_hash'], PDO::PARAM_STR)));

        // Reduce the result rows to a flat list of their qid values.
        return array_column($queryBuilder->execute()->fetchAll(), 'qid');
    }
1844 7
1845 7
    /**
     * Creates an md5 fingerprint of a configuration array.
     *
     * The "paramExpanded" and "URLs" entries are removed before the array is
     * serialized, so they do not contribute to the resulting hash.
     *
     * @return string
     */
    protected function getConfigurationHash(array $configuration)
    {
        unset($configuration['paramExpanded'], $configuration['URLs']);

        $serializedConfiguration = serialize($configuration);

        return md5($serializedConfiguration);
    }
1856
1857
    /**
     * Build a URL from a Page and the Query String by delegating to a freshly created UrlService.
     * If the page has a Site configuration, it can be built by using the Site instance.
     *
     * @param int $httpsOrHttp see tx_crawler_configuration.force_ssl
     * @throws SiteNotFoundException
     * @throws InvalidRouteArgumentsException
     *
     * @deprecated Using CrawlerController::getUrlFromPageAndQueryParameters() is deprecated since 9.1.1 and will be removed in v11.x, please use UrlService->getUrlFromPageAndQueryParameters() instead.
     * @codeCoverageIgnore
     */
    protected function getUrlFromPageAndQueryParameters(int $pageId, string $queryString, ?string $alternativeBaseUrl, int $httpsOrHttp): UriInterface
    {
        return (new UrlService())->getUrlFromPageAndQueryParameters(
            $pageId,
            $queryString,
            $alternativeBaseUrl,
            $httpsOrHttp
        );
    }
1873
1874
    /**
     * Returns $reg with the values at index 1 and 2 exchanged when the value
     * at index 1 is larger than the value at index 2; otherwise $reg is returned as given.
     *
     * @deprecated
     */
    protected function swapIfFirstIsLargerThanSecond(array $reg): array
    {
        if ($reg[1] > $reg[2]) {
            // Exchange both values in one step via array destructuring.
            [$reg[1], $reg[2]] = [$reg[2], $reg[1]];
        }

        return $reg;
    }
1888
1889
    /**
     * Creates a new PageService instance on every call.
     */
    protected function getPageService(): PageService
    {
        $pageService = new PageService();

        return $pageService;
    }
1893 5
1894 5
    /**
     * Returns the current value of the maximumUrlsToCompile property.
     */
    private function getMaximumUrlsToCompile(): int
    {
        $maximumUrls = $this->maximumUrlsToCompile;

        return $maximumUrls;
    }
1898 3
1899 3
    /**
     * Returns the backend user kept on this object, taking it from
     * $GLOBALS['BE_USER'] on first use.
     *
     * @return BackendUserAuthentication
     */
    private function getBackendUser()
    {
        // Runs on every call to make sure the _cli_ user is loaded
        Bootstrap::initializeBackendAuthentication();

        if ($this->backendUser !== null) {
            return $this->backendUser;
        }

        $this->backendUser = $GLOBALS['BE_USER'];

        return $this->backendUser;
    }
1911
1912
    /**
     * Asks the ConnectionPool for a query builder for the given table.
     *
     * @return QueryBuilder
     */
    private function getQueryBuilder(string $table)
    {
        $connectionPool = GeneralUtility::makeInstance(ConnectionPool::class);

        return $connectionPool->getQueryBuilderForTable($table);
    }
1921
}
1922