Passed
Push — Cleanup/CrawlerController ( 6f130c )
by Tomas Norre
15:07 queued 17s
created

CrawlerController::getUrlsForPageId()   B

Complexity

Conditions 10
Paths 12

Size

Total Lines 60
Code Lines 32

Duplication

Lines 0
Ratio 0 %

Code Coverage

Tests 28
CRAP Score 10.3477

Importance

Changes 3
Bugs 0 Features 0
Metric Value
cc 10
eloc 32
c 3
b 0
f 0
nc 12
nop 1
dl 0
loc 60
ccs 28
cts 33
cp 0.8485
crap 10.3477
rs 7.6666

How to fix   Long Method    Complexity   

Long Method

Small methods make your code easier to understand, in particular if combined with a good name. Besides, if your method is small, finding a good name is usually much easier.

For example, if you find yourself adding comments to a method's body, this is usually a good sign to extract the commented part to a new method, and use the comment as a starting point when coming up with a good name for this new method.

Commonly applied refactorings include:

1
<?php
2
3
declare(strict_types=1);
4
5
namespace AOE\Crawler\Controller;
6
7
/***************************************************************
8
 *  Copyright notice
9
 *
10
 *  (c) 2020 AOE GmbH <[email protected]>
11
 *
12
 *  All rights reserved
13
 *
14
 *  This script is part of the TYPO3 project. The TYPO3 project is
15
 *  free software; you can redistribute it and/or modify
16
 *  it under the terms of the GNU General Public License as published by
17
 *  the Free Software Foundation; either version 3 of the License, or
18
 *  (at your option) any later version.
19
 *
20
 *  The GNU General Public License can be found at
21
 *  http://www.gnu.org/copyleft/gpl.html.
22
 *
23
 *  This script is distributed in the hope that it will be useful,
24
 *  but WITHOUT ANY WARRANTY; without even the implied warranty of
25
 *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
26
 *  GNU General Public License for more details.
27
 *
28
 *  This copyright notice MUST APPEAR in all copies of the script!
29
 ***************************************************************/
30
31
use AOE\Crawler\Configuration\ExtensionConfigurationProvider;
32
use AOE\Crawler\Converter\JsonCompatibilityConverter;
33
use AOE\Crawler\Crawler;
34
use AOE\Crawler\CrawlStrategy\CrawlStrategyFactory;
35
use AOE\Crawler\Domain\Model\Process;
36
use AOE\Crawler\Domain\Repository\ConfigurationRepository;
37
use AOE\Crawler\Domain\Repository\ProcessRepository;
38
use AOE\Crawler\Domain\Repository\QueueRepository;
39
use AOE\Crawler\QueueExecutor;
40
use AOE\Crawler\Service\ConfigurationService;
41
use AOE\Crawler\Service\UrlService;
42
use AOE\Crawler\Service\UserService;
43
use AOE\Crawler\Utility\SignalSlotUtility;
44
use AOE\Crawler\Value\QueueFilter;
45
use PDO;
46
use Psr\Http\Message\UriInterface;
47
use Psr\Log\LoggerAwareInterface;
48
use Psr\Log\LoggerAwareTrait;
49
use TYPO3\CMS\Backend\Tree\View\PageTreeView;
50
use TYPO3\CMS\Backend\Utility\BackendUtility;
51
use TYPO3\CMS\Core\Authentication\BackendUserAuthentication;
52
use TYPO3\CMS\Core\Compatibility\PublicMethodDeprecationTrait;
53
use TYPO3\CMS\Core\Compatibility\PublicPropertyDeprecationTrait;
54
use TYPO3\CMS\Core\Core\Bootstrap;
55
use TYPO3\CMS\Core\Core\Environment;
56
use TYPO3\CMS\Core\Database\Connection;
57
use TYPO3\CMS\Core\Database\ConnectionPool;
58
use TYPO3\CMS\Core\Database\Query\QueryBuilder;
59
use TYPO3\CMS\Core\Database\Query\Restriction\DeletedRestriction;
60
use TYPO3\CMS\Core\Database\QueryGenerator;
61
use TYPO3\CMS\Core\Domain\Repository\PageRepository;
62
use TYPO3\CMS\Core\Exception\SiteNotFoundException;
63
use TYPO3\CMS\Core\Imaging\Icon;
64
use TYPO3\CMS\Core\Imaging\IconFactory;
65
use TYPO3\CMS\Core\Routing\InvalidRouteArgumentsException;
66
use TYPO3\CMS\Core\Site\Entity\Site;
67
use TYPO3\CMS\Core\Type\Bitmask\Permission;
68
use TYPO3\CMS\Core\TypoScript\Parser\TypoScriptParser;
69
use TYPO3\CMS\Core\Utility\DebugUtility;
70
use TYPO3\CMS\Core\Utility\GeneralUtility;
71
use TYPO3\CMS\Core\Utility\MathUtility;
72
use TYPO3\CMS\Extbase\Object\ObjectManager;
73
74
/**
75
 * Class CrawlerController
76
 *
77
 * @package AOE\Crawler\Controller
78
 */
79
class CrawlerController implements LoggerAwareInterface
80
{
81
    use LoggerAwareTrait;
82
    use PublicMethodDeprecationTrait;
83
    use PublicPropertyDeprecationTrait;
84
85
    public const CLI_STATUS_NOTHING_PROCCESSED = 0;
86
87
    //queue not empty
88
    public const CLI_STATUS_REMAIN = 1;
89
90
    //(some) queue items were processed
91
    public const CLI_STATUS_PROCESSED = 2;
92
93
    //instance didn't finish
94
    public const CLI_STATUS_ABORTED = 4;
95
96
    public const CLI_STATUS_POLLABLE_PROCESSED = 8;
97
98
    /**
99
     * @var integer
100
     */
101
    public $setID = 0;
102
103
    /**
104
     * @var string
105
     */
106
    public $processID = '';
107
108
    /**
109
     * @var array
110
     */
111
    public $duplicateTrack = [];
112
113
    /**
114
     * @var array
115
     */
116
    public $downloadUrls = [];
117
118
    /**
119
     * @var array
120
     */
121
    public $incomingProcInstructions = [];
122
123
    /**
124
     * @var array
125
     */
126
    public $incomingConfigurationSelection = [];
127
128
    /**
129
     * @var bool
130
     */
131
    public $registerQueueEntriesInternallyOnly = false;
132
133
    /**
134
     * @var array
135
     */
136
    public $queueEntries = [];
137
138
    /**
139
     * @var array
140
     */
141
    public $urlList = [];
142
143
    /**
144
     * @var array
145
     */
146
    public $extensionSettings = [];
147
148
    /**
149
     * Mount Point
150
     *
151
     * @var bool
152
     * Todo: Check what this is used for and adjust the type hint or code, as bool doesn't match the current code.
153
     */
154
    public $MP = false;
155
156
    /**
157
     * @var string
158
     * @deprecated
159
     */
160
    protected $processFilename;
161
162
    /**
163
     * Holds the internal access mode; can be 'gui', 'cli' or 'cli_im'
164
     *
165
     * @var string
166
     * @deprecated
167
     */
168
    protected $accessMode;
169
170
    /**
171
     * @var QueueRepository
172
     */
173
    protected $queueRepository;
174
175
    /**
176
     * @var ProcessRepository
177
     */
178
    protected $processRepository;
179
180
    /**
181
     * @var ConfigurationRepository
182
     */
183
    protected $configurationRepository;
184
185
    /**
186
     * @var string
187
     */
188
    protected $tableName = 'tx_crawler_queue';
189
190
    /**
191
     * @var QueueExecutor
192
     */
193
    protected $queueExecutor;
194
195
    /**
196
     * @var int
197
     */
198
    protected $maximumUrlsToCompile = 1;
199
200
    /**
201
     * @var IconFactory
202
     */
203
    protected $iconFactory;
204
205
    /**
     * Maps method names to the deprecation message that is emitted for them.
     *
     * The property is not referenced directly in this class; it is read by the
     * PublicMethodDeprecationTrait used above, which is why static analysis
     * reports it as unused. It must not be removed while the trait is in use.
     *
     * @var string[]
     */
    private $deprecatedPublicMethods = [
        'compileUrls' => 'Using CrawlerController->compileUrls() is deprecated since 9.2.5, and will be removed in v11.x',
        'cleanUpOldQueueEntries' => 'Using CrawlerController::cleanUpOldQueueEntries() is deprecated since 9.0.1 and will be removed in v11.x, please use QueueRepository->cleanUpOldQueueEntries() instead.',
        'CLI_debug' => 'Using CrawlerController->CLI_debug() is deprecated since 9.1.3 and will be removed in v11.x',
        'CLI_releaseProcesses' => 'Using CrawlerController->CLI_releaseProcesses() is deprecated since 9.2.2 and will be removed in v11.x',
        'CLI_runHooks' => 'Using CrawlerController->CLI_runHooks() is deprecated since 9.1.5 and will be removed in v11.x',
        'getAccessMode' => 'Using CrawlerController->getAccessMode() is deprecated since 9.1.3 and will be removed in v11.x',
        'getLogEntriesForPageId' => 'Using CrawlerController->getLogEntriesForPageId() is deprecated since 9.1.5 and will be remove in v11.x',
        'getLogEntriesForSetId' => 'Using crawlerController::getLogEntriesForSetId() is deprecated since 9.0.1 and will be removed in v11.x',
        // Fixed: this message previously named getLogEntriesForPageId() (copy-paste error).
        'hasGroupAccess' => 'Using CrawlerController->hasGroupAccess() is deprecated since 9.2.2 and will be remove in v11.x, please use UserService::hasGroupAccess() instead.',
        'flushQueue' => 'Using CrawlerController::flushQueue() is deprecated since 9.0.1 and will be removed in v11.x, please use QueueRepository->flushQueue() instead.',
        'setAccessMode' => 'Using CrawlerController->setAccessMode() is deprecated since 9.1.3 and will be removed in v11.x',
        'getDisabled' => 'Using CrawlerController->getDisabled() is deprecated since 9.1.3 and will be removed in v11.x, please use Crawler->isDisabled() instead',
        'setDisabled' => 'Using CrawlerController->setDisabled() is deprecated since 9.1.3 and will be removed in v11.x, please use Crawler->setDisabled() instead',
        'getProcessFilename' => 'Using CrawlerController->getProcessFilename() is deprecated since 9.1.3 and will be removed in v11.x',
        'setProcessFilename' => 'Using CrawlerController->setProcessFilename() is deprecated since 9.1.3 and will be removed in v11.x',
        'getDuplicateRowsIfExist' => 'Using CrawlerController->getDuplicateRowsIfExist() is deprecated since 9.1.4 and will be remove in v11.x, please use QueueRepository->getDuplicateQueueItemsIfExists() instead',
        'swapIfFirstIsLargerThanSecond' => 'Using CrawlerController->swapIfFirstIsLargerThanSecond() is deprecated since 9.2.5, and will be removed in v11.x',
        'expandParameters' => 'Using CrawlerController->expandParameters() is deprecated since 9.2.5, and will be removed in v11.x',
    ];
228
229 41
    /**
     * Maps property names to the deprecation message that is emitted for them.
     *
     * Read by the PublicPropertyDeprecationTrait used above rather than by
     * code in this class.
     *
     * @var string[]
     */
    private $deprecatedPublicProperties = [
        'accessMode' => 'Using CrawlerController->accessMode is deprecated since 9.1.3 and will be removed in v11.x',
        // Fixed: this message previously named accessMode (copy-paste error).
        'processFilename' => 'Using CrawlerController->processFilename is deprecated since 9.1.3 and will be removed in v11.x',
    ];
236 41
237
    /**
238 41
     * @var BackendUserAuthentication|null
239
     */
240
    private $backendUser;
241 41
242 41
    /**
243 41
     * @var integer
244
     */
245
    private $scheduledTime = 0;
246 41
247
    /**
248
     * @var integer
249
     */
250 41
    private $reqMinute = 0;
251 41
252 41
    /**
253
     * @var bool
254
     */
255
    private $submitCrawlUrls = false;
256
257
    /**
258
     * @var bool
259 1
     */
260
    private $downloadCrawlUrls = false;
261 1
262
    /**
263
     * @var PageRepository
264
     */
265
    private $pageRepository;
266
267 1
    /**
268
     * @var Crawler
269 1
     */
270 1
    private $crawler;
271
272
    /**
273
     * @var ConfigurationService
274
     */
275
    private $configurationService;
276
277 2
    /**
278
     * @var UrlService
279 2
     */
280 1
    private $urlService;
281
282 1
    /************************************
283 1
     *
284
     * Getting URLs based on Page TSconfig
285
     *
286 2
     ************************************/
287
288
    /**
     * Wires up the collaborators of the controller and normalizes the extension settings.
     *
     * Repositories are fetched through the Extbase ObjectManager, the remaining
     * services through GeneralUtility::makeInstance(). Afterwards the extension
     * configuration is loaded and the numeric settings "countInARun",
     * "processLimit" and "maxCompileUrls" are forced into their valid ranges.
     */
    public function __construct()
    {
        $objectManager = GeneralUtility::makeInstance(ObjectManager::class);
        $crawlStrategyFactory = GeneralUtility::makeInstance(CrawlStrategyFactory::class);
        $this->queueRepository = $objectManager->get(QueueRepository::class);
        $this->processRepository = $objectManager->get(ProcessRepository::class);
        $this->configurationRepository = $objectManager->get(ConfigurationRepository::class);
        $this->pageRepository = GeneralUtility::makeInstance(PageRepository::class);
        $this->queueExecutor = GeneralUtility::makeInstance(QueueExecutor::class, $crawlStrategyFactory);
        $this->iconFactory = GeneralUtility::makeInstance(IconFactory::class);
        $this->crawler = GeneralUtility::makeInstance(Crawler::class);
        $this->configurationService = GeneralUtility::makeInstance(ConfigurationService::class);
        $this->urlService = GeneralUtility::makeInstance(UrlService::class);

        $this->processFilename = Environment::getVarPath() . '/lock/tx_crawler.proc';

        /** @var ExtensionConfigurationProvider $configurationProvider */
        $configurationProvider = GeneralUtility::makeInstance(ExtensionConfigurationProvider::class);
        $settings = $configurationProvider->getExtensionConfiguration();
        $this->extensionSettings = is_array($settings) ? $settings : [];

        // The settings array may be empty (see fallback above), so every key is
        // read with a default instead of triggering an "undefined index" notice.
        if (MathUtility::convertToPositiveInteger($this->extensionSettings['countInARun'] ?? 0) === 0) {
            $this->extensionSettings['countInARun'] = 100;
        }

        $this->extensionSettings['processLimit'] = MathUtility::forceIntegerInRange(
            $this->extensionSettings['processLimit'] ?? 1,
            1,
            99,
            1
        );
        $this->setMaximumUrlsToCompile(
            MathUtility::forceIntegerInRange(
                $this->extensionSettings['maxCompileUrls'] ?? 10000,
                1,
                1000000000,
                10000
            )
        );
    }
316
317 12
    /**
     * Stores the upper limit of URLs that is handed to the URL compilation.
     */
    public function setMaximumUrlsToCompile(int $maximumUrlsToCompile): void
    {
        $this->maximumUrlsToCompile = $maximumUrlsToCompile;
    }
321
322
    /**
     * Returns the internal access mode ('gui', 'cli' or 'cli_im').
     *
     * @return string
     * @deprecated
     */
    public function getAccessMode()
    {
        return $this->accessMode;
    }
332
333 8
    /**
     * Stores the internal access mode.
     *
     * @param string $accessMode
     * @deprecated
     */
    public function setAccessMode($accessMode): void
    {
        $this->accessMode = $accessMode;
    }
341 7
342 3
    /**
     * Set disabled status to prevent processes from being processed.
     *
     * When $disabled is truthy the process file is written with the content
     * 'disabled'; otherwise an existing process file is deleted.
     *
     * @deprecated
     */
    public function setDisabled(?bool $disabled = true): void
    {
        if ($disabled) {
            GeneralUtility::writeFile($this->processFilename, 'disabled');
            return;
        }

        if (is_file($this->processFilename)) {
            unlink($this->processFilename);
        }
    }
354
355
    /**
     * Get disable status: true when the process file exists.
     *
     * @deprecated
     */
    public function getDisabled(): bool
    {
        return is_file($this->processFilename);
    }
363
364
    /**
     * Overrides the path of the process file.
     *
     * @param string $filenameWithPath
     * @deprecated
     */
    public function setProcessFilename($filenameWithPath): void
    {
        $this->processFilename = $filenameWithPath;
    }
372
373
    /**
     * Returns the path of the process file.
     *
     * @return string
     * @deprecated
     */
    public function getProcessFilename()
    {
        return $this->processFilename;
    }
381
382
    /**
     * Replaces the extension settings
     * (unserialized counterpart of $TYPO3_CONF_VARS['EXT']['extConf']['crawler']).
     */
    public function setExtensionSettings(array $extensionSettings): void
    {
        $this->extensionSettings = $extensionSettings;
    }
389
390 4
    /**
     * Check if the given page should be crawled.
     *
     * A page is skipped when it is hidden (unless the "crawlHiddenPages" setting
     * is enabled), when its doktype is one of 3, 4, 199, 254 or 255, when its
     * doktype is listed in a registered "excludeDoktype" entry, or when a
     * "pageVeto" hook returns anything other than false.
     *
     * @param array $pageRow Page record; the "hidden" and "doktype" columns are evaluated
     * @return false|string false if the page should be crawled (not excluded), otherwise the skip message
     */
    public function checkIfPageShouldBeSkipped(array $pageRow)
    {
        // Missing keys are treated like falsy values instead of raising notices.
        if (empty($this->extensionSettings['crawlHiddenPages']) && ! empty($pageRow['hidden'])) {
            return 'Because page is hidden';
        }

        // inList() compares strings, so normalize the doktype once.
        $doktype = (string) ($pageRow['doktype'] ?? '');

        if (GeneralUtility::inList('3,4,199,254,255', $doktype)) {
            return 'Because doktype is not allowed';
        }

        foreach ($GLOBALS['TYPO3_CONF_VARS']['EXTCONF']['crawler']['excludeDoktype'] ?? [] as $key => $doktypeList) {
            if (GeneralUtility::inList($doktypeList, $doktype)) {
                return 'Doktype was excluded by "' . $key . '"';
            }
        }

        // veto hook
        foreach ($GLOBALS['TYPO3_CONF_VARS']['EXTCONF']['crawler']['pageVeto'] ?? [] as $key => $func) {
            $params = [
                'pageRow' => $pageRow,
            ];
            // expects "false" if page is ok and "true" or a skipMessage if this page should _not_ be crawled
            $veto = GeneralUtility::callUserFunction($func, $params, $this);
            if ($veto === false) {
                continue;
            }
            if (is_string($veto)) {
                return $veto;
            }
            // Hooks may be registered with numeric keys; with strict_types enabled
            // htmlspecialchars() rejects an int, hence the explicit cast.
            return 'Veto from hook "' . htmlspecialchars((string) $key) . '"';
        }

        return false;
    }
429
430
    /**
     * Wrapper method for getUrlsForPageId().
     * It returns an array of configurations and no urls!
     *
     * The page is rejected (empty result, $skipMessage filled) when its uid is
     * not an integer or when checkIfPageShouldBeSkipped() reports a reason.
     *
     * @param array $pageRow Page record with at least dok-type and uid columns.
     * @param string $skipMessage Receives the reason for skipping, or '' when the page is processed
     * @return array
     * @see getUrlsForPageId()
     */
    public function getUrlsForPageRow(array $pageRow, &$skipMessage = '')
    {
        $pageUid = $pageRow['uid'];
        if (! is_int($pageUid)) {
            $skipMessage = 'PageUid ' . $pageUid . ' was not an integer';
            return [];
        }

        $skipReason = $this->checkIfPageShouldBeSkipped($pageRow);
        if ($skipReason !== false) {
            $skipMessage = $skipReason;
            return [];
        }

        $configurations = $this->getUrlsForPageId($pageUid);
        $skipMessage = '';

        return $configurations;
    }
457
458 2
    /**
     * Creates a list of URLs from input array (and submits them to queue if asked for)
     * See Web > Info module script + "indexed_search"'s crawler hook-client using this!
     *
     * @param array $vv Information about URLs from pageRow to crawl.
     * @param array $pageRow Page row
     * @param int $scheduledTime Unix time to schedule indexing to, typically time()
     * @param int $reqMinute Number of requests per minute (creates the interleave between requests)
     * @param bool $submitCrawlUrls If set, submits the URLs to queue
     * @param bool $downloadCrawlUrls If set (and submitcrawlUrls is false) will fill $downloadUrls with entries)
     * @param array $duplicateTrack Array which is passed by reference and contains the an id per url to secure we will not crawl duplicates
     * @param array $downloadUrls Array which will be filled with URLS for download if flag is set.
     * @param array $incomingProcInstructions Array of processing instructions
     * @return string List of URLs (meant for display in backend module)
     */
    public function urlListFromUrlArray(
        array $vv,
        array $pageRow,
        $scheduledTime,
        $reqMinute,
        $submitCrawlUrls,
        $downloadCrawlUrls,
        array &$duplicateTrack,
        array &$downloadUrls,
        array $incomingProcInstructions
    ) {
        if (! is_array($vv['URLs'] ?? null)) {
            return 'ERROR - no URL generated';
        }
        $urlLog = [];
        $pageId = (int) $pageRow['uid'];
        $configurationHash = $this->getConfigurationHash($vv);
        $skipInnerCheck = $this->queueRepository->noUnprocessedQueueEntriesForPageWithConfigurationHashExist($pageId, $configurationHash);

        $procInstrFilter = $vv['subCfg']['procInstrFilter'] ?? '';

        // The processing-instruction filter does not depend on the individual URL,
        // so it is evaluated once; a rejected configuration yields an empty list.
        if ($vv['URLs'] === [] || ! $this->drawURLs_PIfilter($procInstrFilter, $incomingProcInstructions)) {
            return '';
        }

        $urlService = new UrlService();

        // Seconds between two requests; a non-positive rate disables the
        // interleave instead of dividing by zero.
        $secondsPerRequest = $reqMinute > 0 ? 60 / $reqMinute : 0;

        foreach ($vv['URLs'] as $urlQuery) {
            $url = (string) $urlService->getUrlFromPageAndQueryParameters(
                $pageId,
                $urlQuery,
                $vv['subCfg']['baseUrl'] ?? null,
                $vv['subCfg']['force_ssl'] ?? 0
            );

            // Create key by which to determine unique-ness:
            $uKey = $url . '|' . ($vv['subCfg']['userGroups'] ?? '') . '|' . $procInstrFilter;

            if (isset($duplicateTrack[$uKey])) {
                // if the url key is registered just display it and do not resubmit it
                $urlLog[] = '<em><span class="text-muted">' . htmlspecialchars($url) . '</span></em>';
            } else {
                // Scheduled time, rounded down to the full minute (used for display):
                $schTime = $scheduledTime + round(count($duplicateTrack) * $secondsPerRequest);
                $schTime = intval($schTime / 60) * 60;
                $formattedDate = BackendUtility::datetime($schTime);
                $this->urlList[] = '[' . $formattedDate . '] ' . $url;
                $urlList = '[' . $formattedDate . '] ' . htmlspecialchars($url);

                // Submit for crawling!
                if ($submitCrawlUrls) {
                    $added = $this->addUrl(
                        $pageId,
                        $url,
                        $vv['subCfg'],
                        $scheduledTime,
                        $configurationHash,
                        $skipInnerCheck
                    );
                    if ($added === false) {
                        $urlList .= ' (URL already existed)';
                    }
                } elseif ($downloadCrawlUrls) {
                    $downloadUrls[$url] = $url;
                }
                $urlLog[] = $urlList;
            }
            $duplicateTrack[$uKey] = true;
        }

        return implode('<br>', $urlLog);
    }
542
543
    /**
     * Returns true if input processing instruction is among registered ones.
     *
     * An empty list of incoming processing instructions accepts everything.
     *
     * @param string $piString PI to test (comma separated list)
     * @param array $incomingProcInstructions Processing instructions
     * @return bool
     */
    public function drawURLs_PIfilter($piString, array $incomingProcInstructions)
    {
        if ($incomingProcInstructions === []) {
            return true;
        }

        $isListed = false;
        foreach ($incomingProcInstructions as $instruction) {
            if (GeneralUtility::inList($piString, $instruction)) {
                $isListed = true;
                break;
            }
        }

        return $isListed;
    }
563
564 1
    /**
     * Returns the page TSconfig for the given page, after passing it through
     * the registered "getPageTSconfigForId" hooks.
     *
     * When $this->MP is set, the TSconfig is read from the page id found after
     * the "-" in $this->MP instead of from $id.
     *
     * @param int $id Page uid
     * @return array Page TSconfig, possibly altered by hooks
     */
    public function getPageTSconfigForId(int $id): array
    {
        if (! $this->MP) {
            $pageTSconfig = BackendUtility::getPagesTSconfig($id);
        } else {
            // $this->MP is documented as bool but is split like a "<a>-<b>" string here.
            // The cast keeps explode() from raising a TypeError under strict_types,
            // and a missing second segment falls back to 0 instead of a notice.
            $mountPointParts = explode('-', (string) $this->MP);
            $mountPointId = (int) ($mountPointParts[1] ?? 0);
            $pageTSconfig = BackendUtility::getPagesTSconfig($mountPointId);
        }

        // Call a hook to alter configuration
        $hooks = $GLOBALS['TYPO3_CONF_VARS']['EXTCONF']['crawler']['getPageTSconfigForId'] ?? [];
        if (is_array($hooks)) {
            $params = [
                'pageId' => $id,
                'pageTSConfig' => &$pageTSconfig,
            ];
            foreach ($hooks as $userFunc) {
                GeneralUtility::callUserFunction($userFunc, $params, $this);
            }
        }

        return $pageTSconfig;
    }
586
587
    /**
     * This methods returns an array of configurations.
     * Adds no urls!
     *
     * Configurations are collected from page TSconfig first and then from the
     * tx_crawler_configuration records up the rootline. A record is only used
     * when the backend user may access it, when it applies to this page, when
     * no configuration with the same name exists yet and when the page is not
     * excluded by the record. Finally the "processUrls" hooks may alter the result.
     *
     * @param int $pageId Page uid
     * @return array Configurations keyed by configuration name
     */
    public function getUrlsForPageId(int $pageId): array
    {
        // Get page TSconfig for page ID
        $pageTSconfig = $this->getPageTSconfigForId($pageId);

        $res = [];

        // Fetch Crawler Configuration from pageTSConfig
        $res = $this->configurationService->getConfigurationFromPageTS($pageTSconfig, $pageId, $res);

        // Get configuration from tx_crawler_configuration records up the rootline
        $crawlerConfigurations = $this->configurationRepository->getCrawlerConfigurationRecordsFromRootLine($pageId);
        foreach ($crawlerConfigurations as $configurationRecord) {
            // check access to the configuration record
            $hasAccess = empty($configurationRecord['begroups'])
                || $this->getBackendUser()->isAdmin()
                || UserService::hasGroupAccess($this->getBackendUser()->user['usergroup_cached_list'], $configurationRecord['begroups']);
            if (! $hasAccess) {
                continue;
            }

            // Cast once: a NULL column value would make string functions fail under strict_types.
            $pidsOnly = (string) $configurationRecord['pidsonly'];
            $pidOnlyList = implode(',', GeneralUtility::trimExplode(',', $pidsOnly, true));

            // process configuration if it is not page-specific or if the specific page is the current page:
            // TODO: Check if $pidOnlyList can be kept as Array instead of imploded
            if ($pidsOnly !== '' && ! GeneralUtility::inList($pidOnlyList, (string) $pageId)) {
                continue;
            }

            $key = $configurationRecord['name'];

            // don't overwrite previously defined paramSets
            if (isset($res[$key])) {
                continue;
            }

            /* @var $TSparserObject TypoScriptParser */
            $TSparserObject = GeneralUtility::makeInstance(TypoScriptParser::class);
            $TSparserObject->parse($configurationRecord['processing_instruction_parameters_ts']);

            $subCfg = [
                'procInstrFilter' => $configurationRecord['processing_instruction_filter'],
                'procInstrParams.' => $TSparserObject->setup,
                'baseUrl' => $configurationRecord['base_url'],
                'force_ssl' => (int) $configurationRecord['force_ssl'],
                'userGroups' => $configurationRecord['fegroups'],
                'exclude' => $configurationRecord['exclude'],
                'key' => $key,
            ];

            if (in_array($pageId, $this->expandExcludeString($subCfg['exclude']), true)) {
                continue;
            }

            $paramParsed = GeneralUtility::explodeUrl2Array($configurationRecord['configuration']);
            $paramExpanded = $this->configurationService->expandParameters($paramParsed, $pageId);

            $res[$key] = [
                'subCfg' => $subCfg,
                'paramParsed' => $paramParsed,
                'paramExpanded' => $paramExpanded,
                'URLs' => $this->urlService->compileUrls($paramExpanded, ['?id=' . $pageId], $this->getMaximumUrlsToCompile()),
                'origin' => 'tx_crawler_configuration_' . $configurationRecord['uid'],
            ];
        }

        foreach ($GLOBALS['TYPO3_CONF_VARS']['EXTCONF']['crawler']['processUrls'] ?? [] as $func) {
            $params = [
                'res' => &$res,
            ];
            GeneralUtility::callUserFunction($func, $params, $this);
        }

        return $res;
    }
652 1
653 1
    /**
     * Collects the names of the crawler configurations that are available for a page branch.
     *
     * Names are gathered from two places:
     * - the keys of the "tx_crawler.crawlerCfg.paramSets." section in the page TSconfig of $rootid
     *   (only entries whose value is an array; a trailing dot is stripped from the key), and
     * - the "name" column of all tx_crawler_configuration records whose pid is a page of the
     *   root line of $rootid or a page returned by the page tree below $rootid.
     *
     * The result is a plain list; a name found in both places appears twice.
     *
     * TODO: Write Functional Tests
     *
     * @param int $rootid UID of the page the branch starts at
     * @param int $depth Depth handed to PageTreeView::getTree() when collecting sub pages
     * @return array List of configuration names
     */
    public function getConfigurationsForBranch(int $rootid, int $depth): array
    {
        $configurationNames = [];

        // Part 1: parameter sets declared in page TSconfig.
        $paramSets = $this->getPageTSconfigForId($rootid)['tx_crawler.']['crawlerCfg.']['paramSets.'] ?? [];
        foreach ($paramSets as $setKey => $setValue) {
            if (is_array($setValue)) {
                // Array-valued TypoScript keys end with a dot; the configuration name is the key without it.
                $configurationNames[] = substr($setKey, -1) === '.' ? substr($setKey, 0, -1) : $setKey;
            }
        }

        // Part 2: configuration records stored on the root line or inside the branch.
        $pageIds = [];
        foreach (BackendUtility::BEgetRootLine($rootid) as $rootLinePage) {
            $pageIds[] = $rootLinePage['uid'];
        }

        /* @var PageTreeView $pageTree */
        $pageTree = GeneralUtility::makeInstance(PageTreeView::class);
        $permsClause = $this->getBackendUser()->getPagePermsClause(Permission::PAGE_SHOW);
        // Only pages the backend user is allowed to see are added from the tree.
        $pageTree->init(empty($permsClause) ? '' : ('AND ' . $permsClause));
        $pageTree->getTree($rootid, $depth, '');
        foreach ($pageTree->tree as $treeNode) {
            $pageIds[] = $treeNode['row']['uid'];
        }

        $queryBuilder = $this->getQueryBuilder('tx_crawler_configuration');
        $pidConstraint = $queryBuilder->expr()->in(
            'pid',
            $queryBuilder->createNamedParameter($pageIds, Connection::PARAM_INT_ARRAY)
        );
        $result = $queryBuilder
            ->select('name')
            ->from('tx_crawler_configuration')
            ->where($pidConstraint)
            ->execute();

        while ($record = $result->fetch()) {
            $configurationNames[] = $record['name'];
        }

        return $configurationNames;
    }
696
697
    /**
     * Tells whether a user, identified by a list of group UIDs, may access an item.
     * (e.g. get the group list of the current logged in user from $GLOBALS['TSFE']->gr_list)
     *
     * An empty access list means the item is not restricted, so access is always granted.
     *
     * @param string $groupList Comma-separated list of (fe_)group UIDs from a user
     * @param string $accessList Comma-separated list of (fe_)group UIDs of the item to access
     * @return bool TRUE if at least one of the users group UIDs is in the access list or the access list is empty
     * @see \TYPO3\CMS\Frontend\Page\PageRepository::getMultipleGroupsWhereClause()
     * @deprecated
     * @codeCoverageIgnore
     */
    public function hasGroupAccess($groupList, $accessList)
    {
        $accessGranted = empty($accessList);

        if (! $accessGranted) {
            $userGroupUids = GeneralUtility::intExplode(',', $groupList);
            foreach ($userGroupUids as $userGroupUid) {
                if (GeneralUtility::inList($accessList, $userGroupUid)) {
                    // One matching group is enough.
                    $accessGranted = true;
                    break;
                }
            }
        }

        return $accessGranted;
    }
720
721 9
    /**
     * Will expand the parameters configuration to individual values. This follows a certain syntax of the value of each parameter.
     * Syntax of values:
     * - Basically: If the value is wrapped in [...] it will be expanded according to the following syntax, otherwise the value is taken literally
     * - Configuration is split by "|" and the parts are processed individually and finally added together
     * - For each configuration part:
     *         - "[int]-[int]" = Integer range, will be expanded to all values in between, values included, starting from low to high (max. 1000). Example "1-34" or "-40--30"
     *         - "_TABLE:[TCA table name];[_PID:[optional page id, default is current page]];[_ENABLELANG:1]" = Look up of table records from PID, filtering out deleted records. Example "_TABLE:tt_content; _PID:123"
     *        _ENABLELANG:1 picks only original records without their language overlays
     *        Further optional sub parts read by the code: _RECURSIVE (tree depth below the PID), _PIDFIELD (default "pid"),
     *        _FIELD (default "uid"), _WHERE (additional where clause) and _ADDTABLE (additional FROM part)
     *         - Default: Literal value
     *
     * After every part the hook
     * $GLOBALS['TYPO3_CONF_VARS']['SC_OPTIONS']['crawler/class.tx_crawler_lib.php']['expandParameters']
     * is called (when registered) and may change the parameter array by reference.
     *
     * Optional sub parts and optional TCA / hook configuration are read defensively, so a missing
     * entry falls back to its default without raising an "undefined index" notice/warning.
     *
     * @param array $paramArray Array with key (GET var name) and values (value of GET var which is configuration for expansion)
     * @param integer $pid Current page ID
     * @return array Same keys as $paramArray, each value being an array of expanded values
     * @deprecated
     *
     * TODO: Write Functional Tests
     */
    public function expandParameters($paramArray, $pid)
    {
        // Traverse parameter names:
        foreach ($paramArray as $p => $v) {
            $v = trim($v);

            // If value is encapsulated in square brackets it means there are some ranges of values to find, otherwise the value is literal
            if (strpos($v, '[') === 0 && substr($v, -1) === ']') {
                // So, find the value inside brackets and reset the paramArray value as an array.
                $v = substr($v, 1, -1);
                $paramArray[$p] = [];

                // Explode parts and traverse them:
                $parts = explode('|', $v);
                foreach ($parts as $pV) {
                    // Look for integer range: (fx. 1-34 or -40--30 // reads minus 40 to minus 30)
                    if (preg_match('/^(-?[0-9]+)\s*-\s*(-?[0-9]+)$/', trim($pV), $reg)) {
                        $reg = $this->swapIfFirstIsLargerThanSecond($reg);

                        // Traverse range, add values:
                        // Limit to size of range!
                        $runAwayBrake = 1000;
                        for ($a = $reg[1]; $a <= $reg[2]; $a++) {
                            $paramArray[$p][] = $a;
                            $runAwayBrake--;
                            if ($runAwayBrake <= 0) {
                                break;
                            }
                        }
                    } elseif (strpos(trim($pV), '_TABLE:') === 0) {
                        // Parse parameters ("_KEY:value" pairs separated by ";"):
                        $subpartParams = [];
                        foreach (GeneralUtility::trimExplode(';', $pV) as $spV) {
                            $keyAndValue = GeneralUtility::trimExplode(':', $spV);
                            // A sub part without ":" has no value; store null instead of reading a missing offset.
                            $subpartParams[$keyAndValue[0]] = $keyAndValue[1] ?? null;
                        }

                        $tableName = $subpartParams['_TABLE'] ?? '';

                        // Table exists:
                        if (isset($GLOBALS['TCA'][$tableName])) {
                            $lookUpPid = isset($subpartParams['_PID']) ? intval($subpartParams['_PID']) : intval($pid);
                            $recursiveDepth = isset($subpartParams['_RECURSIVE']) ? intval($subpartParams['_RECURSIVE']) : 0;
                            $pidField = isset($subpartParams['_PIDFIELD']) ? trim($subpartParams['_PIDFIELD']) : 'pid';
                            // NOTE(review): _WHERE and _ADDTABLE are passed to the query unescaped; they come
                            // from the crawler configuration and are presumably editable by trusted users only.
                            $where = $subpartParams['_WHERE'] ?? '';
                            $addTable = $subpartParams['_ADDTABLE'] ?? '';

                            $fieldName = ! empty($subpartParams['_FIELD']) ? $subpartParams['_FIELD'] : 'uid';
                            // Only "uid" or a column configured in TCA may be selected.
                            if ($fieldName === 'uid' || ! empty($GLOBALS['TCA'][$tableName]['columns'][$fieldName])) {
                                $queryBuilder = $this->getQueryBuilder($tableName);

                                if ($recursiveDepth > 0) {
                                    /** @var QueryGenerator $queryGenerator */
                                    $queryGenerator = GeneralUtility::makeInstance(QueryGenerator::class);
                                    $pidList = $queryGenerator->getTreeList($lookUpPid, $recursiveDepth, 0, 1);
                                    $pidArray = GeneralUtility::intExplode(',', $pidList);
                                } else {
                                    $pidArray = [(string) $lookUpPid];
                                }

                                // Only the "deleted" restriction is applied to the lookup.
                                $queryBuilder->getRestrictions()
                                    ->removeAll()
                                    ->add(GeneralUtility::makeInstance(DeletedRestriction::class));

                                $queryBuilder
                                    ->select($fieldName)
                                    ->from($tableName)
                                    ->where(
                                        $queryBuilder->expr()->in($pidField, $queryBuilder->createNamedParameter($pidArray, Connection::PARAM_INT_ARRAY)),
                                        $where
                                    );

                                if (! empty($addTable)) {
                                    // TODO: Check if this works as intended!
                                    $queryBuilder->add('from', $addTable);
                                }
                                $transOrigPointerField = $GLOBALS['TCA'][$tableName]['ctrl']['transOrigPointerField'] ?? null;

                                if (! empty($subpartParams['_ENABLELANG']) && $transOrigPointerField) {
                                    // Keep only records that are not a translation of another record.
                                    $queryBuilder->andWhere(
                                        $queryBuilder->expr()->lte(
                                            $transOrigPointerField,
                                            0
                                        )
                                    );
                                }

                                $statement = $queryBuilder->execute();

                                // Index by field value so every value is collected only once.
                                $rows = [];
                                while ($row = $statement->fetch()) {
                                    $rows[$row[$fieldName]] = $row;
                                }

                                $paramArray[$p] = array_merge($paramArray[$p], array_keys($rows));
                            }
                        }
                    } else {
                        // Just add value:
                        $paramArray[$p][] = $pV;
                    }

                    // Hook for processing own expandParameters place holder
                    $expandParametersHooks = $GLOBALS['TYPO3_CONF_VARS']['SC_OPTIONS']['crawler/class.tx_crawler_lib.php']['expandParameters'] ?? null;
                    if (is_array($expandParametersHooks)) {
                        $_params = [
                            'pObj' => &$this,
                            'paramArray' => &$paramArray,
                            'currentKey' => $p,
                            'currentValue' => $pV,
                            'pid' => $pid,
                        ];
                        foreach ($expandParametersHooks as $_funcRef) {
                            GeneralUtility::callUserFunction($_funcRef, $_params, $this);
                        }
                    }
                }

                // Make unique set of values and sort array by key:
                $paramArray[$p] = array_unique($paramArray[$p]);
                ksort($paramArray);
            } else {
                // Set the literal value as only value in array:
                $paramArray[$p] = [$v];
            }
        }

        return $paramArray;
    }
869
870
    /**
     * Builds URLs from a parameter array (output of expandParameters()).
     * The number of URLs will be the multiplication of the number of parameter values for each key
     *
     * The work is delegated to the URL service; the upper bound for the number of URLs is the
     * value returned by getMaximumUrlsToCompile().
     *
     * @param array $paramArray Output of expandParameters(): Array with keys (GET var names) and for each an array of values
     * @param array $urls URLs accumulated in this array (for recursion)
     * @return array Result of the URL service for the given input
     * @deprecated
     */
    public function compileUrls(array $paramArray, array $urls): array
    {
        $maximumUrls = $this->getMaximumUrlsToCompile();

        return $this->urlService->compileUrls($paramArray, $urls, $maximumUrls);
    }
882
883
    /************************************
884
     *
885
     * Crawler log
886
     *
887
     ************************************/
888
889
    /**
     * Return array of records from crawler queue for input page ID
     *
     * The records are ordered by "scheduled", newest first. The filter narrows the result to
     * entries that were not executed yet ("pending", exec_time = 0) or that were executed
     * ("finished", exec_time > 0); any other filter value returns all entries of the page.
     *
     * @param integer $id Page ID for which to look up log entries.
     * @param QueueFilter $queueFilter Filter that is compared against "pending" and "finished"
     * @param boolean $doFlush If TRUE, the queue is flushed through the queue repository (using the filter) before the entries are selected
     * @param boolean $doFullFlush Not used by this method; kept so existing callers keep working
     * @param integer $itemsPerPage Limit the amount of entries per page default is 10; a value of 0 or less disables the limit
     * @return array
     *
     * @deprecated
     */
    public function getLogEntriesForPageId($id, QueueFilter $queueFilter, $doFlush = false, $doFullFlush = false, $itemsPerPage = 10)
    {
        $queryBuilder = GeneralUtility::makeInstance(ConnectionPool::class)->getQueryBuilderForTable($this->tableName);
        $queryBuilder
            ->select('*')
            ->from($this->tableName)
            ->where(
                $queryBuilder->expr()->eq('page_id', $queryBuilder->createNamedParameter($id, PDO::PARAM_INT))
            )
            ->orderBy('scheduled', 'DESC');

        // The filter object is compared loosely with the plain filter names.
        // NOTE(review): this presumably relies on QueueFilter being convertible to string - confirm.
        switch ($queueFilter) {
            case 'pending':
                $queryBuilder->andWhere($queryBuilder->expr()->eq('exec_time', 0));
                break;
            case 'finished':
                $queryBuilder->andWhere($queryBuilder->expr()->gt('exec_time', 0));
                break;
        }

        if ($doFlush) {
            // Flushing does not end the method: the (remaining) entries are still selected below.
            $this->queueRepository->flushQueue($queueFilter);
        }
        if ($itemsPerPage > 0) {
            $queryBuilder
                ->setMaxResults((int) $itemsPerPage);
        }

        return $queryBuilder->execute()->fetchAll();
    }
937
938 6
    /**
     * Return array of records from crawler queue for input set ID
     *
     * Without flushing, the matching queue records are returned ordered by "scheduled", newest
     * first. With $doFlush the queue is flushed instead and an empty array is returned.
     *
     * @param int $set_id Set ID for which to look up log entries.
     * @param string $filter Filter: "all" => all entries, "pending" => all that is not yet run, "finished" => all complete ones
     * @param bool $doFlush If TRUE, then entries selected at DELETED(!) instead of selected!
     * @param bool $doFullFlush If TRUE (together with $doFlush), the flush is done with an empty condition instead of the filter/set condition
     * @param int $itemsPerPage Limit the amount of entries per page default is 10
     * @return array
     *
     * @deprecated
     */
    public function getLogEntriesForSetId(int $set_id, string $filter = '', bool $doFlush = false, bool $doFullFlush = false, int $itemsPerPage = 10)
    {
        $logQuery = GeneralUtility::makeInstance(ConnectionPool::class)->getQueryBuilderForTable($this->tableName);
        $setConstraint = $logQuery->expr()->eq('set_id', $logQuery->createNamedParameter($set_id, PDO::PARAM_INT));
        $logQuery
            ->select('*')
            ->from($this->tableName)
            ->where($setConstraint)
            ->orderBy('scheduled', 'DESC');

        $expressionBuilder = GeneralUtility::makeInstance(ConnectionPool::class)
            ->getConnectionForTable($this->tableName)
            ->getExpressionBuilder();

        // The AND-composite collects the conditions for a possible flush: every add() call
        // appends to it, so the filter condition and the set condition end up combined with AND.
        $flushCondition = $expressionBuilder->andX();

        if ($filter === 'pending') {
            $logQuery->andWhere($logQuery->expr()->eq('exec_time', 0));
            $flushCondition->add($expressionBuilder->eq('exec_time', 0));
        } elseif ($filter === 'finished') {
            $logQuery->andWhere($logQuery->expr()->gt('exec_time', 0));
            $flushCondition->add($expressionBuilder->gt('exec_time', 0));
        }

        if ($doFlush) {
            $addWhere = $flushCondition->add($expressionBuilder->eq('set_id', (int) $set_id));
            $this->flushQueue($doFullFlush ? '' : $addWhere);

            return [];
        }

        if ($itemsPerPage > 0) {
            $logQuery->setMaxResults((int) $itemsPerPage);
        }

        return $logQuery->execute()->fetchAll();
    }
990
991
    /**
     * Adding call back entries to log (called from hooks typically, see indexed search class "class.crawler.php"
     *
     * The call back reference is stored inside the JSON encoded parameters under the key
     * "_CALLBACKOBJ" and the record is inserted into tx_crawler_queue as not yet executed.
     *
     * @param integer $setId Set ID
     * @param array $params Parameters to pass to call back function; anything that is not an array is replaced by an empty array
     * @param string $callBack Call back object reference, eg. 'EXT:indexed_search/class.crawler.php:&tx_indexedsearch_crawler'
     * @param integer $page_id Page ID to attach it to
     * @param integer $schedule Time at which to activate; 0 (or empty) means the current time
     */
    public function addQueueEntry_callBack($setId, $params, $callBack, $page_id = 0, $schedule = 0): void
    {
        $callBackParameters = is_array($params) ? $params : [];
        $callBackParameters['_CALLBACKOBJ'] = $callBack;

        $queueConnection = GeneralUtility::makeInstance(ConnectionPool::class)
            ->getConnectionForTable('tx_crawler_queue');

        $encodedParameters = json_encode($callBackParameters);

        $scheduledAt = (int) $schedule;
        if (! $scheduledAt) {
            // No explicit schedule given: activate right away.
            $scheduledAt = $this->getCurrentTime();
        }

        $queueConnection->insert(
            'tx_crawler_queue',
            [
                'page_id' => (int) $page_id,
                'parameters' => $encodedParameters,
                'scheduled' => $scheduledAt,
                'exec_time' => 0,
                'set_id' => (int) $setId,
                'result_data' => '',
            ]
        );
    }
1020
1021
    /************************************
1022
     *
1023
     * URL setting
1024
     *
1025
     ************************************/
1026 6
1027 6
    /**
     * Setting a URL for crawling:
     *
     * Builds the queue record for the URL and either registers it only in memory
     * (when $this->registerQueueEntriesInternallyOnly is set) or inserts it into tx_crawler_queue.
     * Before inserting, an equal entry is looked up through the queue repository unless
     * $skipInnerDuplicationCheck is set; if one is found nothing is inserted. A signal is
     * emitted for both outcomes (URL added / duplicate found).
     *
     * Entries of $subCfg that are read: "userGroups", "procInstrFilter", "procInstrParams." and
     * "key". All of them are treated as optional; a missing entry behaves like an empty one.
     *
     * @param integer $id Page ID
     * @param string $url Complete URL
     * @param array $subCfg Sub configuration array (from TS config)
     * @param integer $tstamp Scheduled-time
     * @param string $configurationHash (optional) configuration hash
     * @param bool $skipInnerDuplicationCheck (optional) skip inner duplication check
     * @return bool TRUE only if a new record was inserted into the queue table
     */
    public function addUrl(
        $id,
        $url,
        array $subCfg,
        $tstamp,
        $configurationHash = '',
        $skipInnerDuplicationCheck = false
    ) {
        $urlAdded = false;
        $rows = [];

        // Creating parameters:
        $parameters = [
            'url' => $url,
        ];

        // fe user group simulation:
        $uGs = implode(',', array_unique(GeneralUtility::intExplode(',', $subCfg['userGroups'] ?? '', true)));
        if ($uGs) {
            $parameters['feUserGroupList'] = $uGs;
        }

        // Setting processing instructions
        $parameters['procInstructions'] = GeneralUtility::trimExplode(',', $subCfg['procInstrFilter'] ?? '');
        if (is_array($subCfg['procInstrParams.'] ?? null)) {
            $parameters['procInstrParams'] = $subCfg['procInstrParams.'];
        }

        // Compile value array:
        $parameters_serialized = json_encode($parameters);
        $fieldArray = [
            'page_id' => (int) $id,
            'parameters' => $parameters_serialized,
            // The hash of the encoded parameters is what the duplicate check compares.
            'parameters_hash' => GeneralUtility::shortMD5($parameters_serialized),
            'configuration_hash' => $configurationHash,
            'scheduled' => $tstamp,
            'exec_time' => 0,
            'set_id' => (int) $this->setID,
            'result_data' => '',
            'configuration' => $subCfg['key'] ?? null,
        ];

        if ($this->registerQueueEntriesInternallyOnly) {
            //the entries will only be registered and not stored to the database
            $this->queueEntries[] = $fieldArray;
        } else {
            if (! $skipInnerDuplicationCheck) {
                // check if there is already an equal entry
                $rows = $this->queueRepository->getDuplicateQueueItemsIfExists(
                    (bool) ($this->extensionSettings['enableTimeslot'] ?? false),
                    $tstamp,
                    $this->getCurrentTime(),
                    $fieldArray['page_id'],
                    $fieldArray['parameters_hash']
                );
            }

            if (empty($rows)) {
                $connectionForCrawlerQueue = GeneralUtility::makeInstance(ConnectionPool::class)->getConnectionForTable('tx_crawler_queue');
                $connectionForCrawlerQueue->insert(
                    'tx_crawler_queue',
                    $fieldArray
                );
                $uid = $connectionForCrawlerQueue->lastInsertId('tx_crawler_queue', 'qid');
                $rows[] = $uid;
                $urlAdded = true;

                $signalPayload = ['uid' => $uid, 'fieldArray' => $fieldArray];
                SignalSlotUtility::emitSignal(
                    self::class,
                    SignalSlotUtility::SIGNAL_URL_ADDED_TO_QUEUE,
                    $signalPayload
                );
            } else {
                $signalPayload = ['rows' => $rows, 'fieldArray' => $fieldArray];
                SignalSlotUtility::emitSignal(
                    self::class,
                    SignalSlotUtility::SIGNAL_DUPLICATE_URL_IN_QUEUE,
                    $signalPayload
                );
            }
        }

        return $urlAdded;
    }
1123
1124
    /**
     * Provides "now" as a Unix timestamp.
     *
     * Other methods of this class obtain the current time through this method instead of
     * calling time() themselves.
     *
     * @return int Current system time in seconds since the Unix epoch
     */
    public function getCurrentTime()
    {
        $now = time();

        return $now;
    }
1133
1134
    /************************************
1135
     *
1136
     * URL reading
1137
     *
1138
     ************************************/
1139
1140
    /**
     * Read URL for single queue entry
     *
     * Loads the queue record, marks it as being processed by setting exec_time, lets the queue
     * executor process it and finally stores the JSON encoded result in result_data. Signals are
     * emitted before the record is processed and before the result is written.
     *
     * @param integer $queueId
     * @param boolean $force If set, will process even if exec_time has been set!
     *
     * @return int|null NULL if no matching queue record was found; otherwise a bit mask that
     *                  contains self::CLI_STATUS_POLLABLE_PROCESSED when a registered "pollSuccess"
     *                  instruction ran and reported success, or 0
     */
    public function readUrl($queueId, $force = false)
    {
        $queryBuilder = GeneralUtility::makeInstance(ConnectionPool::class)->getQueryBuilderForTable($this->tableName);
        $ret = 0;
        $this->logger->debug('crawler-readurl start ' . microtime(true));

        $queryBuilder
            ->select('*')
            ->from('tx_crawler_queue')
            ->where(
                $queryBuilder->expr()->eq('qid', $queryBuilder->createNamedParameter($queueId, PDO::PARAM_INT))
            );
        if (! $force) {
            // Without force only entries that are assigned to a process and not yet executed are read.
            $queryBuilder
                ->andWhere('exec_time = 0')
                ->andWhere('process_scheduled > 0');
        }
        $queueRec = $queryBuilder->execute()->fetch();

        if (! is_array($queueRec)) {
            return null;
        }

        SignalSlotUtility::emitSignal(
            self::class,
            SignalSlotUtility::SIGNAL_QUEUEITEM_PREPROCESS,
            [$queueId, &$queueRec]
        );

        // Set exec_time to lock record:
        $field_array = ['exec_time' => $this->getCurrentTime()];

        if (isset($this->processID)) {
            //if multiprocessing is used we need to store the id of the process which has handled this entry
            $field_array['process_id_completed'] = $this->processID;
        }

        GeneralUtility::makeInstance(ConnectionPool::class)->getConnectionForTable('tx_crawler_queue')
            ->update(
                'tx_crawler_queue',
                $field_array,
                ['qid' => (int) $queueId]
            );

        $result = $this->queueExecutor->executeQueueItem($queueRec, $this);

        // The executor result is only inspected when it actually carries content; a missing
        // "content" entry is treated like a NULL content (nothing to evaluate).
        $content = $result['content'] ?? null;
        if ($content !== null) {
            /** @var JsonCompatibilityConverter $jsonCompatibilityConverter */
            $jsonCompatibilityConverter = GeneralUtility::makeInstance(JsonCompatibilityConverter::class);
            $resultData = $jsonCompatibilityConverter->convert($content);

            //atm there's no need to point to specific pollable extensions
            $pollSuccess = $GLOBALS['TYPO3_CONF_VARS']['EXTCONF']['crawler']['pollSuccess'] ?? null;
            if (is_array($resultData) && is_array($pollSuccess)) {
                $procInstructions = $resultData['parameters']['procInstructions'] ?? null;
                foreach ($pollSuccess as $pollable) {
                    // only check the success value if the instruction is running
                    // it is important to name the pollSuccess key same as the procInstructions key
                    if (is_array($procInstructions)
                        && in_array($pollable, $procInstructions, true)
                        && ! empty($resultData['success'][$pollable])
                    ) {
                        $ret |= self::CLI_STATUS_POLLABLE_PROCESSED;
                    }
                }
            }
        }

        // Set result in log which also denotes the end of the processing of this entry.
        $field_array = ['result_data' => json_encode($result)];

        SignalSlotUtility::emitSignal(
            self::class,
            SignalSlotUtility::SIGNAL_QUEUEITEM_POSTPROCESS,
            [$queueId, &$field_array]
        );

        GeneralUtility::makeInstance(ConnectionPool::class)->getConnectionForTable('tx_crawler_queue')
            ->update(
                'tx_crawler_queue',
                $field_array,
                ['qid' => (int) $queueId]
            );

        $this->logger->debug('crawler-readurl stop ' . microtime(true));
        return $ret;
    }
1237
1238
    /**
     * Inserts a not-yet-queued entry into the queue table and processes it immediately.
     *
     * The record is inserted with exec_time set (which locks it), handed to the queue
     * executor, and afterwards updated with the JSON-encoded result. Writing the result
     * also denotes the end of the processing of this entry.
     *
     * @param array $field_array Queue field array
     *
     * @return array|bool|mixed|string The value returned by the queue executor
     */
    public function readUrlFromArray($field_array)
    {
        // Setting exec_time locks the record
        $field_array['exec_time'] = $this->getCurrentTime();

        $connection = GeneralUtility::makeInstance(ConnectionPool::class)->getConnectionForTable($this->tableName);
        $connection->insert($this->tableName, $field_array);

        // The executor receives the record including its freshly assigned queue id
        $queueId = $connection->lastInsertId($this->tableName, 'qid');
        $field_array['qid'] = $queueId;
        $result = $this->queueExecutor->executeQueueItem($field_array, $this);

        // Passed by reference inside the signal arguments so that slots can adjust
        // the fields before they are written back
        $resultFields = ['result_data' => json_encode($result)];
        SignalSlotUtility::emitSignal(
            self::class,
            SignalSlotUtility::SIGNAL_QUEUEITEM_POSTPROCESS,
            [$queueId, &$resultFields]
        );

        $connection->update($this->tableName, $resultFields, ['qid' => $queueId]);

        return $result;
    }
1274
1275
    /*****************************
1276
     *
1277
     * Compiling URLs to crawl - tools
1278
     *
1279
     *****************************/
1280
1281
    /**
     * Builds the HTML table rows listing the URLs of a page and of the pages beneath it.
     *
     * The crawl settings are stored on the instance, the page tree below $id is loaded
     * with the backend user's "show page" permission clause, and the rows for every page
     * are produced by drawURLs_addRowsForPage(). For mount point pages the pages of the
     * mounted tree are rendered as well.
     *
     * @param int $id Root page id to start from.
     * @param int $depth Depth of tree, 0=only id-page, 1= on sublevel, 99 = infinite
     * @param int $scheduledTime Unix Time when the URL is timed to be visited when put in queue
     * @param int $reqMinute Number of requests per minute (creates the interleave between requests)
     * @param bool $submitCrawlUrls If set, submits the URLs to queue in database (real crawling)
     * @param bool $downloadCrawlUrls If set (and submitcrawlUrls is false) will fill $downloadUrls with entries)
     * @param array $incomingProcInstructions Array of processing instructions
     * @param array $configurationSelection Array of configuration keys
     * @return string
     */
    public function getPageTreeAndUrls(
        $id,
        $depth,
        $scheduledTime,
        $reqMinute,
        $submitCrawlUrls,
        $downloadCrawlUrls,
        array $incomingProcInstructions,
        array $configurationSelection
    ) {
        // Keep the crawl settings on the instance; drawURLs_addRowsForPage() reads them from there
        $this->scheduledTime = $scheduledTime;
        $this->reqMinute = $reqMinute;
        $this->submitCrawlUrls = $submitCrawlUrls;
        $this->downloadCrawlUrls = $downloadCrawlUrls;
        $this->incomingProcInstructions = $incomingProcInstructions;
        $this->incomingConfigurationSelection = $configurationSelection;

        // Start every run with empty tracking state
        $this->duplicateTrack = [];
        $this->downloadUrls = [];

        /* @var PageTreeView $pageTree */
        $pageTree = GeneralUtility::makeInstance(PageTreeView::class);
        $permsClause = $this->getBackendUser()->getPagePermsClause(Permission::PAGE_SHOW);
        $pageTree->init('AND ' . $permsClause);

        // The start page becomes the first tree entry when it is readable
        $rootPage = BackendUtility::readPageAccess($id, $permsClause);
        if (is_array($rootPage)) {
            $pageTree->tree[] = [
                'row' => $rootPage,
                'HTML' => $this->iconFactory->getIconForRecord('pages', $rootPage, Icon::SIZE_SMALL),
            ];
        }

        // Add the branch beneath the start page
        if ($depth) {
            $pageTree->getTree($id, $depth, '');
        }

        $rows = '';

        foreach ($pageTree->tree as $node) {
            $this->MP = false;

            if ($node['row']['doktype'] === PageRepository::DOKTYPE_MOUNTPOINT) {
                $mountPage = $this->pageRepository->getPage($node['row']['uid']);
                $mountPid = $mountPage[0]['mount_pid'];

                // MP is set while the mounted pages are rendered
                $this->MP = $mountPid . '-' . $node['row']['uid'];

                $mountTree = GeneralUtility::makeInstance(PageTreeView::class);
                $mountTree->init('AND ' . $permsClause);
                $mountTree->getTree($mountPid, $depth);

                foreach ($mountTree->tree as $mountNode) {
                    $rows .= $this->drawURLs_addRowsForPage(
                        $mountNode['row'],
                        $mountNode['HTML'] . BackendUtility::getRecordTitle('pages', $mountNode['row'], true)
                    );
                }

                if ($mountPage[0]['mount_pid_ol']) {
                    // With mount_pid_ol enabled the mounted page replaces the mount point page
                    $node['row']['uid'] = $mountPid;
                } else {
                    // Without mount_pid_ol the MP must not be used for the mount point page itself
                    $this->MP = false;
                }
            }

            $rows .= $this->drawURLs_addRowsForPage(
                $node['row'],
                $node['HTML'] . BackendUtility::getRecordTitle('pages', $node['row'], true)
            );
        }

        return $rows;
    }
1373
1374
    /**
     * Expands an exclude string into the list of page ids it covers.
     *
     * The string is a comma-separated list of entries of the form "pid" or "pid+depth":
     * - "12"   covers page 12 only (depth 0),
     * - "12+3" covers page 12 and the pages found by a tree lookup of depth 3,
     * - "12+"  (a "+" followed by an empty or zero depth) uses depth 99.
     *
     * Non-empty results are memoised per exclude string, and loaded trees per
     * pid/depth, in static caches.
     *
     * @param string $excludeString Exclude string
     * @return array List of integer page ids
     */
    public function expandExcludeString($excludeString)
    {
        // internal static caches;
        static $expandedExcludeStringCache;
        static $treeCache;

        if (empty($expandedExcludeStringCache[$excludeString])) {
            $pidList = [];

            if (! empty($excludeString)) {
                /** @var PageTreeView $tree */
                $tree = GeneralUtility::makeInstance(PageTreeView::class);
                $tree->init('AND ' . $this->getBackendUser()->getPagePermsClause(Permission::PAGE_SHOW));

                $excludeParts = GeneralUtility::trimExplode(',', $excludeString);

                foreach ($excludeParts as $excludePart) {
                    // Entries without "+" yield a single element; pad to two so that the
                    // destructuring never reads an undefined offset
                    [$pid, $depth] = array_pad(GeneralUtility::trimExplode('+', $excludePart), 2, '');

                    // Work with integers from here on: getTree() expects an integer uid
                    $pid = (int) $pid;

                    // default is "page only" = "depth=0"
                    if (empty($depth)) {
                        $depth = (stristr($excludePart, '+')) ? 99 : 0;
                    }
                    $depth = (int) $depth;

                    $pidList[] = $pid;

                    if ($depth > 0) {
                        if (empty($treeCache[$pid][$depth])) {
                            $tree->reset();
                            $tree->getTree($pid, $depth);
                            $treeCache[$pid][$depth] = $tree->tree;
                        }

                        foreach ($treeCache[$pid][$depth] as $data) {
                            $pidList[] = (int) $data['row']['uid'];
                        }
                    }
                }
            }

            $expandedExcludeStringCache[$excludeString] = array_unique($pidList);
        }

        return $expandedExcludeStringCache[$excludeString];
    }
1425
1426
    /**
     * Create the rows for display of the page tree
     * For each page a number of rows are shown displaying GET variable configuration
     *
     * @param array $pageRow Page record; its "uid" is checked against each configuration's exclude list
     * @param string $pageTitle HTML for the title cell; it is inserted into the row as-is
     * @return string HTML table rows for the page
     */
    public function drawURLs_addRowsForPage(array $pageRow, string $pageTitle): string
    {
        $skipMessage = '';

        // Get list of configurations
        $configurations = $this->getUrlsForPageRow($pageRow, $skipMessage);
        $configurations = ConfigurationService::removeDisallowedConfigurations($this->incomingConfigurationSelection, $configurations);

        // expandExcludeString() returns integer page ids and the lookup below is strict,
        // so the uid is normalised to int; a string uid would otherwise never match
        $pageUid = (int) $pageRow['uid'];

        // Traverse parameter combinations:
        $c = 0;
        $content = '';
        if (! empty($configurations)) {
            foreach ($configurations as $confKey => $confArray) {

                // Title column: only the first row carries the title cell, spanning all rows
                if (! $c) {
                    $titleClm = '<td rowspan="' . count($configurations) . '">' . $pageTitle . '</td>';
                } else {
                    $titleClm = '';
                }

                // "exclude" is optional in the sub configuration
                $excludedPageIds = $this->expandExcludeString($confArray['subCfg']['exclude'] ?? '');

                if (! in_array($pageUid, $excludedPageIds, true)) {

                    // URL list:
                    $urlList = $this->urlListFromUrlArray(
                        $confArray,
                        $pageRow,
                        $this->scheduledTime,
                        $this->reqMinute,
                        $this->submitCrawlUrls,
                        $this->downloadCrawlUrls,
                        $this->duplicateTrack,
                        $this->downloadUrls,
                        // if empty the urls won't be filtered by processing instructions
                        $this->incomingProcInstructions
                    );

                    // Expanded parameters:
                    $paramExpanded = '';
                    $calcAccu = [];
                    $calcRes = 1;
                    foreach ($confArray['paramExpanded'] as $gVar => $gVal) {
                        $paramExpanded .= '
                            <tr>
                                <td>' . htmlspecialchars('&' . $gVar . '=') . '<br/>' .
                            '(' . count($gVal) . ')' .
                            '</td>
                                <td nowrap="nowrap">' . nl2br(htmlspecialchars(implode(chr(10), $gVal))) . '</td>
                            </tr>
                        ';
                        // Number of combinations is the product of the value counts
                        $calcRes *= count($gVal);
                        $calcAccu[] = count($gVal);
                    }
                    $paramExpanded = '<table>' . $paramExpanded . '</table>';
                    $paramExpanded .= 'Comb: ' . implode('*', $calcAccu) . '=' . $calcRes;

                    // Options (both keys are optional)
                    $optionValues = '';
                    if (! empty($confArray['subCfg']['userGroups'])) {
                        $optionValues .= 'User Groups: ' . $confArray['subCfg']['userGroups'] . '<br/>';
                    }
                    if (! empty($confArray['subCfg']['procInstrFilter'])) {
                        $optionValues .= 'ProcInstr: ' . $confArray['subCfg']['procInstrFilter'] . '<br/>';
                    }

                    // Compile row:
                    $content .= '
                        <tr>
                            ' . $titleClm . '
                            <td>' . htmlspecialchars($confKey) . '</td>
                            <td>' . nl2br(htmlspecialchars(rawurldecode(trim(str_replace('&', chr(10) . '&', GeneralUtility::implodeArrayForUrl('', $confArray['paramParsed'])))))) . '</td>
                            <td>' . $paramExpanded . '</td>
                            <td nowrap="nowrap">' . $urlList . '</td>
                            <td nowrap="nowrap">' . $optionValues . '</td>
                            <td nowrap="nowrap">' . DebugUtility::viewArray($confArray['subCfg']['procInstrParams.'] ?? null) . '</td>
                        </tr>';
                } else {
                    $content .= '<tr>
                            ' . $titleClm . '
                            <td>' . htmlspecialchars($confKey) . '</td>
                            <td colspan="5"><em>No entries</em> (Page is excluded in this configuration)</td>
                        </tr>';
                }

                $c++;
            }
        } else {
            $message = ! empty($skipMessage) ? ' (' . $skipMessage . ')' : '';

            // Compile row:
            $content .= '
                <tr>
                    <td>' . $pageTitle . '</td>
                    <td colspan="6"><em>No entries</em>' . $message . '</td>
                </tr>';
        }

        return $content;
    }
1529
1530
    /*****************************
1531
     *
1532
     * CLI functions
1533
     *
1534
     *****************************/
1535
1536
    /**
     * Running the functionality of the CLI (crawling URLs from queue)
     *
     * @param int $countInARun Passed to the queue repository when fetching the records to crawl
     * @param int $sleepTime Microseconds to sleep after each processed queue entry
     * @param int $sleepAfterFinish Seconds to sleep once the run has finished
     * @return int Bit mask combined from the CLI_STATUS_* constants and the readUrl() results
     */
    public function CLI_run(int $countInARun, int $sleepTime, int $sleepAfterFinish): int
    {
        $status = 0;
        $processedCount = 0;

        // Legacy hooks are still executed, each one raising a deprecation notice
        foreach ($GLOBALS['TYPO3_CONF_VARS']['EXTCONF']['crawler']['cli_hooks'] ?? [] as $hookClass) {
            trigger_error(
                'This hook (crawler/cli_hooks) is deprecated since 9.1.5 and will be removed when dropping support for TYPO3 9LTS and 10LTS',
                E_USER_DEPRECATED
            );
            $hook = GeneralUtility::makeInstance($hookClass);
            if (is_object($hook)) {
                $hook->crawler_init($this);
            }
        }

        $this->queueRepository->cleanupQueue();

        $rows = $this->queueRepository->fetchRecordsToBeCrawled($countInARun);
        if (empty($rows)) {
            // Nothing to crawl: no status bit is set
            return $status;
        }

        $queueIds = [];
        foreach ($rows as $row) {
            $queueIds[] = $row['qid'];
        }

        $processId = $this->CLI_buildProcessId();

        // Remember how many entries were assigned to this process, to determine later how many were processed
        $assignedCount = $this->queueRepository->updateProcessIdAndSchedulerForQueueIds($queueIds, $processId);
        $this->processRepository->updateProcessAssignItemsCount($assignedCount, $processId);

        // Abort when not every fetched entry could be assigned
        if ($assignedCount !== count($queueIds)) {
            return $status | self::CLI_STATUS_ABORTED;
        }

        foreach ($rows as $row) {
            $status |= $this->readUrl($row['qid']);
            $processedCount++;

            // Just to relax the system
            usleep($sleepTime);

            // The crawler may have been disabled since the run started: return right away
            // and do NOT mark the process as ended
            if ($this->crawler->isDisabled()) {
                return $status | self::CLI_STATUS_ABORTED;
            }

            // Possible timeout: the process is no longer registered as active
            if (! $this->processRepository->isProcessActive($this->CLI_buildProcessId())) {
                $this->CLI_debug('conflict / timeout (' . $this->CLI_buildProcessId() . ')');
                $status |= self::CLI_STATUS_ABORTED;
                break;
            }
        }

        sleep($sleepAfterFinish);

        if ($processedCount > 0) {
            $status |= self::CLI_STATUS_PROCESSED;
        }

        return $status;
    }
1609
1610
    /**
     * Instantiates every class registered under the crawler "cli_hooks" configuration
     * and calls its crawler_init() method with this controller.
     *
     * @deprecated
     */
    public function CLI_runHooks(): void
    {
        $hookClasses = $GLOBALS['TYPO3_CONF_VARS']['EXTCONF']['crawler']['cli_hooks'] ?? [];

        foreach ($hookClasses as $hookClass) {
            $hook = GeneralUtility::makeInstance($hookClass);
            if (! is_object($hook)) {
                continue;
            }

            $hook->crawler_init($this);
        }
    }
1623
1624
    /**
     * Try to acquire a new process with the given id
     * also performs some auto-cleanup for orphan processes
     *
     * Active processes whose ttl lies before the current time count as orphans; they are
     * marked as not active and their queue entries are released. A new process record is
     * only inserted while the number of remaining active processes is below the
     * configured "processLimit".
     *
     * @param string $id identification string for the process
     * @return bool false if the system process id cannot be determined or the process limit is reached
     * @todo preemption might not be the most elegant way to clean up
     */
    public function CLI_checkAndAcquireNewProcess($id)
    {
        $systemProcessId = getmypid();
        if (! $systemProcessId) {
            return false;
        }

        $activeProcesses = $this->processRepository->findAllActive();
        $currentTime = $this->getCurrentTime();

        // Split the active processes into still-running and orphaned ones
        $runningCount = 0;
        $orphanProcessIds = [];

        /** @var Process $process */
        foreach ($activeProcesses as $process) {
            if ($process->getTtl() < $currentTime) {
                $orphanProcessIds[] = $process->getProcessId();
                continue;
            }

            $runningCount++;
        }

        // Only register a new process while fewer than the allowed number are running
        $acquired = $runningCount < (int) $this->extensionSettings['processLimit'];
        if ($acquired) {
            GeneralUtility::makeInstance(ConnectionPool::class)->getConnectionForTable('tx_crawler_process')->insert(
                'tx_crawler_process',
                [
                    'process_id' => $id,
                    'active' => 1,
                    'ttl' => $currentTime + (int) $this->extensionSettings['processMaxRunTime'],
                    'system_process_id' => $systemProcessId,
                ]
            );
        }

        // The clean-up runs regardless of whether a process was acquired
        $this->processRepository->deleteProcessesMarkedAsDeleted();
        $this->processRepository->markRequestedProcessesAsNotActive($orphanProcessIds);
        $this->queueRepository->unsetProcessScheduledAndProcessIdForQueueEntries($orphanProcessIds);

        return $acquired;
    }
1676
1677
    /**
     * Release a process and the required resources
     *
     * Besides releasing the given ids, two clean-up statements are executed:
     * queue entries that belong to inactive processes are detached from them, and
     * inactive processes that are referenced by queue entries with exec_time = 0 get
     * their system_process_id reset.
     *
     * @param mixed $releaseIds string with a single process-id or array with multiple process-ids
     * @return bool false if there was nothing to release, true otherwise
     * @deprecated
     */
    public function CLI_releaseProcesses($releaseIds)
    {
        if (! is_array($releaseIds)) {
            $releaseIds = [$releaseIds];
        }

        if (empty($releaseIds)) {
            //nothing to release
            return false;
        }

        $connectionPool = GeneralUtility::makeInstance(ConnectionPool::class);

        // some kind of 2nd chance algo - this way you need at least 2 processes to have a real cleanup
        // this ensures that a single process can't mess up the entire process table

        // Detach queue entries from processes that are no longer active.
        // Each statement gets its own query builder so that no query parts or bound
        // parameters of one statement can leak into the other.
        $queueQueryBuilder = $connectionPool->getQueryBuilderForTable($this->tableName);
        $queueQueryBuilder
            ->update($this->tableName, 'q')
            ->where(
                'q.process_id IN(SELECT p.process_id FROM tx_crawler_process as p WHERE p.active = 0)'
            )
            ->set('q.process_scheduled', 0)
            ->set('q.process_id', '')
            ->execute();

        // Reset the system process id of inactive processes that are referenced by
        // queue entries with exec_time = 0
        $processQueryBuilder = $connectionPool->getQueryBuilderForTable('tx_crawler_process');
        $processQueryBuilder
            ->update('tx_crawler_process')
            ->where(
                $processQueryBuilder->expr()->eq('active', 0),
                'process_id IN(SELECT q.process_id FROM tx_crawler_queue as q WHERE q.exec_time = 0)'
            )
            ->set('system_process_id', 0)
            ->execute();

        $this->processRepository->markRequestedProcessesAsNotActive($releaseIds);
        $this->queueRepository->unsetProcessScheduledAndProcessIdForQueueEntries($releaseIds);

        return true;
    }
1728
1729
    /**
     * Returns the id of the current process, generating it on first use.
     *
     * The id is derived from the current microtime and kept on the instance, so
     * later calls return the same value.
     *
     * @return string the ID
     */
    public function CLI_buildProcessId()
    {
        if ($this->processID) {
            return $this->processID;
        }

        $this->processID = GeneralUtility::shortMD5(microtime(true));

        return $this->processID;
    }
1741
1742
    /**
     * Echoes a message followed by a newline and flushes the output, but only when
     * the "processDebug" extension setting is enabled.
     *
     * @param string $msg the message
     * @deprecated
     * @codeCoverageIgnore
     */
    public function CLI_debug($msg): void
    {
        $debugEnabled = (int) $this->extensionSettings['processDebug'];
        if (! $debugEnabled) {
            return;
        }

        echo $msg . "\n";
        flush();
    }
1756
1757
    /**
     * Cleans up entries that stayed for too long in the queue.
     *
     * Removed are entries that were executed longer ago than "cleanUpProcessedAge" days,
     * and entries scheduled at or before "cleanUpScheduledAge" days ago. Both settings
     * are read from the extension settings and are given in days.
     *
     * @deprecated
     */
    public function cleanUpOldQueueEntries(): void
    {
        // 24*60*60 Seconds in 24 hours
        $secondsPerDay = 86400;

        $processedAgeInSeconds = $this->extensionSettings['cleanUpProcessedAge'] * $secondsPerDay;
        $scheduledAgeInSeconds = $this->extensionSettings['cleanUpScheduledAge'] * $secondsPerDay;

        $now = time();
        $processedBefore = $now - $processedAgeInSeconds;
        $scheduledUntil = $now - $scheduledAgeInSeconds;

        $this->flushQueue(
            '(exec_time<>0 AND exec_time<' . $processedBefore . ') OR scheduled<=' . $scheduledUntil
        );
    }
1774
1775
    /**
     * Removes queue entries
     *
     * Before the matching entries are deleted, a SIGNAL_QUEUE_ENTRY_FLUSH signal is
     * emitted once per distinct set_id, carrying the qid/set_id pairs of that set.
     *
     * @param string $where SQL related filter for the entries which should be removed;
     *                      it is used verbatim as WHERE clause, an empty value matches every entry
     *
     * @deprecated
     */
    protected function flushQueue($where = ''): void
    {
        $realWhere = strlen((string) $where) > 0 ? $where : '1=1';

        // Every statement uses its own query builder so that query parts and bound
        // parameters of one statement cannot leak into the next one
        $groups = $this->getQueryBuilder($this->tableName)
            ->selectLiteral('DISTINCT set_id')
            ->from($this->tableName)
            ->where($realWhere)
            ->execute()
            ->fetchAll();

        if (is_array($groups)) {
            foreach ($groups as $group) {
                $subSetQueryBuilder = $this->getQueryBuilder($this->tableName);
                $subSet = $subSetQueryBuilder
                    ->select('qid', 'set_id')
                    ->from($this->tableName)
                    ->where(
                        $realWhere,
                        // Bind the set id instead of concatenating it into the SQL
                        $subSetQueryBuilder->expr()->eq(
                            'set_id',
                            $subSetQueryBuilder->createNamedParameter($group['set_id'], PDO::PARAM_INT)
                        )
                    )
                    ->execute()
                    ->fetchAll();

                $payLoad = ['subSet' => $subSet];
                SignalSlotUtility::emitSignal(
                    self::class,
                    SignalSlotUtility::SIGNAL_QUEUE_ENTRY_FLUSH,
                    $payLoad
                );
            }
        }

        $this->getQueryBuilder($this->tableName)
            ->delete($this->tableName)
            ->where($realWhere)
            ->execute();
    }
1820 1
1821
    /**
     * Finds queue entries that duplicate the given entry (same page and parameters hash).
     *
     * Only entries without exec_time and without process_id are considered. The schedule
     * condition depends on the timestamp:
     * - not in the future: entries scheduled up to the current time match; with
     *   "enableTimeslot" set, entries scheduled within 100 seconds around the current
     *   time match as well,
     * - in the future: the entry must have exactly the same schedule time.
     *
     * @param int $tstamp
     * @param array $fieldArray Must provide "page_id" and "parameters_hash"
     *
     * @return array List of qid values of the duplicates
     * @deprecated
     */
    protected function getDuplicateRowsIfExist($tstamp, $fieldArray)
    {
        $currentTime = $this->getCurrentTime();

        $queryBuilder = GeneralUtility::makeInstance(ConnectionPool::class)->getQueryBuilderForTable($this->tableName);
        $queryBuilder->select('qid')->from('tx_crawler_queue');

        if ($tstamp <= $currentTime) {
            // Entry is scheduled with "now"
            if ($this->extensionSettings['enableTimeslot']) {
                $slotStart = $currentTime - 100;
                $slotEnd = $currentTime + 100;
                $queryBuilder
                    ->where('scheduled BETWEEN ' . $slotStart . ' AND ' . $slotEnd)
                    ->orWhere($queryBuilder->expr()->lte('scheduled', $currentTime));
            } else {
                $queryBuilder->where($queryBuilder->expr()->lte('scheduled', $currentTime));
            }
        } elseif ($tstamp > $currentTime) {
            // Entries with a timestamp in the future need to have the same schedule time
            $queryBuilder->where($queryBuilder->expr()->eq('scheduled', $tstamp));
        }

        $pageIdParameter = $queryBuilder->createNamedParameter($fieldArray['page_id'], PDO::PARAM_INT);
        $hashParameter = $queryBuilder->createNamedParameter($fieldArray['parameters_hash'], PDO::PARAM_STR);

        $statement = $queryBuilder
            ->andWhere('NOT exec_time')
            ->andWhere('NOT process_id')
            ->andWhere($queryBuilder->expr()->eq('page_id', $pageIdParameter))
            ->andWhere($queryBuilder->expr()->eq('parameters_hash', $hashParameter))
            ->execute();

        $duplicateQueueIds = [];
        while ($row = $statement->fetch()) {
            $duplicateQueueIds[] = $row['qid'];
        }

        return $duplicateQueueIds;
    }
1882 5
1883 5
    /**
     * Builds an md5 fingerprint of a configuration array.
     *
     * The 'paramExpanded' and 'URLs' entries are removed from the (by-value)
     * copy of the array before it is serialized, so they never influence the
     * resulting hash.
     *
     * @param array $configuration configuration to fingerprint
     * @return string md5 hash of the serialized, reduced configuration
     */
    protected function getConfigurationHash(array $configuration)
    {
        // Drop the entries that must not contribute to the hash.
        unset($configuration['paramExpanded'], $configuration['URLs']);

        $serializedConfiguration = serialize($configuration);

        return md5($serializedConfiguration);
    }
1894 5
1895 3
    /**
     * Build a URL from a Page and the Query String. If the page has a Site configuration, it can be built by using
     * the Site instance.
     *
     * This method only forwards all of its arguments, unchanged, to a freshly created UrlService instance.
     *
     * @param int $httpsOrHttp see tx_crawler_configuration.force_ssl
     * @throws SiteNotFoundException
     * @throws InvalidRouteArgumentsException
     *
     * @deprecated Using CrawlerController::getUrlFromPageAndQueryParameters() is deprecated since 9.1.1 and will be removed in v11.x, please use UrlService->getUrlFromPageAndQueryParameters() instead.
     * @codeCoverageIgnore
     */
    protected function getUrlFromPageAndQueryParameters(int $pageId, string $queryString, ?string $alternativeBaseUrl, int $httpsOrHttp): UriInterface
    {
        return (new UrlService())->getUrlFromPageAndQueryParameters(
            $pageId,
            $queryString,
            $alternativeBaseUrl,
            $httpsOrHttp
        );
    }
1911
1912
    /**
     * Returns $reg with the values at index 1 and index 2 exchanged when the
     * value at index 1 is greater than the value at index 2; otherwise $reg is
     * returned as it was passed in.
     *
     * @param array $reg array whose entries at index 1 and 2 are compared
     * @return array the array with index 1 holding the smaller of the two values
     *
     * @deprecated
     */
    protected function swapIfFirstIsLargerThanSecond(array $reg): array
    {
        if ($reg[1] > $reg[2]) {
            // Exchange both entries in one step; existing keys keep their position.
            [$reg[1], $reg[2]] = [$reg[2], $reg[1]];
        }

        return $reg;
    }
1926
1927
    /**
     * Returns the current value of the maximumUrlsToCompile property.
     */
    private function getMaximumUrlsToCompile(): int
    {
        $maximumUrls = $this->maximumUrlsToCompile;

        return $maximumUrls;
    }
1931 1
1932
    /**
     * Returns the backend user held by this controller.
     *
     * Backend authentication is initialized on every call. When no backend
     * user has been stored on the instance yet, it is taken from
     * $GLOBALS['BE_USER'] and kept for subsequent calls.
     *
     * @return BackendUserAuthentication
     */
    private function getBackendUser()
    {
        // Make sure the _cli_ user is loaded
        Bootstrap::initializeBackendAuthentication();

        if ($this->backendUser !== null) {
            return $this->backendUser;
        }

        // First access: remember the globally registered backend user.
        $this->backendUser = $GLOBALS['BE_USER'];

        return $this->backendUser;
    }
1944 1
1945
    /**
     * Creates a query builder for the given table.
     *
     * The ConnectionPool is obtained through GeneralUtility::makeInstance() and
     * asked for the query builder belonging to $table.
     *
     * @param string $table name of the table the query builder is requested for
     * @return QueryBuilder
     */
    private function getQueryBuilder(string $table)
    {
        $connectionPool = GeneralUtility::makeInstance(ConnectionPool::class);

        return $connectionPool->getQueryBuilderForTable($table);
    }
1954
}
1955