Passed
Pull Request — master (#677)
by Tomas Norre
09:30 queued 06:06
created

CrawlerController::urlListFromUrlArray()   B

Complexity

Conditions 8
Paths 8

Size

Total Lines 68
Code Lines 39

Duplication

Lines 0
Ratio 0 %

Code Coverage

Tests 28
CRAP Score 8.2225

Importance

Changes 1
Bugs 0 Features 0
Metric Value
cc 8
eloc 39
c 1
b 0
f 0
nc 8
nop 9
dl 0
loc 68
ccs 28
cts 33
cp 0.8485
crap 8.2225
rs 8.0515

How to fix   Long Method    Many Parameters   

Long Method

Small methods make your code easier to understand, in particular if combined with a good name. Besides, if your method is small, finding a good name is usually much easier.

For example, if you find yourself adding comments to a method's body, this is usually a good sign to extract the commented part to a new method, and use the comment as a starting point when coming up with a good name for this new method.

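Commonly applied refactorings include:

- Extract Method
- If many parameters or temporary variables are present: Replace Temp with Query, Introduce Parameter Object, Preserve Whole Object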

Many Parameters

Methods with many parameters are not only hard to understand, but their parameters also often become inconsistent when you need more, or different data.

There are several approaches to avoid long parameter lists:

- Replace Parameter with Method Call
- Preserve Whole Object
- Introduce Parameter Object

1
<?php
2
3
declare(strict_types=1);
4
5
namespace AOE\Crawler\Controller;
6
7
/***************************************************************
8
 *  Copyright notice
9
 *
10
 *  (c) 2020 AOE GmbH <[email protected]>
11
 *
12
 *  All rights reserved
13
 *
14
 *  This script is part of the TYPO3 project. The TYPO3 project is
15
 *  free software; you can redistribute it and/or modify
16
 *  it under the terms of the GNU General Public License as published by
17
 *  the Free Software Foundation; either version 3 of the License, or
18
 *  (at your option) any later version.
19
 *
20
 *  The GNU General Public License can be found at
21
 *  http://www.gnu.org/copyleft/gpl.html.
22
 *
23
 *  This script is distributed in the hope that it will be useful,
24
 *  but WITHOUT ANY WARRANTY; without even the implied warranty of
25
 *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
26
 *  GNU General Public License for more details.
27
 *
28
 *  This copyright notice MUST APPEAR in all copies of the script!
29
 ***************************************************************/
30
31
use AOE\Crawler\Configuration\ExtensionConfigurationProvider;
32
use AOE\Crawler\Converter\JsonCompatibilityConverter;
33
use AOE\Crawler\Crawler;
34
use AOE\Crawler\CrawlStrategy\CrawlStrategyFactory;
35
use AOE\Crawler\Domain\Model\Process;
36
use AOE\Crawler\Domain\Repository\ConfigurationRepository;
37
use AOE\Crawler\Domain\Repository\ProcessRepository;
38
use AOE\Crawler\Domain\Repository\QueueRepository;
39
use AOE\Crawler\QueueExecutor;
40
use AOE\Crawler\Service\UrlService;
41
use AOE\Crawler\Utility\SignalSlotUtility;
42
use AOE\Crawler\Value\QueueFilter;
43
use PDO;
44
use Psr\Http\Message\UriInterface;
45
use Psr\Log\LoggerAwareInterface;
46
use Psr\Log\LoggerAwareTrait;
47
use TYPO3\CMS\Backend\Tree\View\PageTreeView;
48
use TYPO3\CMS\Backend\Utility\BackendUtility;
49
use TYPO3\CMS\Core\Authentication\BackendUserAuthentication;
50
use TYPO3\CMS\Core\Compatibility\PublicMethodDeprecationTrait;
51
use TYPO3\CMS\Core\Compatibility\PublicPropertyDeprecationTrait;
52
use TYPO3\CMS\Core\Core\Bootstrap;
53
use TYPO3\CMS\Core\Core\Environment;
54
use TYPO3\CMS\Core\Database\Connection;
55
use TYPO3\CMS\Core\Database\ConnectionPool;
56
use TYPO3\CMS\Core\Database\Query\QueryBuilder;
57
use TYPO3\CMS\Core\Database\Query\Restriction\DeletedRestriction;
58
use TYPO3\CMS\Core\Database\QueryGenerator;
59
use TYPO3\CMS\Core\Exception\SiteNotFoundException;
60
use TYPO3\CMS\Core\Imaging\Icon;
61
use TYPO3\CMS\Core\Imaging\IconFactory;
62
use TYPO3\CMS\Core\Routing\InvalidRouteArgumentsException;
63
use TYPO3\CMS\Core\Site\Entity\Site;
64
use TYPO3\CMS\Core\Type\Bitmask\Permission;
65
use TYPO3\CMS\Core\TypoScript\Parser\TypoScriptParser;
66
use TYPO3\CMS\Core\Utility\DebugUtility;
67
use TYPO3\CMS\Core\Utility\GeneralUtility;
68
use TYPO3\CMS\Core\Utility\MathUtility;
69
use TYPO3\CMS\Extbase\Object\ObjectManager;
70
use TYPO3\CMS\Frontend\Page\PageRepository;
71
72
/**
73
 * Class CrawlerController
74
 *
75
 * @package AOE\Crawler\Controller
76
 */
77
class CrawlerController implements LoggerAwareInterface
78
{
79
    use LoggerAwareTrait;
80
    use PublicMethodDeprecationTrait;
81
    use PublicPropertyDeprecationTrait;
82
83
    public const CLI_STATUS_NOTHING_PROCCESSED = 0;
84
85
    //queue not empty
86
    public const CLI_STATUS_REMAIN = 1;
87
88
    //(some) queue items were processed
89
    public const CLI_STATUS_PROCESSED = 2;
90
91
    //instance didn't finish
92
    public const CLI_STATUS_ABORTED = 4;
93
94
    public const CLI_STATUS_POLLABLE_PROCESSED = 8;
95
96
    /**
97
     * @var integer
98
     */
99
    public $setID = 0;
100
101
    /**
102
     * @var string
103
     */
104
    public $processID = '';
105
106
    /**
107
     * @var array
108
     */
109
    public $duplicateTrack = [];
110
111
    /**
112
     * @var array
113
     */
114
    public $downloadUrls = [];
115
116
    /**
117
     * @var array
118
     */
119
    public $incomingProcInstructions = [];
120
121
    /**
122
     * @var array
123
     */
124
    public $incomingConfigurationSelection = [];
125
126
    /**
127
     * @var bool
128
     */
129
    public $registerQueueEntriesInternallyOnly = false;
130
131
    /**
132
     * @var array
133
     */
134
    public $queueEntries = [];
135
136
    /**
137
     * @var array
138
     */
139
    public $urlList = [];
140
141
    /**
142
     * @var array
143
     */
144
    public $extensionSettings = [];
145
146
    /**
147
     * Mount Point
148
     *
149
     * @var bool
150
     * Todo: Check what this is used for and adjust the type hint or code, as bool doesn't match the current code.
151
     */
152
    public $MP = false;
153
154
    /**
155
     * @var string
156
     * @deprecated
157
     */
158
    protected $processFilename;
159
160
    /**
161
     * Holds the internal access mode can be 'gui','cli' or 'cli_im'
162
     *
163
     * @var string
164
     * @deprecated
165
     */
166
    protected $accessMode;
167
168
    /**
169
     * @var QueueRepository
170
     */
171
    protected $queueRepository;
172
173
    /**
174
     * @var ProcessRepository
175
     */
176
    protected $processRepository;
177
178
    /**
179
     * @var ConfigurationRepository
180
     */
181
    protected $configurationRepository;
182
183
    /**
184
     * @var string
185
     */
186
    protected $tableName = 'tx_crawler_queue';
187
188
    /**
189
     * @var QueueExecutor
190
     */
191
    protected $queueExecutor;
192
193
    /**
194
     * @var int
195
     */
196
    protected $maximumUrlsToCompile = 10000;
197
198
    /**
199
     * @var IconFactory
200
     */
201
    protected $iconFactory;
202
203
    /**
204
     * @var string[]
205
     */
206
    private $deprecatedPublicMethods = [
0 ignored issues
show
introduced by
The private property $deprecatedPublicMethods is not used, and could be removed.
Loading history...
207
        'cleanUpOldQueueEntries' => 'Using CrawlerController::cleanUpOldQueueEntries() is deprecated since 9.0.1 and will be removed in v11.x, please use QueueRepository->cleanUpOldQueueEntries() instead.',
208
        'CLI_debug' => 'Using CrawlerController->CLI_debug() is deprecated since 9.1.3 and will be removed in v11.x',
209
        'getAccessMode' => 'Using CrawlerController->getAccessMode() is deprecated since 9.1.3 and will be removed in v11.x',
210
        'getLogEntriesForPageId' => 'Using CrawlerController->getLogEntriesForPageId() is deprecated since 9.1.5 and will be remove in v11.x',
211
        'getLogEntriesForSetId' => 'Using crawlerController::getLogEntriesForSetId() is deprecated since 9.0.1 and will be removed in v11.x',
212
        'flushQueue' => 'Using CrawlerController::flushQueue() is deprecated since 9.0.1 and will be removed in v11.x, please use QueueRepository->flushQueue() instead.',
213
        'setAccessMode' => 'Using CrawlerController->setAccessMode() is deprecated since 9.1.3 and will be removed in v11.x',
214
        'getDisabled' => 'Using CrawlerController->getDisabled() is deprecated since 9.1.3 and will be removed in v11.x, please use Crawler->isDisabled() instead',
215
        'setDisabled' => 'Using CrawlerController->setDisabled() is deprecated since 9.1.3 and will be removed in v11.x, please use Crawler->setDisabled() instead',
216
        'getProcessFilename' => 'Using CrawlerController->getProcessFilename() is deprecated since 9.1.3 and will be removed in v11.x',
217
        'setProcessFilename' => 'Using CrawlerController->setProcessFilename() is deprecated since 9.1.3 and will be removed in v11.x',
218
        'getDuplicateRowsIfExist' => 'Using CrawlerController->getDuplicateRowsIfExist() is deprecated since 9.1.4 and will be remove in v11.x, please use QueueRepository->getDuplicateQueueItemsIfExists() instead',
219
    ];
220
221
    /**
     * Deprecation messages for properties that are no longer public, keyed by property name.
     *
     * The class uses PublicPropertyDeprecationTrait, which presumably looks this map up by
     * property name when such a property is accessed from outside — verify against the trait.
     *
     * @var string[]
     */
    private $deprecatedPublicProperties = [
        'accessMode' => 'Using CrawlerController->accessMode is deprecated since 9.1.3 and will be removed in v11.x',
        // The message previously named "accessMode" here as well (copy & paste slip).
        'processFilename' => 'Using CrawlerController->processFilename is deprecated since 9.1.3 and will be removed in v11.x',
    ];
228
229
    /**
230
     * @var BackendUserAuthentication|null
231
     */
232
    private $backendUser;
233
234
    /**
235
     * @var integer
236
     */
237
    private $scheduledTime = 0;
238
239
    /**
240
     * @var integer
241
     */
242
    private $reqMinute = 0;
243
244
    /**
245
     * @var bool
246
     */
247
    private $submitCrawlUrls = false;
248
249
    /**
250
     * @var bool
251
     */
252
    private $downloadCrawlUrls = false;
253
254
    /**
255
     * @var PageRepository
256
     */
257
    private $pageRepository;
258
259
    /**
260
     * @var Crawler
261
     */
262
    private $crawler;
263
264
    /************************************
265
     *
266
     * Getting URLs based on Page TSconfig
267
     *
268
     ************************************/
269
270 36
    /**
     * Creates the repositories and services the controller works with and
     * normalises the extension settings.
     *
     * Settings handling:
     * - "countInARun" falls back to 100 when it is missing or not a positive integer.
     * - "processLimit" is passed through MathUtility::forceIntegerInRange() with range 1-99, default 1.
     * - "maxCompileUrls" is passed through MathUtility::forceIntegerInRange() with range 1-1000000000,
     *   default 10000, and stored in $maximumUrlsToCompile.
     */
    public function __construct()
    {
        $objectManager = GeneralUtility::makeInstance(ObjectManager::class);
        $crawlStrategyFactory = GeneralUtility::makeInstance(CrawlStrategyFactory::class);
        $this->queueRepository = $objectManager->get(QueueRepository::class);
        $this->processRepository = $objectManager->get(ProcessRepository::class);
        $this->configurationRepository = $objectManager->get(ConfigurationRepository::class);
        $this->pageRepository = $objectManager->get(PageRepository::class);
        $this->queueExecutor = GeneralUtility::makeInstance(QueueExecutor::class, $crawlStrategyFactory);
        $this->iconFactory = GeneralUtility::makeInstance(IconFactory::class);
        $this->crawler = GeneralUtility::makeInstance(Crawler::class);

        $this->processFilename = Environment::getVarPath() . '/lock/tx_crawler.proc';

        /** @var ExtensionConfigurationProvider $configurationProvider */
        $configurationProvider = GeneralUtility::makeInstance(ExtensionConfigurationProvider::class);
        $settings = $configurationProvider->getExtensionConfiguration();
        $this->extensionSettings = is_array($settings) ? $settings : [];

        // Set defaults. The "?? 0" fallbacks keep an absent setting from raising an
        // "undefined array key" notice/warning; 0 is treated as "not configured" below.
        if (MathUtility::convertToPositiveInteger($this->extensionSettings['countInARun'] ?? 0) === 0) {
            $this->extensionSettings['countInARun'] = 100;
        }

        $this->extensionSettings['processLimit'] = MathUtility::forceIntegerInRange(
            $this->extensionSettings['processLimit'] ?? 0,
            1,
            99,
            1
        );
        $this->maximumUrlsToCompile = MathUtility::forceIntegerInRange(
            $this->extensionSettings['maxCompileUrls'] ?? 0,
            1,
            1000000000,
            10000
        );
    }
297
298
    /**
     * Returns the internal access mode stored in $accessMode.
     *
     * @return string Access mode; the property is documented as 'gui', 'cli' or 'cli_im'
     * @deprecated
     */
    public function getAccessMode()
    {
        $currentAccessMode = $this->accessMode;

        return $currentAccessMode;
    }
308
309
    /**
     * Stores the internal access mode.
     *
     * @param string $accessMode Mode to remember; the property is documented as 'gui', 'cli' or 'cli_im'
     * @deprecated
     */
    public function setAccessMode($accessMode): void
    {
        $newAccessMode = $accessMode;
        $this->accessMode = $newAccessMode;
    }
317
318
    /**
     * Switches the "disabled" marker file on or off.
     *
     * Disabling writes an empty file at $processFilename; enabling removes that
     * file again if it exists. getDisabled() reports the state by checking for the file.
     *
     * @param bool $disabled (optional, defaults to true)
     * @deprecated
     */
    public function setDisabled($disabled = true): void
    {
        $markerFile = $this->processFilename;

        if ($disabled) {
            GeneralUtility::writeFile($markerFile, '');
            return;
        }

        // Nothing to remove when the marker was never written.
        if (is_file($markerFile)) {
            unlink($markerFile);
        }
    }
334
335
    /**
     * Reports whether the "disabled" marker file at $processFilename exists.
     *
     * @return bool true if disabled
     * @deprecated
     */
    public function getDisabled()
    {
        $markerFileExists = is_file($this->processFilename);

        return $markerFileExists;
    }
345
346
    /**
     * Overrides the path of the marker file used by setDisabled() / getDisabled().
     *
     * @param string $filenameWithPath Path including the file name
     * @deprecated
     */
    public function setProcessFilename($filenameWithPath): void
    {
        $markerFile = $filenameWithPath;
        $this->processFilename = $markerFile;
    }
354
355
    /**
     * Returns the path of the marker file used by setDisabled() / getDisabled().
     *
     * @return string
     * @deprecated
     */
    public function getProcessFilename()
    {
        $markerFile = $this->processFilename;

        return $markerFile;
    }
363
364
    /**
     * Replaces the extension settings
     * (unserialized pendant of $TYPO3_CONF_VARS['EXT']['extConf']['crawler']).
     *
     * The given array is stored as-is; the defaults applied in the constructor
     * are not re-applied here.
     */
    public function setExtensionSettings(array $extensionSettings): void
    {
        $replacementSettings = $extensionSettings;
        $this->extensionSettings = $replacementSettings;
    }
371
372
    /**
373
     * Check if the given page should be crawled
374
     *
375
     * @return false|string false if the page should be crawled (not excluded), true / skipMessage if it should be skipped
376
     */
377 12
    public function checkIfPageShouldBeSkipped(array $pageRow)
378
    {
379
        // if page is hidden
380 12
        if (! $this->extensionSettings['crawlHiddenPages'] && $pageRow['hidden']) {
381 1
            return 'Because page is hidden';
382
        }
383
384 11
        if (GeneralUtility::inList('3,4,199,254,255', $pageRow['doktype'])) {
385 3
            return 'Because doktype is not allowed';
386
        }
387
388 8
        foreach ($GLOBALS['TYPO3_CONF_VARS']['EXTCONF']['crawler']['excludeDoktype'] ?? [] as $key => $doktypeList) {
389 1
            if (GeneralUtility::inList($doktypeList, $pageRow['doktype'])) {
390 1
                return 'Doktype was excluded by "' . $key . '"';
391
            }
392
        }
393
394
        // veto hook
395 7
        foreach ($GLOBALS['TYPO3_CONF_VARS']['EXTCONF']['crawler']['pageVeto'] ?? [] as $key => $func) {
396
            $params = [
397 2
                'pageRow' => $pageRow,
398
            ];
399
            // expects "false" if page is ok and "true" or a skipMessage if this page should _not_ be crawled
400 2
            $veto = GeneralUtility::callUserFunction($func, $params, $this);
401 2
            if ($veto !== false) {
402 2
                if (is_string($veto)) {
403 1
                    return $veto;
404
                }
405 1
                return 'Veto from hook "' . htmlspecialchars($key) . '"';
406
            }
407
        }
408
409 5
        return false;
410
    }
411
412
    /**
     * Wrapper around getUrlsForPageId() that first validates the page row.
     * It returns an array of configurations and no urls!
     *
     * An empty array is returned, and $skipMessage is filled with the reason, when
     * the uid is not an integer or checkIfPageShouldBeSkipped() rejects the page.
     * On success $skipMessage is reset to an empty string.
     *
     * @param array $pageRow Page record with at least dok-type and uid columns.
     * @param string $skipMessage Receives the skip reason (passed by reference)
     * @return array
     * @see getUrlsForPageId()
     */
    public function getUrlsForPageRow(array $pageRow, &$skipMessage = '')
    {
        $pageUid = $pageRow['uid'];

        if (! is_int($pageUid)) {
            $skipMessage = 'PageUid ' . $pageUid . ' was not an integer';
            return [];
        }

        $skipReason = $this->checkIfPageShouldBeSkipped($pageRow);
        if ($skipReason !== false) {
            $skipMessage = $skipReason;
            return [];
        }

        $configurations = $this->getUrlsForPageId($pageUid);
        $skipMessage = '';

        return $configurations;
    }
439
440
    /**
     * Creates a list of URLs from input array (and submits them to queue if asked for)
     * See Web > Info module script + "indexed_search"'s crawler hook-client using this!
     *
     * Every new URL is appended to $this->urlList and registered in $duplicateTrack;
     * URLs already present in $duplicateTrack are only listed (muted) and not resubmitted.
     *
     * @param array $vv Information about URLs from pageRow to crawl.
     * @param array $pageRow Page row
     * @param int $scheduledTime Unix time to schedule indexing to, typically time()
     * @param int $reqMinute Number of requests per minute (creates the interleave between requests)
     * @param bool $submitCrawlUrls If set, submits the URLs to queue
     * @param bool $downloadCrawlUrls If set (and $submitCrawlUrls is false) will fill $downloadUrls with entries
     * @param array $duplicateTrack Array which is passed by reference and contains an id per url to ensure we will not crawl duplicates
     * @param array $downloadUrls Array which will be filled with URLs for download if flag is set.
     * @param array $incomingProcInstructions Array of processing instructions
     * @return string List of URLs (meant for display in backend module)
     */
    public function urlListFromUrlArray(
        array $vv,
        array $pageRow,
        $scheduledTime,
        $reqMinute,
        $submitCrawlUrls,
        $downloadCrawlUrls,
        array &$duplicateTrack,
        array &$downloadUrls,
        array $incomingProcInstructions
    ) {
        if (! is_array($vv['URLs'] ?? null)) {
            return 'ERROR - no URL generated';
        }

        $subCfg = $vv['subCfg'] ?? [];
        // Not every configuration source defines these keys; default them instead of
        // triggering "undefined array key" notices.
        $procInstrFilter = $subCfg['procInstrFilter'] ?? '';
        $userGroups = $subCfg['userGroups'] ?? '';

        $urlLog = [];
        $pageId = (int) $pageRow['uid'];
        $configurationHash = $this->getConfigurationHash($vv);
        $skipInnerCheck = $this->queueRepository->noUnprocessedQueueEntriesForPageWithConfigurationHashExist($pageId, $configurationHash);

        // The filter result is the same for every URL of this configuration, so it is
        // evaluated once; a non-matching configuration yields an empty list.
        if (! $this->drawURLs_PIfilter($procInstrFilter, $incomingProcInstructions)) {
            return '';
        }

        // Seconds between two scheduled requests; guards against a division by zero
        // when no (or a non-positive) request rate is given.
        $secondsPerRequest = $reqMinute > 0 ? 60 / $reqMinute : 0;

        $urlService = new UrlService();

        foreach ($vv['URLs'] as $urlQuery) {
            $url = (string) $urlService->getUrlFromPageAndQueryParameters(
                $pageId,
                $urlQuery,
                $subCfg['baseUrl'] ?? null,
                $subCfg['force_ssl'] ?? 0
            );

            // Create key by which to determine unique-ness:
            $uKey = $url . '|' . $userGroups . '|' . $procInstrFilter;

            if (isset($duplicateTrack[$uKey])) {
                // If the url key is registered, just display it and do not resubmit it
                $urlLog[] = '<em><span class="text-muted">' . htmlspecialchars($url) . '</span></em>';
            } else {
                // Scheduled time, rounded down to the full minute:
                $schTime = $scheduledTime + round(count($duplicateTrack) * $secondsPerRequest);
                $schTime = intval($schTime / 60) * 60;
                $formattedDate = BackendUtility::datetime($schTime);
                $this->urlList[] = '[' . $formattedDate . '] ' . $url;
                $urlList = '[' . $formattedDate . '] ' . htmlspecialchars($url);

                // Submit for crawling!
                if ($submitCrawlUrls) {
                    // NOTE(review): the un-interleaved $scheduledTime is passed here, not the
                    // $schTime shown in the list — kept as-is; confirm against addUrl().
                    $added = $this->addUrl(
                        $pageId,
                        $url,
                        $subCfg,
                        $scheduledTime,
                        $configurationHash,
                        $skipInnerCheck
                    );
                    if ($added === false) {
                        $urlList .= ' (URL already existed)';
                    }
                } elseif ($downloadCrawlUrls) {
                    $downloadUrls[$url] = $url;
                }
                $urlLog[] = $urlList;
            }
            $duplicateTrack[$uKey] = true;
        }

        return implode('<br>', $urlLog);
    }
524
525
    /**
     * Tells whether a configuration passes the processing instruction filter.
     *
     * Passes when no incoming processing instructions are given at all, or when at
     * least one of them is contained in the comma separated list $piString.
     *
     * @param string $piString PI to test
     * @param array $incomingProcInstructions Processing instructions
     * @return boolean
     */
    public function drawURLs_PIfilter($piString, array $incomingProcInstructions)
    {
        if ($incomingProcInstructions === []) {
            return true;
        }

        $isListed = false;
        foreach ($incomingProcInstructions as $instruction) {
            if (GeneralUtility::inList($piString, $instruction)) {
                $isListed = true;
                break;
            }
        }

        return $isListed;
    }
545
546
    /**
     * Loads the page TSconfig for a page and lets registered hooks alter it.
     *
     * When $this->MP is set, the TSconfig is read from the page id found after the
     * "-" in the MP value instead of from $id. Hooks registered in
     * $GLOBALS['TYPO3_CONF_VARS']['EXTCONF']['crawler']['getPageTSconfigForId'] receive
     * the page id and the TSconfig array (by reference).
     *
     * @param int $id Page id
     * @return array Page TSconfig, possibly modified by hooks
     */
    public function getPageTSconfigForId($id): array
    {
        $tsConfigPageId = $id;

        if ($this->MP) {
            // $MP is documented as bool but used like a "<id>-<id>" string; the cast keeps
            // explode() from throwing a TypeError under strict_types.
            $mountPointParts = explode('-', (string) $this->MP);
            // Without a usable second part the page's own id is kept.
            // NOTE(review): fallback chosen for robustness — confirm the intended behaviour.
            if (isset($mountPointParts[1]) && $mountPointParts[1] !== '') {
                $tsConfigPageId = (int) $mountPointParts[1];
            }
        }

        $pageTSconfig = BackendUtility::getPagesTSconfig($tsConfigPageId);

        // Call a hook to alter configuration
        $hooks = $GLOBALS['TYPO3_CONF_VARS']['EXTCONF']['crawler']['getPageTSconfigForId'] ?? null;
        if (is_array($hooks)) {
            $params = [
                'pageId' => $id,
                'pageTSConfig' => &$pageTSconfig,
            ];
            foreach ($hooks as $userFunc) {
                GeneralUtility::callUserFunction($userFunc, $params, $this);
            }
        }

        return $pageTSconfig;
    }
568 4
569
    /**
570
     * This method returns an array of configurations.
571 4
     * Adds no urls!
572
     */
573 4
    public function getUrlsForPageId(int $pageId): array
574
    {
575
        // Get page TSconfig for page ID
576 4
        $pageTSconfig = $this->getPageTSconfigForId($pageId);
577 4
578 3
        $res = [];
579 3
580
        // Fetch Crawler Configuration from pageTSconfig
581 3
        $crawlerCfg = $pageTSconfig['tx_crawler.']['crawlerCfg.']['paramSets.'] ?? [];
582
        foreach ($crawlerCfg as $key => $values) {
583 3
            if (! is_array($values)) {
584 3
                continue;
585
            }
586 3
            $key = str_replace('.', '', $key);
587 3
            // Sub configuration for a single configuration string:
588
            $subCfg = (array) $crawlerCfg[$key . '.'];
589 3
            $subCfg['key'] = $key;
590
591
            if (strcmp($subCfg['procInstrFilter'] ?? '', '')) {
592
                $subCfg['procInstrFilter'] = implode(',', GeneralUtility::trimExplode(',', $subCfg['procInstrFilter']));
593 3
            }
594
            $pidOnlyList = implode(',', GeneralUtility::trimExplode(',', $subCfg['pidsOnly'], true));
595
596 3
            // process configuration if it is not page-specific or if the specific page is the current page:
597 3
            // TODO: Check if $pidOnlyList can be kept as Array instead of imploded
598 3
            if (! strcmp((string) $subCfg['pidsOnly'], '') || GeneralUtility::inList($pidOnlyList, strval($pageId))) {
599 3
600 3
                // Explode, process etc.:
601
                $res[$key] = [];
602
                $res[$key]['subCfg'] = $subCfg;
603 3
                $res[$key]['paramParsed'] = GeneralUtility::explodeUrl2Array($crawlerCfg[$key]);
604 3
                $res[$key]['paramExpanded'] = $this->expandParameters($res[$key]['paramParsed'], $pageId);
605
                $res[$key]['origin'] = 'pagets';
606
607
                // recognize MP value
608
                if (! $this->MP) {
609
                    $res[$key]['URLs'] = $this->compileUrls($res[$key]['paramExpanded'], ['?id=' . $pageId]);
610
                } else {
611
                    $res[$key]['URLs'] = $this->compileUrls($res[$key]['paramExpanded'], ['?id=' . $pageId . '&MP=' . $this->MP]);
0 ignored issues
show
Bug introduced by
Are you sure $this->MP of type true can be used in concatenation? ( Ignorable by Annotation )

If this is a false-positive, you can also ignore this issue in your code via the ignore-type  annotation

611
                    $res[$key]['URLs'] = $this->compileUrls($res[$key]['paramExpanded'], ['?id=' . $pageId . '&MP=' . /** @scrutinizer ignore-type */ $this->MP]);
Loading history...
612 4
                }
613 4
            }
614
        }
615
616 1
        // Get configuration from tx_crawler_configuration records up the rootline
617 1
        $crawlerConfigurations = $this->configurationRepository->getCrawlerConfigurationRecordsFromRootLine($pageId);
618
        foreach ($crawlerConfigurations as $configurationRecord) {
619
620
            // check access to the configuration record
621 1
            if (empty($configurationRecord['begroups']) || $this->getBackendUser()->isAdmin() || $this->hasGroupAccess($this->getBackendUser()->user['usergroup_cached_list'], $configurationRecord['begroups'])) {
622 1
                $pidOnlyList = implode(',', GeneralUtility::trimExplode(',', $configurationRecord['pidsonly'], true));
623
624
                // process configuration if it is not page-specific or if the specific page is the current page:
625 1
                // TODO: Check if $pidOnlyList can be kept as Array instead of imploded
626
                if (! strcmp($configurationRecord['pidsonly'], '') || GeneralUtility::inList($pidOnlyList, strval($pageId))) {
627
                    $key = $configurationRecord['name'];
628 1
629 1
                    // don't overwrite previously defined paramSets
630
                    if (! isset($res[$key])) {
631
632 1
                        /* @var $TSparserObject TypoScriptParser */
633 1
                        $TSparserObject = GeneralUtility::makeInstance(TypoScriptParser::class);
634 1
                        $TSparserObject->parse($configurationRecord['processing_instruction_parameters_ts']);
635 1
636 1
                        $subCfg = [
637 1
                            'procInstrFilter' => $configurationRecord['processing_instruction_filter'],
638 1
                            'procInstrParams.' => $TSparserObject->setup,
639
                            'baseUrl' => $configurationRecord['base_url'],
640
                            'force_ssl' => (int) $configurationRecord['force_ssl'],
641 1
                            'userGroups' => $configurationRecord['fegroups'],
642 1
                            'exclude' => $configurationRecord['exclude'],
643 1
                            'key' => $key,
644 1
                        ];
645 1
646 1
                        if (! in_array($pageId, $this->expandExcludeString($subCfg['exclude']), true)) {
647 1
                            $res[$key] = [];
648
                            $res[$key]['subCfg'] = $subCfg;
649
                            $res[$key]['paramParsed'] = GeneralUtility::explodeUrl2Array($configurationRecord['configuration']);
650
                            $res[$key]['paramExpanded'] = $this->expandParameters($res[$key]['paramParsed'], $pageId);
651
                            $res[$key]['URLs'] = $this->compileUrls($res[$key]['paramExpanded'], ['?id=' . $pageId]);
652
                            $res[$key]['origin'] = 'tx_crawler_configuration_' . $configurationRecord['uid'];
653
                        }
654 4
                    }
655
                }
656
            }
657
        }
658
659
        foreach ($GLOBALS['TYPO3_CONF_VARS']['EXTCONF']['crawler']['processUrls'] ?? [] as $func) {
660 4
            $params = [
661
                'res' => &$res,
662
            ];
663
            GeneralUtility::callUserFunction($func, $params, $this);
664
        }
665
        return $res;
666
    }
667 1
668
    /**
669 1
     * Find all configurations of subpages of a page
670 1
     * TODO: Write Functional Tests
671 1
     */
672 1
    public function getConfigurationsForBranch(int $rootid, int $depth): array
    {
        // Step 1: names of the parameter sets declared in the page TSconfig of the root page.
        // Only array values are real sets; their TypoScript key carries a trailing dot that is stripped.
        $names = [];
        $tsConfig = $this->getPageTSconfigForId($rootid);
        foreach ($tsConfig['tx_crawler.']['crawlerCfg.']['paramSets.'] ?? [] as $setName => $setConfiguration) {
            if (is_array($setConfiguration)) {
                $endsWithDot = substr($setName, -1) === '.';
                $names[] = $endsWithDot ? substr($setName, 0, -1) : $setName;
            }
        }

        // Step 2: collect the page ids of the rootline of the root page ...
        $pageIds = [];
        foreach (BackendUtility::BEgetRootLine($rootid) as $rootLinePage) {
            $pageIds[] = $rootLinePage['uid'];
        }

        // ... and of the page tree below it, limited to pages the backend user may show.
        /* @var PageTreeView $pageTree */
        $pageTree = GeneralUtility::makeInstance(PageTreeView::class);
        $showPermission = $this->getBackendUser()->getPagePermsClause(Permission::PAGE_SHOW);
        if (empty($showPermission)) {
            $pageTree->init('');
        } else {
            $pageTree->init('AND ' . $showPermission);
        }
        $pageTree->getTree($rootid, $depth, '');
        foreach ($pageTree->tree as $treeEntry) {
            $pageIds[] = $treeEntry['row']['uid'];
        }

        // Step 3: add the names of all configuration records stored on any of those pages.
        $queryBuilder = $this->getQueryBuilder('tx_crawler_configuration');
        $pidConstraint = $queryBuilder->expr()->in(
            'pid',
            $queryBuilder->createNamedParameter($pageIds, Connection::PARAM_INT_ARRAY)
        );
        $result = $queryBuilder
            ->select('name')
            ->from('tx_crawler_configuration')
            ->where($pidConstraint)
            ->execute();

        while ($record = $result->fetch()) {
            $names[] = $record['name'];
        }

        return $names;
    }
711
712
    /**
713
     * Check if a user has access to an item
714
     * (e.g. get the group list of the current logged in user from $GLOBALS['TSFE']->gr_list)
715
     *
716 3
     * @param string $groupList Comma-separated list of (fe_)group UIDs from a user
717
     * @param string $accessList Comma-separated list of (fe_)group UIDs of the item to access
718 3
     * @return bool TRUE if at least one of the users group UIDs is in the access list or the access list is empty
719 1
     * @see \TYPO3\CMS\Frontend\Page\PageRepository::getMultipleGroupsWhereClause()
720
     */
721 2
    public function hasGroupAccess($groupList, $accessList)
    {
        // An item without any access restriction is accessible for everybody.
        if (empty($accessList)) {
            return true;
        }

        foreach (GeneralUtility::intExplode(',', $groupList) as $groupUid) {
            // intExplode() yields integers while inList() is handed a list string and an item to look up in it.
            // Cast explicitly, as this file already does for the page id (strval($pageId)), so the call does
            // not depend on implicit int-to-string conversion while strict_types is enabled.
            if (GeneralUtility::inList($accessList, (string) $groupUid)) {
                return true;
            }
        }

        // None of the user's groups is part of the access list.
        return false;
    }
733
734
    /**
735
     * Will expand the parameters configuration to individual values. This follows a certain syntax of the value of each parameter.
736
     * Syntax of values:
737
     * - Basically: If the value is wrapped in [...] it will be expanded according to the following syntax, otherwise the value is taken literally
738
     * - Configuration is split by "|" and the parts are processed individually and finally added together
739
     * - For each configuration part:
740
     *         - "[int]-[int]" = Integer range, will be expanded to all values in between, values included, starting from low to high (max. 1000). Example "1-34" or "-40--30"
741
     *         - "_TABLE:[TCA table name];[_PID:[optional page id, default is current page]];[_ENABLELANG:1]" = Look up of table records from PID, filtering out deleted records. Example "_TABLE:tt_content; _PID:123"
742
     *        _ENABLELANG:1 picks only original records without their language overlays
743
     *         - Default: Literal value
744
     *
745
     * @param array $paramArray Array with key (GET var name) and values (value of GET var which is configuration for expansion)
746 11
     * @param integer $pid Current page ID
747
     * @return array
748
     *
749 11
     * TODO: Write Functional Tests
750 11
     */
751
    public function expandParameters($paramArray, $pid)
    {
        // Traverse parameter names:
        foreach ($paramArray as $p => $v) {
            $v = trim($v);

            // Only a value wrapped in [...] is expanded; everything else is taken literally
            // and becomes the single value of this parameter.
            if (! (strpos($v, '[') === 0 && substr($v, -1) === ']')) {
                $paramArray[$p] = [$v];
                continue;
            }

            // Strip the brackets and start with an empty value set for this parameter.
            $v = substr($v, 1, -1);
            $paramArray[$p] = [];

            // Explode parts and traverse them:
            foreach (explode('|', $v) as $pV) {
                // Look for integer range: (fx. 1-34 or -40--30 // reads minus 40 to minus 30)
                if (preg_match('/^(-?[0-9]+)\s*-\s*(-?[0-9]+)$/', trim($pV), $reg)) {
                    $reg = $this->swapIfFirstIsLargerThanSecond($reg);

                    // Traverse range, add values. The brake limits a range to 1000 values.
                    $runAwayBrake = 1000;
                    for ($a = $reg[1]; $a <= $reg[2]; $a++) {
                        $paramArray[$p][] = $a;
                        $runAwayBrake--;
                        if ($runAwayBrake <= 0) {
                            break;
                        }
                    }
                } elseif (strpos(trim($pV), '_TABLE:') === 0) {
                    // Parse the ";"-separated "KEY:value" options of the table lookup.
                    $subpartParams = [];
                    foreach (GeneralUtility::trimExplode(';', $pV) as $spV) {
                        // Pad to two elements so an option without ":" yields a null value
                        // instead of an undefined-offset warning.
                        [$pKey, $pVal] = array_pad(GeneralUtility::trimExplode(':', $spV), 2, null);
                        $subpartParams[$pKey] = $pVal;
                    }
                    $table = $subpartParams['_TABLE'] ?? '';

                    // Table exists:
                    if (isset($GLOBALS['TCA'][$table])) {
                        $lookUpPid = isset($subpartParams['_PID']) ? intval($subpartParams['_PID']) : intval($pid);
                        $recursiveDepth = isset($subpartParams['_RECURSIVE']) ? intval($subpartParams['_RECURSIVE']) : 0;
                        $pidField = isset($subpartParams['_PIDFIELD']) ? trim($subpartParams['_PIDFIELD']) : 'pid';
                        $where = $subpartParams['_WHERE'] ?? '';
                        $addTable = $subpartParams['_ADDTABLE'] ?? '';

                        // "_FIELD" is optional: fall back to the uid without reading an undefined key.
                        $fieldName = ! empty($subpartParams['_FIELD']) ? $subpartParams['_FIELD'] : 'uid';

                        // Only query for the uid or for a field that is configured as a TCA column.
                        if ($fieldName === 'uid' || ! empty($GLOBALS['TCA'][$table]['columns'][$fieldName])) {
                            $queryBuilder = $this->getQueryBuilder($table);

                            if ($recursiveDepth > 0) {
                                /** @var QueryGenerator $queryGenerator */
                                $queryGenerator = GeneralUtility::makeInstance(QueryGenerator::class);
                                $pidList = $queryGenerator->getTreeList($lookUpPid, $recursiveDepth, 0, 1);
                                $pidArray = GeneralUtility::intExplode(',', $pidList);
                            } else {
                                $pidArray = [(string) $lookUpPid];
                            }

                            // Drop all default restrictions, then filter out deleted records only.
                            $queryBuilder->getRestrictions()
                                ->removeAll()
                                ->add(GeneralUtility::makeInstance(DeletedRestriction::class));

                            $queryBuilder
                                ->select($fieldName)
                                ->from($table)
                                ->where(
                                    $queryBuilder->expr()->in($pidField, $queryBuilder->createNamedParameter($pidArray, Connection::PARAM_INT_ARRAY)),
                                    $where
                                );

                            if (! empty($addTable)) {
                                // TODO: Check if this works as intended!
                                $queryBuilder->add('from', $addTable);
                            }

                            // Not every table defines a translation pointer field in its TCA ctrl section.
                            $transOrigPointerField = $GLOBALS['TCA'][$table]['ctrl']['transOrigPointerField'] ?? null;

                            if (! empty($subpartParams['_ENABLELANG']) && $transOrigPointerField) {
                                $queryBuilder->andWhere(
                                    $queryBuilder->expr()->lte(
                                        $transOrigPointerField,
                                        0
                                    )
                                );
                            }

                            $statement = $queryBuilder->execute();

                            // Index by field value so every value is collected only once.
                            $rows = [];
                            while ($row = $statement->fetch()) {
                                $rows[$row[$fieldName]] = $row;
                            }

                            $paramArray[$p] = array_merge($paramArray[$p], array_keys($rows));
                        }
                    }
                } else {
                    // Just add value:
                    $paramArray[$p][] = $pV;
                }

                // Hook for processing own expandParameters place holder.
                // The registry entry is optional, so read it without triggering an undefined-index notice.
                $hookFunctions = $GLOBALS['TYPO3_CONF_VARS']['SC_OPTIONS']['crawler/class.tx_crawler_lib.php']['expandParameters'] ?? null;
                if (is_array($hookFunctions)) {
                    $_params = [
                        'pObj' => &$this,
                        'paramArray' => &$paramArray,
                        'currentKey' => $p,
                        'currentValue' => $pV,
                        'pid' => $pid,
                    ];
                    foreach ($hookFunctions as $_funcRef) {
                        GeneralUtility::callUserFunction($_funcRef, $_params, $this);
                    }
                }
            }

            // Make unique set of values and sort array by key:
            $paramArray[$p] = array_unique($paramArray[$p]);
            ksort($paramArray);
        }

        return $paramArray;
    }
881
882
    /**
883
     * Compiling URLs from parameter array (output of expandParameters())
884
     * The number of URLs will be the multiplication of the number of parameter values for each key
885 7
     *
886
     * @param array $paramArray Output of expandParameters(): Array with keys (GET var names) and for each an array of values
887 7
     * @param array $urls URLs accumulated in this array (for recursion)
888 7
     * @return array
889
     */
890
    public function compileUrls($paramArray, array $urls)
    {
        // Recursion end: no parameter left to combine, the accumulated URLs are the result.
        if (empty($paramArray)) {
            return $urls;
        }

        // Shift the first parameter (name and value set) off the stack:
        reset($paramArray);
        $varName = key($paramArray);
        $valueSet = array_shift($paramArray);

        // Combine every URL collected so far with every value of the current parameter.
        $newUrls = [];
        foreach ($urls as $url) {
            foreach ($valueSet as $val) {
                // An empty value adds nothing to the URL. Array keys may be integers, hence the
                // explicit string cast before rawurlencode() (strict_types is enabled in this file).
                $newUrls[] = $url . (strcmp((string) $val, '') ? '&' . rawurlencode((string) $varName) . '=' . rawurlencode((string) $val) : '');

                // Leave BOTH loops once the limit is exceeded. Breaking only the inner loop would
                // let every remaining base URL add one more entry beyond the limit.
                if (count($newUrls) > $this->maximumUrlsToCompile) {
                    break 2;
                }
            }
        }

        // Continue with the remaining parameters.
        return $this->compileUrls($paramArray, $newUrls);
    }
913
914
    /************************************
915
     *
916
     * Crawler log
917
     *
918
     ************************************/
919
920
    /**
921
     * Return array of records from crawler queue for input page ID
922
     *
923
     * @param integer $id Page ID for which to look up log entries.
924
     * @param boolean $doFlush If TRUE, then entries selected at DELETED(!) instead of selected!
925
     * @param boolean $doFullFlush
926 4
     * @param integer $itemsPerPage Limit the amount of entries per page default is 10
927
     * @return array
928 4
     *
929
     * @deprecated
930 4
     */
931 4
    public function getLogEntriesForPageId($id, QueueFilter $queueFilter, $doFlush = false, $doFullFlush = false, $itemsPerPage = 10)
    {
        // $doFullFlush is not used by this method; it stays in the signature for existing callers.

        // Base query: all queue records of the page, newest schedule first.
        $queryBuilder = GeneralUtility::makeInstance(ConnectionPool::class)->getQueryBuilderForTable($this->tableName);
        $queryBuilder
            ->select('*')
            ->from($this->tableName)
            ->where(
                $queryBuilder->expr()->eq('page_id', $queryBuilder->createNamedParameter($id, PDO::PARAM_INT))
            )
            ->orderBy('scheduled', 'DESC');

        // Narrow by execution state; any other filter value selects all entries.
        // NOTE(review): the switch compares the QueueFilter object loosely against strings, which
        // presumably relies on QueueFilter being convertible to string - confirm.
        switch ($queueFilter) {
            case 'pending':
                $queryBuilder->andWhere($queryBuilder->expr()->eq('exec_time', 0));
                break;
            case 'finished':
                $queryBuilder->andWhere($queryBuilder->expr()->gt('exec_time', 0));
                break;
        }

        // The flush is delegated to the queue repository; the query below is still executed afterwards.
        if ($doFlush) {
            $this->queueRepository->flushQueue($queueFilter);
        }

        // A non-positive page size means "no limit".
        if ($itemsPerPage > 0) {
            $queryBuilder->setMaxResults((int) $itemsPerPage);
        }

        return $queryBuilder->execute()->fetchAll();
    }
968
969
    /**
970
     * Return array of records from crawler queue for input set ID
971
     *
972
     * @param int $set_id Set ID for which to look up log entries.
973
     * @param string $filter Filter: "all" => all entries, "pending" => all that is not yet run, "finished" => all complete ones
974
     * @param bool $doFlush If TRUE, the selected entries are DELETED(!) instead of returned
975 6
     * @param int $itemsPerPage Limit the amount of entries per page default is 10
976
     * @return array
977 6
     *
978
     * @deprecated
979 6
     */
980 6
    public function getLogEntriesForSetId(int $set_id, string $filter = '', bool $doFlush = false, bool $doFullFlush = false, int $itemsPerPage = 10)
    {
        // Base query: all queue records of the set, newest schedule first.
        $queryBuilder = GeneralUtility::makeInstance(ConnectionPool::class)->getQueryBuilderForTable($this->tableName);
        $queryBuilder
            ->select('*')
            ->from($this->tableName)
            ->where(
                $queryBuilder->expr()->eq('set_id', $queryBuilder->createNamedParameter($set_id, PDO::PARAM_INT))
            )
            ->orderBy('scheduled', 'DESC');

        // AND-composite that mirrors the filter of the select query. It is only consumed by the
        // flush below, where it is handed to flushQueue() as the condition of the entries to remove.
        $expressionBuilder = GeneralUtility::makeInstance(ConnectionPool::class)
            ->getConnectionForTable($this->tableName)
            ->getExpressionBuilder();
        $flushConstraint = $expressionBuilder->andX();

        // Narrow by execution state; any other filter value selects all entries.
        switch ($filter) {
            case 'pending':
                $queryBuilder->andWhere($queryBuilder->expr()->eq('exec_time', 0));
                $flushConstraint->add($expressionBuilder->eq('exec_time', 0));
                break;
            case 'finished':
                $queryBuilder->andWhere($queryBuilder->expr()->gt('exec_time', 0));
                $flushConstraint->add($expressionBuilder->gt('exec_time', 0));
                break;
        }

        if ($doFlush) {
            // Restrict the flush to this set, unless a full flush (empty condition) is requested.
            // Nothing is selected in flush mode, an empty list is returned.
            $flushConstraint->add($expressionBuilder->eq('set_id', $set_id));
            $this->flushQueue($doFullFlush ? '' : $flushConstraint);
            return [];
        }

        // A non-positive page size means "no limit".
        if ($itemsPerPage > 0) {
            $queryBuilder->setMaxResults($itemsPerPage);
        }

        return $queryBuilder->execute()->fetchAll();
    }
1021
1022
    /**
1023
     * Adding call back entries to log (called from hooks typically, see indexed search class "class.crawler.php"
1024
     *
1025
     * @param integer $setId Set ID
1026
     * @param array $params Parameters to pass to call back function
1027
     * @param string $callBack Call back object reference, eg. 'EXT:indexed_search/class.crawler.php:&tx_indexedsearch_crawler'
1028
     * @param integer $page_id Page ID to attach it to
1029
     * @param integer $schedule Time at which to activate
1030
     */
1031
    public function addQueueEntry_callBack($setId, $params, $callBack, $page_id = 0, $schedule = 0): void
    {
        // Anything but an array is discarded; the call back reference always travels with the parameters.
        $parameters = is_array($params) ? $params : [];
        $parameters['_CALLBACKOBJ'] = $callBack;

        // A schedule of 0 (or anything casting to 0) means "now".
        $scheduledAt = (int) $schedule;
        if (! $scheduledAt) {
            $scheduledAt = $this->getCurrentTime();
        }

        // exec_time 0 and an empty result mark the entry as not yet processed.
        $queueEntry = [
            'page_id' => (int) $page_id,
            'parameters' => json_encode($parameters),
            'scheduled' => $scheduledAt,
            'exec_time' => 0,
            'set_id' => (int) $setId,
            'result_data' => '',
        ];

        $connection = GeneralUtility::makeInstance(ConnectionPool::class)->getConnectionForTable('tx_crawler_queue');
        $connection->insert('tx_crawler_queue', $queueEntry);
    }
1051
1052
    /************************************
1053
     *
1054
     * URL setting
1055
     *
1056
     ************************************/
1057
1058
    /**
1059
     * Setting a URL for crawling:
1060
     *
1061
     * @param integer $id Page ID
1062
     * @param string $url Complete URL
1063
     * @param array $subCfg Sub configuration array (from TS config)
1064 8
     * @param integer $tstamp Scheduled-time
1065
     * @param string $configurationHash (optional) configuration hash
1066
     * @param bool $skipInnerDuplicationCheck (optional) skip inner duplication check
1067
     * @return bool
1068
     */
1069
    public function addUrl(
        $id,
        $url,
        array $subCfg,
        $tstamp,
        $configurationHash = '',
        $skipInnerDuplicationCheck = false
    ) {
        $urlAdded = false;
        $rows = [];

        // Creating parameters:
        $parameters = [
            'url' => $url,
        ];

        // fe user group simulation. The sub configuration keys read below are optional, so a
        // missing key is treated like an empty value instead of raising an undefined-index notice.
        $uGs = implode(',', array_unique(GeneralUtility::intExplode(',', $subCfg['userGroups'] ?? '', true)));
        if ($uGs) {
            $parameters['feUserGroupList'] = $uGs;
        }

        // Setting processing instructions
        $parameters['procInstructions'] = GeneralUtility::trimExplode(',', $subCfg['procInstrFilter'] ?? '');
        if (is_array($subCfg['procInstrParams.'] ?? null)) {
            $parameters['procInstrParams'] = $subCfg['procInstrParams.'];
        }

        // Compile value array. The hash of the serialized parameters is what the
        // duplicate check below is called with.
        $parameters_serialized = json_encode($parameters);
        $fieldArray = [
            'page_id' => (int) $id,
            'parameters' => $parameters_serialized,
            'parameters_hash' => GeneralUtility::shortMD5($parameters_serialized),
            'configuration_hash' => $configurationHash,
            'scheduled' => $tstamp,
            'exec_time' => 0,
            'set_id' => (int) $this->setID,
            'result_data' => '',
            'configuration' => $subCfg['key'] ?? null,
        ];

        if ($this->registerQueueEntriesInternallyOnly) {
            // The entries will only be registered and not stored to the database.
            $this->queueEntries[] = $fieldArray;

            return $urlAdded;
        }

        if (! $skipInnerDuplicationCheck) {
            // Check if there is already an equal entry.
            $rows = $this->queueRepository->getDuplicateQueueItemsIfExists(
                (bool) ($this->extensionSettings['enableTimeslot'] ?? false),
                $tstamp,
                $this->getCurrentTime(),
                $fieldArray['page_id'],
                $fieldArray['parameters_hash']
            );
        }

        if (empty($rows)) {
            // No duplicate found (or check skipped): store the entry and announce it.
            $connectionForCrawlerQueue = GeneralUtility::makeInstance(ConnectionPool::class)->getConnectionForTable('tx_crawler_queue');
            $connectionForCrawlerQueue->insert(
                'tx_crawler_queue',
                $fieldArray
            );
            $uid = $connectionForCrawlerQueue->lastInsertId('tx_crawler_queue', 'qid');
            $urlAdded = true;

            $signalPayload = ['uid' => $uid, 'fieldArray' => $fieldArray];
            SignalSlotUtility::emitSignal(
                self::class,
                SignalSlotUtility::SIGNAL_URL_ADDED_TO_QUEUE,
                $signalPayload
            );
        } else {
            // Duplicate found: nothing is stored, only the duplicate signal is emitted.
            $signalPayload = ['rows' => $rows, 'fieldArray' => $fieldArray];
            SignalSlotUtility::emitSignal(
                self::class,
                SignalSlotUtility::SIGNAL_DUPLICATE_URL_IN_QUEUE,
                $signalPayload
            );
        }

        return $urlAdded;
    }
1154
1155 2
    /**
1156
     * Returns the current system time
1157 2
     *
1158
     * @return int
1159
     */
1160
    public function getCurrentTime()
    {
        // Unix timestamp of "now"; the queue methods of this class obtain their timestamps through this method.
        $now = time();

        return $now;
    }
1164
1165
    /************************************
1166
     *
1167
     * URL reading
1168
     *
1169
     ************************************/
1170
1171
    /**
1172
     * Read URL for single queue entry
1173
     *
1174
     * @param integer $queueId
1175
     * @param boolean $force If set, will process even if exec_time has been set!
1176
     * @return integer
1177
     */
1178
    public function readUrl($queueId, $force = false)
    {
        $queryBuilder = GeneralUtility::makeInstance(ConnectionPool::class)->getQueryBuilderForTable($this->tableName);
        $ret = 0;
        $this->logger->debug('crawler-readurl start ' . microtime(true));

        // Get entry:
        $queryBuilder
            ->select('*')
            ->from('tx_crawler_queue')
            ->where(
                $queryBuilder->expr()->eq('qid', $queryBuilder->createNamedParameter($queueId, PDO::PARAM_INT))
            );
        if (! $force) {
            // Without force only entries that are scheduled for a process and not yet executed are read.
            $queryBuilder
                ->andWhere('exec_time = 0')
                ->andWhere('process_scheduled > 0');
        }
        $queueRec = $queryBuilder->execute()->fetch();

        // No matching queue record: nothing to do. This path returns null rather than 0;
        // that is kept unchanged for existing callers.
        if (! is_array($queueRec)) {
            return;
        }

        SignalSlotUtility::emitSignal(
            self::class,
            SignalSlotUtility::SIGNAL_QUEUEITEM_PREPROCESS,
            [$queueId, &$queueRec]
        );

        // Set exec_time to lock record:
        $field_array = ['exec_time' => $this->getCurrentTime()];

        if (isset($this->processID)) {
            // If multiprocessing is used we need to store the id of the process which has handled this entry.
            $field_array['process_id_completed'] = $this->processID;
        }

        GeneralUtility::makeInstance(ConnectionPool::class)->getConnectionForTable('tx_crawler_queue')
            ->update(
                'tx_crawler_queue',
                $field_array,
                ['qid' => (int) $queueId]
            );

        $result = $this->queueExecutor->executeQueueItem($queueRec, $this);
        if (($result['content'] ?? null) === null) {
            $resultData = 'An errors happened';
        } else {
            /** @var JsonCompatibilityConverter $jsonCompatibilityConverter */
            $jsonCompatibilityConverter = GeneralUtility::makeInstance(JsonCompatibilityConverter::class);
            $resultData = $jsonCompatibilityConverter->convert($result['content']);
        }

        // Atm there's no need to point to specific pollable extensions.
        // $resultData is a plain string in the error case above, so it is only inspected when it
        // really is an array; the pollSuccess registry is optional and read without notices.
        $pollables = $GLOBALS['TYPO3_CONF_VARS']['EXTCONF']['crawler']['pollSuccess'] ?? null;
        $procInstructions = is_array($resultData) ? ($resultData['parameters']['procInstructions'] ?? null) : null;
        if (is_array($pollables) && is_array($procInstructions)) {
            foreach ($pollables as $pollable) {
                // Only check the success value if the instruction is running.
                // It is important to name the pollSuccess key same as the procInstructions key.
                if (in_array($pollable, $procInstructions, true) && ! empty($resultData['success'][$pollable])) {
                    $ret |= self::CLI_STATUS_POLLABLE_PROCESSED;
                }
            }
        }

        // Set result in log which also denotes the end of the processing of this entry.
        $field_array = ['result_data' => json_encode($result)];

        SignalSlotUtility::emitSignal(
            self::class,
            SignalSlotUtility::SIGNAL_QUEUEITEM_POSTPROCESS,
            [$queueId, &$field_array]
        );

        GeneralUtility::makeInstance(ConnectionPool::class)->getConnectionForTable('tx_crawler_queue')
            ->update(
                'tx_crawler_queue',
                $field_array,
                ['qid' => (int) $queueId]
            );

        $this->logger->debug('crawler-readurl stop ' . microtime(true));
        return $ret;
    }
1268
1269
    /**
     * Inserts a queue entry that is not stored yet, executes it immediately and
     * writes the JSON-encoded result back to the newly created record.
     *
     * @param array $field_array Queue field array; its "exec_time" is set to the current time before the insert
     *
     * @return array|bool|mixed|string The value returned by the queue executor for this item
     */
    public function readUrlFromArray($field_array)
    {
        // Setting exec_time locks the record as soon as it is written.
        $field_array['exec_time'] = $this->getCurrentTime();

        $queueConnection = GeneralUtility::makeInstance(ConnectionPool::class)
            ->getConnectionForTable($this->tableName);
        $queueConnection->insert($this->tableName, $field_array);
        $queueId = $queueConnection->lastInsertId($this->tableName, 'qid');

        $result = $this->queueExecutor->executeQueueItem($field_array, $this);

        // Storing the result in the log also marks the end of processing for this entry.
        $resultFields = ['result_data' => json_encode($result)];

        // The fields are handed over by reference so that slots may adjust them before they are persisted.
        SignalSlotUtility::emitSignal(
            self::class,
            SignalSlotUtility::SIGNAL_QUEUEITEM_POSTPROCESS,
            [$queueId, &$resultFields]
        );

        $queueConnection->update(
            $this->tableName,
            $resultFields,
            ['qid' => $queueId]
        );

        return $result;
    }
1305
1306
    /*****************************
1307
     *
1308
     * Compiling URLs to crawl - tools
1309
     *
1310
     *****************************/
1311
1312
    /**
     * Walks the page tree below a root page and renders the URL table rows for every page found.
     *
     * The crawl settings passed in are stored on the controller first, because
     * drawURLs_addRowsForPage() reads them from there.
     *
     * @param int $id Root page id to start from.
     * @param int $depth Depth of tree, 0=only id-page, 1= on sublevel, 99 = infinite
     * @param int $scheduledTime Unix Time when the URL is timed to be visited when put in queue
     * @param int $reqMinute Number of requests per minute (creates the interleave between requests)
     * @param bool $submitCrawlUrls If set, submits the URLs to queue in database (real crawling)
     * @param bool $downloadCrawlUrls If set (and submitcrawlUrls is false) will fill $downloadUrls with entries)
     * @param array $incomingProcInstructions Array of processing instructions
     * @param array $configurationSelection Array of configuration keys
     * @return string Concatenated HTML table rows for all visited pages
     */
    public function getPageTreeAndUrls(
        $id,
        $depth,
        $scheduledTime,
        $reqMinute,
        $submitCrawlUrls,
        $downloadCrawlUrls,
        array $incomingProcInstructions,
        array $configurationSelection
    ) {
        // Remember the request settings and start with empty tracking lists.
        $this->scheduledTime = $scheduledTime;
        $this->reqMinute = $reqMinute;
        $this->submitCrawlUrls = $submitCrawlUrls;
        $this->downloadCrawlUrls = $downloadCrawlUrls;
        $this->incomingProcInstructions = $incomingProcInstructions;
        $this->incomingConfigurationSelection = $configurationSelection;

        $this->duplicateTrack = [];
        $this->downloadUrls = [];

        // Build the tree, limited to pages the backend user is allowed to see.
        /** @var PageTreeView $pageTree */
        $pageTree = GeneralUtility::makeInstance(PageTreeView::class);
        $permsClause = $this->getBackendUser()->getPagePermsClause(Permission::PAGE_SHOW);
        $pageTree->init('AND ' . $permsClause);

        // The root page itself becomes the first tree entry when it is accessible.
        $rootPage = BackendUtility::readPageAccess($id, $permsClause);
        if (is_array($rootPage)) {
            $pageTree->tree[] = [
                'row' => $rootPage,
                'HTML' => $this->iconFactory->getIconForRecord('pages', $rootPage, Icon::SIZE_SMALL),
            ];
        }

        // Sub pages are only collected for a non-zero depth.
        if ($depth) {
            $pageTree->getTree($id, $depth, '');
        }

        $html = '';

        foreach ($pageTree->tree as $node) {
            $this->MP = false;

            // Mount points additionally contribute the rows of the pages mounted below them.
            if ($node['row']['doktype'] === PageRepository::DOKTYPE_MOUNTPOINT) {
                $mountpage = $this->pageRepository->getPage($node['row']['uid']);

                // MP parameter in the form "<mounted pid>-<mount point uid>"
                $this->MP = $mountpage[0]['mount_pid'] . '-' . $node['row']['uid'];

                $mountedTree = GeneralUtility::makeInstance(PageTreeView::class);
                $mountedTree->init('AND ' . $permsClause);
                $mountedTree->getTree($mountpage[0]['mount_pid'], $depth);

                foreach ($mountedTree->tree as $mountedNode) {
                    $mountedTitle = $mountedNode['HTML']
                        . BackendUtility::getRecordTitle('pages', $mountedNode['row'], true);
                    $html .= $this->drawURLs_addRowsForPage($mountedNode['row'], $mountedTitle);
                }

                if (! $mountpage[0]['mount_pid_ol']) {
                    // without mount_pid_ol the MP must not be used for the mount point page itself
                    $this->MP = false;
                } else {
                    // with mount_pid_ol the mounted page replaces the mount point page
                    $node['row']['uid'] = $mountpage[0]['mount_pid'];
                }
            }

            $nodeTitle = $node['HTML'] . BackendUtility::getRecordTitle('pages', $node['row'], true);
            $html .= $this->drawURLs_addRowsForPage($node['row'], $nodeTitle);
        }

        return $html;
    }
1404
1405
    /**
     * Expands an exclude string into the list of page ids it covers.
     *
     * The string is a comma separated list of entries in the form "pid" or "pid+depth":
     * - "12"   excludes page 12 only (depth 0)
     * - "12+2" excludes page 12 and the pages found two levels below it
     * - "12+"  excludes page 12 and its sub pages with depth 99
     *
     * Results are memoised in function-level static caches, which are shared by all
     * instances of the class for the duration of the request.
     *
     * @param string $excludeString Exclude string
     * @return array List of unique page ids (integers)
     */
    public function expandExcludeString($excludeString)
    {
        // internal static caches;
        static $expandedExcludeStringCache;
        static $treeCache;

        // A missing configuration value arrives as null; normalise it so it is a valid cache key.
        $excludeString = (string) $excludeString;

        if (empty($expandedExcludeStringCache[$excludeString])) {
            $pidList = [];

            if ($excludeString !== '') {
                /** @var PageTreeView $tree */
                $tree = GeneralUtility::makeInstance(PageTreeView::class);
                $tree->init('AND ' . $this->getBackendUser()->getPagePermsClause(Permission::PAGE_SHOW));

                $excludeParts = GeneralUtility::trimExplode(',', $excludeString);

                foreach ($excludeParts as $excludePart) {
                    // Entries without "+" yield a single element only; pad the result so
                    // the destructuring never reads an undefined offset.
                    [$pid, $depth] = array_pad(GeneralUtility::trimExplode('+', $excludePart), 2, '');
                    $pid = (int) $pid;

                    // default is "page only" = "depth=0"; a "+" without a depth means depth 99
                    if (empty($depth)) {
                        $depth = strpos($excludePart, '+') !== false ? 99 : 0;
                    }
                    $depth = (int) $depth;

                    $pidList[] = $pid;

                    if ($depth > 0) {
                        if (empty($treeCache[$pid][$depth])) {
                            $tree->reset();
                            $tree->getTree($pid, $depth);
                            $treeCache[$pid][$depth] = $tree->tree;
                        }

                        foreach ($treeCache[$pid][$depth] as $data) {
                            $pidList[] = (int) $data['row']['uid'];
                        }
                    }
                }
            }

            $expandedExcludeStringCache[$excludeString] = array_unique($pidList);
        }

        return $expandedExcludeStringCache[$excludeString];
    }
1456
1457
    /**
     * Create the rows for display of the page tree
     * For each page a number of rows are shown displaying GET variable configuration
     *
     * One table row is rendered per crawler configuration that applies to the page. Pages listed
     * in a configuration's "exclude" setting get a "No entries" row for that configuration, and a
     * page without any configuration gets a single "No entries" row.
     *
     * @param array $pageRow Page record; "uid" is compared against the expanded exclude list
     * @param string $pageTitle Markup for the title cell; it is inserted as-is, without further escaping
     * @return string HTML table rows
     */
    public function drawURLs_addRowsForPage(array $pageRow, string $pageTitle): string
    {
        $skipMessage = '';

        // Get list of configurations
        $configurations = $this->getUrlsForPageRow($pageRow, $skipMessage);

        if (! empty($this->incomingConfigurationSelection)) {
            // remove configuration that does not match the current selection
            foreach ($configurations as $confKey => $confArray) {
                if (! in_array($confKey, $this->incomingConfigurationSelection, true)) {
                    unset($configurations[$confKey]);
                }
            }
        }

        // expandExcludeString() returns integers, so the uid has to be an integer as well
        // for the strict comparison below to be able to match.
        $pageUid = (int) $pageRow['uid'];

        // Traverse parameter combinations:
        $c = 0;
        $content = '';
        if (! empty($configurations)) {
            foreach ($configurations as $confKey => $confArray) {
                // Numeric configuration names arrive as integer array keys.
                $confTitle = htmlspecialchars((string) $confKey);

                // Title column: only the first row carries the page title, spanning all rows of the page
                if (! $c) {
                    $titleClm = '<td rowspan="' . count($configurations) . '">' . $pageTitle . '</td>';
                } else {
                    $titleClm = '';
                }

                $excludedPageIds = $this->expandExcludeString($confArray['subCfg']['exclude'] ?? '');

                if (! in_array($pageUid, $excludedPageIds, true)) {
                    // URL list:
                    $urlList = $this->urlListFromUrlArray(
                        $confArray,
                        $pageRow,
                        $this->scheduledTime,
                        $this->reqMinute,
                        $this->submitCrawlUrls,
                        $this->downloadCrawlUrls,
                        $this->duplicateTrack,
                        $this->downloadUrls,
                        // if empty the urls won't be filtered by processing instructions
                        $this->incomingProcInstructions
                    );

                    // Expanded parameters:
                    $paramExpanded = '';
                    $calcAccu = [];
                    $calcRes = 1;
                    foreach ($confArray['paramExpanded'] as $gVar => $gVal) {
                        $paramExpanded .= '
                            <tr>
                                <td>' . htmlspecialchars('&' . $gVar . '=') . '<br/>' .
                            '(' . count($gVal) . ')' .
                            '</td>
                                <td nowrap="nowrap">' . nl2br(htmlspecialchars(implode(chr(10), $gVal))) . '</td>
                            </tr>
                        ';
                        // The number of combinations is the product of the value counts of all parameters.
                        $calcRes *= count($gVal);
                        $calcAccu[] = count($gVal);
                    }
                    $paramExpanded = '<table>' . $paramExpanded . '</table>';
                    $paramExpanded .= 'Comb: ' . implode('*', $calcAccu) . '=' . $calcRes;

                    // Options (both settings are optional)
                    $optionValues = '';
                    if (! empty($confArray['subCfg']['userGroups'])) {
                        $optionValues .= 'User Groups: ' . $confArray['subCfg']['userGroups'] . '<br/>';
                    }
                    if (! empty($confArray['subCfg']['procInstrFilter'])) {
                        $optionValues .= 'ProcInstr: ' . $confArray['subCfg']['procInstrFilter'] . '<br/>';
                    }

                    // Compile row:
                    $content .= '
                        <tr>
                            ' . $titleClm . '
                            <td>' . $confTitle . '</td>
                            <td>' . nl2br(htmlspecialchars(rawurldecode(trim(str_replace('&', chr(10) . '&', GeneralUtility::implodeArrayForUrl('', $confArray['paramParsed'])))))) . '</td>
                            <td>' . $paramExpanded . '</td>
                            <td nowrap="nowrap">' . $urlList . '</td>
                            <td nowrap="nowrap">' . $optionValues . '</td>
                            <td nowrap="nowrap">' . DebugUtility::viewArray($confArray['subCfg']['procInstrParams.'] ?? null) . '</td>
                        </tr>';
                } else {
                    $content .= '<tr>
                            ' . $titleClm . '
                            <td>' . $confTitle . '</td>
                            <td colspan="5"><em>No entries</em> (Page is excluded in this configuration)</td>
                        </tr>';
                }

                $c++;
            }
        } else {
            $message = ! empty($skipMessage) ? ' (' . $skipMessage . ')' : '';

            // Compile row:
            $content .= '
                <tr>
                    <td>' . $pageTitle . '</td>
                    <td colspan="6"><em>No entries</em>' . $message . '</td>
                </tr>';
        }

        return $content;
    }
1568
1569
    /*****************************
1570
     *
1571
     * CLI functions
1572
     *
1573
     *****************************/
1574
1575
    /**
     * Running the functionality of the CLI (crawling URLs from queue)
     *
     * @param int $countInARun Passed to the queue repository when fetching the records to be crawled
     * @param int $sleepTime Microseconds to pause after each processed entry
     * @param int $sleepAfterFinish Seconds to pause once the fetched entries have been handled
     * @return int Bit mask built from the CLI_STATUS_* constants and the readUrl() results
     */
    public function CLI_run(int $countInARun, int $sleepTime, int $sleepAfterFinish): int
    {
        $result = 0;
        $counter = 0;

        // Hooks run first, then the queue is cleaned up before entries are selected.
        $this->CLI_runHooks();
        $this->queueRepository->cleanupQueue();

        $rows = $this->queueRepository->fetchRecordsToBeCrawled($countInARun);

        if (empty($rows)) {
            $this->CLI_debug('Nothing within queue which needs to be processed (' . $this->CLI_buildProcessId() . ')');
        } else {
            $queueIds = [];
            foreach ($rows as $queueRow) {
                $queueIds[] = $queueRow['qid'];
            }

            $processId = $this->CLI_buildProcessId();

            // save the number of assigned queue entries to determine how many have been processed later
            $assignedCount = $this->queueRepository->updateProcessIdAndSchedulerForQueueIds($queueIds, $processId);
            $this->processRepository->updateProcessAssignItemsCount($assignedCount, $processId);

            // Fewer (or more) assigned rows than requested are treated as a collision with another process.
            if ($assignedCount !== count($queueIds)) {
                $this->CLI_debug('Nothing processed due to multi-process collision (' . $this->CLI_buildProcessId() . ')');
                return ($result | self::CLI_STATUS_ABORTED);
            }

            foreach ($rows as $queueRow) {
                $result |= $this->readUrl($queueRow['qid']);
                $counter++;

                // Just to relax the system
                usleep($sleepTime);

                // if the cli has been disabled since the start we return right away
                // and do NOT mark the process as ended.
                if ($this->crawler->isDisabled()) {
                    return ($result | self::CLI_STATUS_ABORTED);
                }

                if (! $this->processRepository->isProcessActive($this->CLI_buildProcessId())) {
                    // possible timeout
                    $this->CLI_debug('conflict / timeout (' . $this->CLI_buildProcessId() . ')');
                    $result |= self::CLI_STATUS_ABORTED;
                    break;
                }
            }

            sleep($sleepAfterFinish);

            $this->CLI_debug('Rows: ' . $counter . ' (' . $this->CLI_buildProcessId() . ')');
        }

        if ($counter > 0) {
            $result |= self::CLI_STATUS_PROCESSED;
        }

        return $result;
    }
1645
1646
    /**
     * Activate hooks
     *
     * Instantiates every class registered under
     * $GLOBALS['TYPO3_CONF_VARS']['EXTCONF']['crawler']['cli_hooks'] and calls its
     * crawler_init() method with this controller.
     */
    public function CLI_runHooks(): void
    {
        $hookClassNames = $GLOBALS['TYPO3_CONF_VARS']['EXTCONF']['crawler']['cli_hooks'] ?? [];

        foreach ($hookClassNames as $hookClassName) {
            $hook = GeneralUtility::makeInstance($hookClassName);
            if (! is_object($hook)) {
                continue;
            }
            $hook->crawler_init($this);
        }
    }
1658
1659
    /**
     * Try to acquire a new process with the given id
     * also performs some auto-cleanup for orphan processes
     *
     * Active processes whose ttl lies in the past count as orphans and are released at the end;
     * all other active processes count against the configured "processLimit".
     *
     * @param string $id identification string for the process
     * @return bool true when a process record was inserted, false when the system process id
     *              could not be determined or the process limit is reached
     * @todo preemption might not be the most elegant way to clean up
     */
    public function CLI_checkAndAcquireNewProcess($id)
    {
        $acquired = true;

        $systemProcessId = getmypid();
        if (! $systemProcessId) {
            return false;
        }

        $runningCount = 0;
        $orphanProcessIds = [];

        $activeProcesses = $this->processRepository->findAllActive();
        $currentTime = $this->getCurrentTime();

        /** @var Process $activeProcess */
        foreach ($activeProcesses as $activeProcess) {
            if ($activeProcess->getTtl() >= $currentTime) {
                $runningCount++;
                continue;
            }
            $orphanProcessIds[] = $activeProcess->getProcessId();
        }

        // a new process is only added while fewer than the allowed number are running
        if ($runningCount >= (int) $this->extensionSettings['processLimit']) {
            $this->CLI_debug('Processlimit reached (' . ($runningCount) . '/' . (int) $this->extensionSettings['processLimit'] . ')');
            $acquired = false;
        } else {
            $this->CLI_debug('add process ' . $this->CLI_buildProcessId() . ' (' . ($runningCount + 1) . '/' . (int) $this->extensionSettings['processLimit'] . ')');

            $processRecord = [
                'process_id' => $id,
                'active' => 1,
                'ttl' => $currentTime + (int) $this->extensionSettings['processMaxRunTime'],
                'system_process_id' => $systemProcessId,
            ];
            GeneralUtility::makeInstance(ConnectionPool::class)
                ->getConnectionForTable('tx_crawler_process')
                ->insert('tx_crawler_process', $processRecord);
        }

        // Clean-up happens regardless of whether a process could be acquired.
        $this->processRepository->deleteProcessesMarkedAsDeleted();
        $this->CLI_releaseProcesses($orphanProcessIds);

        return $acquired;
    }
1713
1714
    /**
     * Release a process and the required resources
     *
     * Before the requested processes are released, two clean-up statements are executed:
     * queue entries assigned to inactive processes lose their process assignment, and inactive
     * processes referenced by queue entries with exec_time = 0 get their system_process_id reset.
     *
     * @param mixed $releaseIds string with a single process-id or array with multiple process-ids
     * @return bool false when there is nothing to release, true otherwise
     */
    public function CLI_releaseProcesses($releaseIds)
    {
        $queryBuilder = GeneralUtility::makeInstance(ConnectionPool::class)->getQueryBuilderForTable($this->tableName);

        $releaseIds = is_array($releaseIds) ? $releaseIds : [$releaseIds];

        // nothing to release
        if (empty($releaseIds)) {
            return false;
        }

        // some kind of 2nd chance algo - this way you need at least 2 processes to have a real cleanup
        // this ensures that a single process can't mess up the entire process table

        // detach queue entries from processes that are no longer active
        $inactiveProcessIds = 'q.process_id IN(SELECT p.process_id FROM tx_crawler_process as p WHERE p.active = 0)';
        $queryBuilder
            ->update($this->tableName, 'q')
            ->where($inactiveProcessIds)
            ->set('q.process_scheduled', 0)
            ->set('q.process_id', '')
            ->execute();

        // FIXME: Not entirely sure that this is equivalent to the previous version
        // The builder is reused, so the SET part of the first statement has to be dropped.
        $queryBuilder->resetQueryPart('set');

        $processIsInactive = $queryBuilder->expr()->eq('active', 0);
        $hasQueueEntryWithoutExecTime = 'process_id IN(SELECT q.process_id FROM tx_crawler_queue as q WHERE q.exec_time = 0)';
        $queryBuilder
            ->update('tx_crawler_process')
            ->where($processIsInactive, $hasQueueEntryWithoutExecTime)
            ->set('system_process_id', 0)
            ->execute();

        $this->processRepository->markRequestedProcessesAsNotActive($releaseIds);
        $this->queueRepository->unsetProcessScheduledAndProcessIdForQueueEntries($releaseIds);

        return true;
    }
1764
1765 1
    /**
     * Create a unique Id for the current process
     *
     * The id is derived from the current microtime on first use and then kept in
     * $this->processID, so later calls return the same value.
     *
     * @return string the ID
     */
    public function CLI_buildProcessId()
    {
        if ($this->processID) {
            return $this->processID;
        }

        $this->processID = GeneralUtility::shortMD5(microtime(true));

        return $this->processID;
    }
1777
1778
    /**
     * Prints a message to the stdout (only if debug-mode is enabled)
     *
     * Debug mode is the "processDebug" extension setting.
     *
     * @param string $msg the message
     * @deprecated
     * @codeCoverageIgnore
     */
    public function CLI_debug($msg): void
    {
        $debugEnabled = (bool) (int) $this->extensionSettings['processDebug'];
        if (! $debugEnabled) {
            return;
        }

        echo $msg . "\n";
        flush();
    }
1792
1793
    /**
     * Cleans up entries that stayed for too long in the queue. These are:
     * - processed entries older than the "cleanUpProcessedAge" setting (in days)
     * - scheduled entries older than the "cleanUpScheduledAge" setting (in days)
     *
     * @deprecated
     */
    public function cleanUpOldQueueEntries(): void
    {
        // 24*60*60 Seconds in 24 hours
        $secondsPerDay = 86400;
        $processedAgeInSeconds = $this->extensionSettings['cleanUpProcessedAge'] * $secondsPerDay;
        $scheduledAgeInSeconds = $this->extensionSettings['cleanUpScheduledAge'] * $secondsPerDay;

        $now = time();
        $processedBefore = $now - $processedAgeInSeconds;
        $scheduledBefore = $now - $scheduledAgeInSeconds;

        $this->flushQueue(
            '(exec_time<>0 AND exec_time<' . $processedBefore . ') OR scheduled<=' . $scheduledBefore
        );
    }
1810
1811
    /**
     * Removes queue entries
     *
     * Before the matching entries are deleted, a SIGNAL_QUEUE_ENTRY_FLUSH signal is emitted
     * once per distinct set_id, carrying the qid/set_id pairs of that set.
     *
     * @param string $where SQL related filter for the entries which should be removed;
     *                      an empty value removes all entries
     *
     * @deprecated
     */
    protected function flushQueue($where = ''): void
    {
        $condition = (string) $where !== '' ? $where : '1=1';

        // The same builder instance is used for all statements of this method.
        $queryBuilder = $this->getQueryBuilder($this->tableName);

        $setIdRows = $queryBuilder
            ->selectLiteral('DISTINCT set_id')
            ->from($this->tableName)
            ->where($condition)
            ->execute()
            ->fetchAll();

        if (is_array($setIdRows)) {
            foreach ($setIdRows as $setIdRow) {
                $entriesOfSet = $queryBuilder
                    ->select('qid', 'set_id')
                    ->from($this->tableName)
                    ->where(
                        $condition,
                        $queryBuilder->expr()->eq('set_id', $setIdRow['set_id'])
                    )
                    ->execute()
                    ->fetchAll();

                $signalArguments = ['subSet' => $entriesOfSet];
                SignalSlotUtility::emitSignal(
                    self::class,
                    SignalSlotUtility::SIGNAL_QUEUE_ENTRY_FLUSH,
                    $signalArguments
                );
            }
        }

        $queryBuilder
            ->delete($this->tableName)
            ->where($condition)
            ->execute();
    }
1856
1857
    /**
     * This method determines duplicates for a queue entry with the same parameters and this timestamp.
     * If the timestamp is in the past, it will check if there is any unprocessed queue entry in the past.
     * If the timestamp is in the future it will check, if the queued entry has exactly the same timestamp
     *
     * Only entries without exec_time and without process_id are considered, and they have to
     * match the given page_id and parameters_hash.
     *
     * @param int $tstamp
     * @param array $fieldArray Needs the keys "page_id" and "parameters_hash"
     *
     * @return array List of qid values of the duplicates found
     * @deprecated
     */
    protected function getDuplicateRowsIfExist($tstamp, $fieldArray)
    {
        $duplicateQueueIds = [];

        $currentTime = $this->getCurrentTime();

        $queryBuilder = GeneralUtility::makeInstance(ConnectionPool::class)->getQueryBuilderForTable($this->tableName);
        $queryBuilder
            ->select('qid')
            ->from('tx_crawler_queue');

        if ($tstamp <= $currentTime) {
            // the entry is scheduled with "now" (or earlier)
            if (! $this->extensionSettings['enableTimeslot']) {
                $queryBuilder->where(
                    $queryBuilder->expr()->lte('scheduled', $currentTime)
                );
            } else {
                // with time slots enabled, a window of 100 seconds around "now" is accepted as well
                $slotStart = $currentTime - 100;
                $slotEnd = $currentTime + 100;
                $queryBuilder
                    ->where(
                        'scheduled BETWEEN ' . $slotStart . ' AND ' . $slotEnd . ''
                    )
                    ->orWhere(
                        $queryBuilder->expr()->lte('scheduled', $currentTime)
                    );
            }
        } elseif ($tstamp > $currentTime) {
            // entry with a timestamp in the future need to have the same schedule time
            $queryBuilder->where(
                $queryBuilder->expr()->eq('scheduled', $tstamp)
            );
        }

        $queryBuilder->andWhere('NOT exec_time');
        $queryBuilder->andWhere('NOT process_id');

        $pageIdParameter = $queryBuilder->createNamedParameter($fieldArray['page_id'], PDO::PARAM_INT);
        $queryBuilder->andWhere($queryBuilder->expr()->eq('page_id', $pageIdParameter));

        $hashParameter = $queryBuilder->createNamedParameter($fieldArray['parameters_hash'], PDO::PARAM_STR);
        $queryBuilder->andWhere($queryBuilder->expr()->eq('parameters_hash', $hashParameter));

        $statement = $queryBuilder->execute();

        while ($queueRow = $statement->fetch()) {
            $duplicateQueueIds[] = $queueRow['qid'];
        }

        return $duplicateQueueIds;
    }
1918
1919 10
    /**
     * Returns a md5 hash generated from a serialized configuration array.
     *
     * The keys "paramExpanded" and "URLs" are removed before hashing, so they do not
     * influence the result.
     *
     * @return string
     */
    protected function getConfigurationHash(array $configuration)
    {
        unset($configuration['paramExpanded'], $configuration['URLs']);

        $serializedConfiguration = serialize($configuration);

        return md5($serializedConfiguration);
    }
1930
1931
    /**
     * Build a URL from a Page and the Query String. If the page has a Site configuration, it can be built by using
     * the Site instance.
     *
     * Thin wrapper that forwards all arguments unchanged to a new UrlService instance.
     *
     * @param int $httpsOrHttp see tx_crawler_configuration.force_ssl
     * @throws SiteNotFoundException
     * @throws InvalidRouteArgumentsException
     *
     * @deprecated Using CrawlerController::getUrlFromPageAndQueryParameters() is deprecated since 9.1.1 and will be removed in v11.x, please use UrlService->getUrlFromPageAndQueryParameters() instead.
     * @codeCoverageIgnore
     */
    protected function getUrlFromPageAndQueryParameters(int $pageId, string $queryString, ?string $alternativeBaseUrl, int $httpsOrHttp): UriInterface
    {
        return (new UrlService())->getUrlFromPageAndQueryParameters(
            $pageId,
            $queryString,
            $alternativeBaseUrl,
            $httpsOrHttp
        );
    }
1947
1948
    /**
     * Makes sure the value at index 1 is not larger than the value at index 2
     * by exchanging the two when necessary. All other elements stay untouched.
     *
     * @param array $reg Array with the boundaries at index 1 and 2
     * @return array The array with index 1 and 2 in ascending order
     */
    protected function swapIfFirstIsLargerThanSecond(array $reg): array
    {
        if ($reg[1] > $reg[2]) {
            [$reg[1], $reg[2]] = [$reg[2], $reg[1]];
        }

        return $reg;
    }
1959
1960
    /**
     * Returns the backend user, taking it from $GLOBALS['BE_USER'] on first use
     * and keeping it in $this->backendUser afterwards.
     *
     * @return BackendUserAuthentication
     */
    private function getBackendUser()
    {
        // Make sure the _cli_ user is loaded
        Bootstrap::initializeBackendAuthentication();

        if ($this->backendUser !== null) {
            return $this->backendUser;
        }

        $this->backendUser = $GLOBALS['BE_USER'];

        return $this->backendUser;
    }
1972
1973 12
    /**
     * Get querybuilder for given table
     *
     * @param string $table Name of the table the query builder is requested for
     * @return QueryBuilder
     */
    private function getQueryBuilder(string $table)
    {
        $connectionPool = GeneralUtility::makeInstance(ConnectionPool::class);

        return $connectionPool->getQueryBuilderForTable($table);
    }
1982
}
1983