Passed
Pull Request — master (#674)
by Tomas Norre
10:43 queued 07:11
created

CrawlerController::getUrlsForPageId()   C

Complexity

Conditions 16
Paths 96

Size

Total Lines 93
Code Lines 51

Duplication

Lines 0
Ratio 0 %

Code Coverage

Tests 46
CRAP Score 16.0585

Importance

Changes 3
Bugs 0 Features 0
Metric Value
cc 16
eloc 51
c 3
b 0
f 0
nc 96
nop 1
dl 0
loc 93
ccs 46
cts 49
cp 0.9388
crap 16.0585
rs 5.5666

How to fix   Long Method    Complexity   

Long Method

Small methods make your code easier to understand, in particular if combined with a good name. Besides, if your method is small, finding a good name is usually much easier.

For example, if you find yourself adding comments to a method's body, this is usually a good sign to extract the commented part to a new method, and use the comment as a starting point when coming up with a good name for this new method.

Commonly applied refactorings include:

1
<?php
2
3
declare(strict_types=1);
4
5
namespace AOE\Crawler\Controller;
6
7
/***************************************************************
8
 *  Copyright notice
9
 *
10
 *  (c) 2020 AOE GmbH <[email protected]>
11
 *
12
 *  All rights reserved
13
 *
14
 *  This script is part of the TYPO3 project. The TYPO3 project is
15
 *  free software; you can redistribute it and/or modify
16
 *  it under the terms of the GNU General Public License as published by
17
 *  the Free Software Foundation; either version 3 of the License, or
18
 *  (at your option) any later version.
19
 *
20
 *  The GNU General Public License can be found at
21
 *  http://www.gnu.org/copyleft/gpl.html.
22
 *
23
 *  This script is distributed in the hope that it will be useful,
24
 *  but WITHOUT ANY WARRANTY; without even the implied warranty of
25
 *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
26
 *  GNU General Public License for more details.
27
 *
28
 *  This copyright notice MUST APPEAR in all copies of the script!
29
 ***************************************************************/
30
31
use AOE\Crawler\Configuration\ExtensionConfigurationProvider;
32
use AOE\Crawler\Converter\JsonCompatibilityConverter;
33
use AOE\Crawler\Crawler;
34
use AOE\Crawler\CrawlStrategy\CrawlStrategyFactory;
35
use AOE\Crawler\Domain\Model\Process;
36
use AOE\Crawler\Domain\Repository\ConfigurationRepository;
37
use AOE\Crawler\Domain\Repository\ProcessRepository;
38
use AOE\Crawler\Domain\Repository\QueueRepository;
39
use AOE\Crawler\QueueExecutor;
40
use AOE\Crawler\Service\UrlService;
41
use AOE\Crawler\Utility\SignalSlotUtility;
42
use AOE\Crawler\Value\QueueFilter;
43
use PDO;
44
use Psr\Http\Message\UriInterface;
45
use Psr\Log\LoggerAwareInterface;
46
use Psr\Log\LoggerAwareTrait;
47
use TYPO3\CMS\Backend\Tree\View\PageTreeView;
48
use TYPO3\CMS\Backend\Utility\BackendUtility;
49
use TYPO3\CMS\Core\Authentication\BackendUserAuthentication;
50
use TYPO3\CMS\Core\Compatibility\PublicMethodDeprecationTrait;
51
use TYPO3\CMS\Core\Compatibility\PublicPropertyDeprecationTrait;
52
use TYPO3\CMS\Core\Core\Bootstrap;
53
use TYPO3\CMS\Core\Core\Environment;
54
use TYPO3\CMS\Core\Database\Connection;
55
use TYPO3\CMS\Core\Database\ConnectionPool;
56
use TYPO3\CMS\Core\Database\Query\QueryBuilder;
57
use TYPO3\CMS\Core\Database\Query\Restriction\DeletedRestriction;
58
use TYPO3\CMS\Core\Database\QueryGenerator;
59
use TYPO3\CMS\Core\Exception\SiteNotFoundException;
60
use TYPO3\CMS\Core\Imaging\Icon;
61
use TYPO3\CMS\Core\Imaging\IconFactory;
62
use TYPO3\CMS\Core\Routing\InvalidRouteArgumentsException;
63
use TYPO3\CMS\Core\Site\Entity\Site;
64
use TYPO3\CMS\Core\Type\Bitmask\Permission;
65
use TYPO3\CMS\Core\TypoScript\Parser\TypoScriptParser;
66
use TYPO3\CMS\Core\Utility\DebugUtility;
67
use TYPO3\CMS\Core\Utility\GeneralUtility;
68
use TYPO3\CMS\Core\Utility\MathUtility;
69
use TYPO3\CMS\Extbase\Object\ObjectManager;
70
use TYPO3\CMS\Frontend\Page\PageRepository;
71
72
/**
73
 * Class CrawlerController
74
 *
75
 * @package AOE\Crawler\Controller
76
 */
77
class CrawlerController implements LoggerAwareInterface
78
{
79
    use LoggerAwareTrait;
80
    use PublicMethodDeprecationTrait;
81
    use PublicPropertyDeprecationTrait;
82
83
    public const CLI_STATUS_NOTHING_PROCCESSED = 0;
84
85
    //queue not empty
86
    public const CLI_STATUS_REMAIN = 1;
87
88
    //(some) queue items were processed
89
    public const CLI_STATUS_PROCESSED = 2;
90
91
    //instance didn't finish
92
    public const CLI_STATUS_ABORTED = 4;
93
94
    public const CLI_STATUS_POLLABLE_PROCESSED = 8;
95
96
    /**
97
     * @var integer
98
     */
99
    public $setID = 0;
100
101
    /**
102
     * @var string
103
     */
104
    public $processID = '';
105
106
    /**
107
     * @var array
108
     */
109
    public $duplicateTrack = [];
110
111
    /**
112
     * @var array
113
     */
114
    public $downloadUrls = [];
115
116
    /**
117
     * @var array
118
     */
119
    public $incomingProcInstructions = [];
120
121
    /**
122
     * @var array
123
     */
124
    public $incomingConfigurationSelection = [];
125
126
    /**
127
     * @var bool
128
     */
129
    public $registerQueueEntriesInternallyOnly = false;
130
131
    /**
132
     * @var array
133
     */
134
    public $queueEntries = [];
135
136
    /**
137
     * @var array
138
     */
139
    public $urlList = [];
140
141
    /**
142
     * @var array
143
     */
144
    public $extensionSettings = [];
145
146
    /**
147
     * Mount Point
148
     *
149
     * @var bool
150
     * Todo: Check what this is used for and adjust the type hint or code, as bool doesn't match the current code.
151
     */
152
    public $MP = false;
153
154
    /**
155
     * @var string
156
     * @deprecated
157
     */
158
    protected $processFilename;
159
160
    /**
161
     * Holds the internal access mode, which can be 'gui', 'cli' or 'cli_im'
162
     *
163
     * @var string
164
     * @deprecated
165
     */
166
    protected $accessMode;
167
168
    /**
169
     * @var QueueRepository
170
     */
171
    protected $queueRepository;
172
173
    /**
174
     * @var ProcessRepository
175
     */
176
    protected $processRepository;
177
178
    /**
179
     * @var ConfigurationRepository
180
     */
181
    protected $configurationRepository;
182
183
    /**
184
     * @var string
185
     */
186
    protected $tableName = 'tx_crawler_queue';
187
188
    /**
189
     * @var QueueExecutor
190
     */
191
    protected $queueExecutor;
192
193
    /**
194
     * @var int
195
     */
196
    protected $maximumUrlsToCompile = 10000;
197
198
    /**
199
     * @var IconFactory
200
     */
201
    protected $iconFactory;
202
203
    /**
204
     * @var string[]
205
     */
206
    // Maps method name => deprecation message. Presumably read by the
    // PublicMethodDeprecationTrait used by this class (which would explain why static
    // analysis reports the property as unused) — confirm against the trait.
    private $deprecatedPublicMethods = [
        'cleanUpOldQueueEntries' => 'Using CrawlerController::cleanUpOldQueueEntries() is deprecated since 9.0.1 and will be removed in v11.x, please use QueueRepository->cleanUpOldQueueEntries() instead.',
        'CLI_debug' => 'Using CrawlerController->CLI_debug() is deprecated since 9.1.3 and will be removed in v11.x',
        'getAccessMode' => 'Using CrawlerController->getAccessMode() is deprecated since 9.1.3 and will be removed in v11.x',
        'getLogEntriesForSetId' => 'Using CrawlerController::getLogEntriesForSetId() is deprecated since 9.0.1 and will be removed in v11.x',
        'flushQueue' => 'Using CrawlerController::flushQueue() is deprecated since 9.0.1 and will be removed in v11.x, please use QueueRepository->flushQueue() instead.',
        'setAccessMode' => 'Using CrawlerController->setAccessMode() is deprecated since 9.1.3 and will be removed in v11.x',
        'getDisabled' => 'Using CrawlerController->getDisabled() is deprecated since 9.1.3 and will be removed in v11.x, please use Crawler->isDisabled() instead',
        'setDisabled' => 'Using CrawlerController->setDisabled() is deprecated since 9.1.3 and will be removed in v11.x, please use Crawler->setDisabled() instead',
        'getProcessFilename' => 'Using CrawlerController->getProcessFilename() is deprecated since 9.1.3 and will be removed in v11.x',
        'setProcessFilename' => 'Using CrawlerController->setProcessFilename() is deprecated since 9.1.3 and will be removed in v11.x',
        'getDuplicateRowsIfExist' => 'Using CrawlerController->getDuplicateRowsIfExist() is deprecated since 9.1.4 and will be removed in v11.x, please use QueueRepository->getDuplicateQueueItemsIfExists() instead',
    ];
220
221
    /**
222
     * @var string[]
223
     */
224
    // Maps property name => deprecation message. Presumably read by the
    // PublicPropertyDeprecationTrait used by this class — confirm against the trait.
    private $deprecatedPublicProperties = [
        'accessMode' => 'Using CrawlerController->accessMode is deprecated since 9.1.3 and will be removed in v11.x',
        // The message must name the property it is registered for (it previously named accessMode).
        'processFilename' => 'Using CrawlerController->processFilename is deprecated since 9.1.3 and will be removed in v11.x',
    ];
228
229
    /**
230
     * @var BackendUserAuthentication|null
231
     */
232
    private $backendUser;
233
234
    /**
235
     * @var integer
236
     */
237
    private $scheduledTime = 0;
238
239
    /**
240
     * @var integer
241
     */
242
    private $reqMinute = 0;
243
244
    /**
245
     * @var bool
246
     */
247
    private $submitCrawlUrls = false;
248
249
    /**
250
     * @var bool
251
     */
252
    private $downloadCrawlUrls = false;
253
254
    /**
255
     * @var PageRepository
256
     */
257
    private $pageRepository;
258
259
    /**
260
     * @var Crawler
261
     */
262
    private $crawler;
263
264
    /************************************
265
     *
266
     * Getting URLs based on Page TSconfig
267
     *
268
     ************************************/
269
270 36
    public function __construct()
    {
        // Resolve collaborators through the TYPO3 object manager / makeInstance().
        $objectManager = GeneralUtility::makeInstance(ObjectManager::class);
        $crawlStrategyFactory = GeneralUtility::makeInstance(CrawlStrategyFactory::class);
        $this->queueRepository = $objectManager->get(QueueRepository::class);
        $this->processRepository = $objectManager->get(ProcessRepository::class);
        $this->configurationRepository = $objectManager->get(ConfigurationRepository::class);
        $this->pageRepository = $objectManager->get(PageRepository::class);
        $this->queueExecutor = GeneralUtility::makeInstance(QueueExecutor::class, $crawlStrategyFactory);
        $this->iconFactory = GeneralUtility::makeInstance(IconFactory::class);
        $this->crawler = GeneralUtility::makeInstance(Crawler::class);

        /** @scrutinizer ignore-deprecated */ $this->processFilename = Environment::getVarPath() . '/lock/tx_crawler.proc';

        /** @var ExtensionConfigurationProvider $configurationProvider */
        $configurationProvider = GeneralUtility::makeInstance(ExtensionConfigurationProvider::class);
        $settings = $configurationProvider->getExtensionConfiguration();
        $this->extensionSettings = is_array($settings) ? $settings : [];

        // Set defaults. Each key is read with "?? 0" so that an empty or partial settings
        // array does not raise an undefined-index notice; 0 is what a missing value was
        // coerced to before, so the resulting defaults are unchanged.
        if (MathUtility::convertToPositiveInteger($this->extensionSettings['countInARun'] ?? 0) === 0) {
            $this->extensionSettings['countInARun'] = 100;
        }

        $this->extensionSettings['processLimit'] = MathUtility::forceIntegerInRange(
            $this->extensionSettings['processLimit'] ?? 0,
            1,
            99,
            1
        );
        $this->maximumUrlsToCompile = MathUtility::forceIntegerInRange(
            $this->extensionSettings['maxCompileUrls'] ?? 0,
            1,
            1000000000,
            10000
        );
    }
297
298
    /**
299
     * Method to get the accessMode, which can be gui, cli or cli_im
300
     *
301
     * @return string
302
     * @deprecated
303
     */
304 1
    public function getAccessMode()
    {
        // Plain read of the deprecated $accessMode property.
        return /** @scrutinizer ignore-deprecated */ $this->accessMode;
    }
308
309
    /**
310
     * @param string $accessMode
311
     * @deprecated
312
     */
313 1
    public function setAccessMode($accessMode): void
    {
        // Plain write to the deprecated $accessMode property; the value is not validated.
        /** @scrutinizer ignore-deprecated */ $this->accessMode = $accessMode;
    }
317
318
    /**
319
     * Set disabled status to prevent processes from being processed
320
     *
321
     * @param bool $disabled (optional, defaults to true)
322
     * @deprecated
323
     */
324 2
    public function setDisabled($disabled = true): void
    {
        // The existence of this file is what getDisabled() reports as "disabled".
        $lockFile = /** @scrutinizer ignore-deprecated */ $this->processFilename;

        if ($disabled) {
            // Disable: create (or truncate) the file with empty content.
            GeneralUtility::writeFile($lockFile, '');
            return;
        }

        // Enable: remove the file if it is present.
        if (is_file($lockFile)) {
            unlink($lockFile);
        }
    }
334
335
    /**
336
     * Get disable status
337
     *
338
     * @return bool true if disabled
339
     * @deprecated
340
     */
341 2
    public function getDisabled()
    {
        // Disabled means the process file exists on disk.
        $lockFile = /** @scrutinizer ignore-deprecated */ $this->processFilename;

        return is_file($lockFile);
    }
345
346
    /**
347
     * @param string $filenameWithPath
348
     * @deprecated
349
     */
350 3
    public function setProcessFilename($filenameWithPath): void
    {
        // Overrides the path assigned in the constructor; the value is stored as given.
        /** @scrutinizer ignore-deprecated */ $this->processFilename = $filenameWithPath;
    }
354
355
    /**
356
     * @return string
357
     * @deprecated
358
     */
359 1
    public function getProcessFilename()
    {
        // Plain read of the deprecated $processFilename property.
        return /** @scrutinizer ignore-deprecated */ $this->processFilename;
    }
363
364
    /**
365
     * Sets the extension settings (unserialized counterpart of $TYPO3_CONF_VARS['EXT']['extConf']['crawler']).
366
     */
367 14
    public function setExtensionSettings(array $extensionSettings): void
    {
        // Replaces the settings wholesale; the defaults applied in the constructor are not re-applied here.
        $this->extensionSettings = $extensionSettings;
    }
371
372
    /**
373
     * Check if the given page should be crawled
374
     *
375
     * @return false|string false if the page should be crawled (not excluded), a skipMessage string if it should be skipped
376
     */
377 12
    public function checkIfPageShouldBeSkipped(array $pageRow)
    {
        // Hidden pages are skipped unless the "crawlHiddenPages" setting is truthy.
        // empty() treats a missing key like a falsy value instead of raising a notice.
        if (empty($this->extensionSettings['crawlHiddenPages']) && ! empty($pageRow['hidden'])) {
            return 'Because page is hidden';
        }

        // Normalised once; inList() compares on the string representation anyway.
        $doktype = (string) ($pageRow['doktype'] ?? '');

        // Doktypes that are never crawled.
        if (GeneralUtility::inList('3,4,199,254,255', $doktype)) {
            return 'Because doktype is not allowed';
        }

        // Additional doktype lists registered by other extensions.
        foreach ($GLOBALS['TYPO3_CONF_VARS']['EXTCONF']['crawler']['excludeDoktype'] ?? [] as $key => $doktypeList) {
            if (GeneralUtility::inList($doktypeList, $doktype)) {
                return 'Doktype was excluded by "' . $key . '"';
            }
        }

        // veto hook
        foreach ($GLOBALS['TYPO3_CONF_VARS']['EXTCONF']['crawler']['pageVeto'] ?? [] as $key => $func) {
            $params = [
                'pageRow' => $pageRow,
            ];
            // expects "false" if page is ok and "true" or a skipMessage if this page should _not_ be crawled
            $veto = GeneralUtility::callUserFunction($func, $params, $this);
            if ($veto === false) {
                continue;
            }
            if (is_string($veto)) {
                return $veto;
            }
            // Hooks appended with "[] =" have integer keys; htmlspecialchars() only accepts
            // strings under strict_types, so cast before escaping.
            return 'Veto from hook "' . htmlspecialchars((string) $key) . '"';
        }

        return false;
    }
411
412
    /**
413
     * Wrapper method for getUrlsForPageId()
414
     * It returns an array of configurations and no urls!
415
     *
416
     * @param array $pageRow Page record with at least dok-type and uid columns.
417
     * @param string $skipMessage
418
     * @return array
419
     * @see getUrlsForPageId()
420
     */
421 6
    public function getUrlsForPageRow(array $pageRow, &$skipMessage = '')
    {
        $message = $this->checkIfPageShouldBeSkipped($pageRow);
        if ($message !== false) {
            // Page is excluded: report the reason through the by-reference argument.
            $skipMessage = $message;
            return [];
        }

        // getUrlsForPageId() declares an int parameter and this file uses strict_types, so a
        // uid delivered as a numeric string would raise a TypeError without the cast
        // (urlListFromUrlArray() casts the uid the same way).
        $res = $this->getUrlsForPageId((int) $pageRow['uid']);
        $skipMessage = '';

        return $res;
    }
434
435
    /**
436
     * Creates a list of URLs from input array (and submits them to queue if asked for)
437
     * See Web > Info module script + "indexed_search"'s crawler hook-client using this!
438
     *
439
     * @param array $vv Information about URLs from pageRow to crawl.
440
     * @param array $pageRow Page row
441
     * @param int $scheduledTime Unix time to schedule indexing to, typically time()
442
     * @param int $reqMinute Number of requests per minute (creates the interleave between requests)
443
     * @param bool $submitCrawlUrls If set, submits the URLs to queue
444
     * @param bool $downloadCrawlUrls If set (and $submitCrawlUrls is false), $downloadUrls will be filled with entries
445
     * @param array $duplicateTrack Array which is passed by reference and contains an id per url to ensure we will not crawl duplicates
446
     * @param array $downloadUrls Array which will be filled with URLS for download if flag is set.
447
     * @param array $incomingProcInstructions Array of processing instructions
448
     * @return string List of URLs (meant for display in backend module)
449
     */
450 4
    public function urlListFromUrlArray(
        array $vv,
        array $pageRow,
        $scheduledTime,
        $reqMinute,
        $submitCrawlUrls,
        $downloadCrawlUrls,
        array &$duplicateTrack,
        array &$downloadUrls,
        array $incomingProcInstructions
    ) {
        // A missing "URLs" key is treated like a non-array value instead of raising a notice.
        if (! is_array($vv['URLs'] ?? null)) {
            return 'ERROR - no URL generated';
        }

        $subCfg = $vv['subCfg'] ?? [];
        $procInstrFilter = $subCfg['procInstrFilter'] ?? '';
        $userGroups = $subCfg['userGroups'] ?? '';

        $urlLog = [];
        $pageId = (int) $pageRow['uid'];
        $configurationHash = $this->getConfigurationHash($vv);
        $skipInnerCheck = $this->queueRepository->noUnprocessedQueueEntriesForPageWithConfigurationHashExist($pageId, $configurationHash);

        $urlService = new UrlService();

        // The processing-instruction filter does not depend on the individual URL, so it is
        // evaluated once. When it rejects the configuration no URL is listed at all, which
        // is the same empty string the per-URL "continue" used to produce.
        if (! $this->drawURLs_PIfilter($procInstrFilter, $incomingProcInstructions)) {
            return '';
        }

        // Seconds between two requests; 0 when no rate is given, which avoids a division by zero.
        $secondsPerRequest = $reqMinute ? 60 / $reqMinute : 0;

        foreach ($vv['URLs'] as $urlQuery) {
            $url = (string) $urlService->getUrlFromPageAndQueryParameters(
                $pageId,
                $urlQuery,
                $subCfg['baseUrl'] ?? null,
                $subCfg['force_ssl'] ?? 0
            );

            // Create key by which to determine unique-ness:
            $uKey = $url . '|' . $userGroups . '|' . $procInstrFilter;

            if (isset($duplicateTrack[$uKey])) {
                // The url key is already registered: just display it and do not resubmit it
                $urlLog[] = '<em><span class="text-muted">' . htmlspecialchars($url) . '</span></em>';
            } else {
                // Scheduled time, offset by the number of URLs tracked so far and
                // rounded down to a full minute. It is used for display only.
                $schTime = $scheduledTime + round(count($duplicateTrack) * $secondsPerRequest);
                $schTime = intval($schTime / 60) * 60;
                $formattedDate = BackendUtility::datetime($schTime);
                $this->urlList[] = '[' . $formattedDate . '] ' . $url;
                $logLine = '[' . $formattedDate . '] ' . htmlspecialchars($url);

                // Submit for crawling!
                if ($submitCrawlUrls) {
                    $added = $this->addUrl(
                        $pageId,
                        $url,
                        $subCfg,
                        $scheduledTime,
                        $configurationHash,
                        $skipInnerCheck
                    );
                    if ($added === false) {
                        $logLine .= ' (URL already existed)';
                    }
                } elseif ($downloadCrawlUrls) {
                    $downloadUrls[$url] = $url;
                }
                $urlLog[] = $logLine;
            }
            $duplicateTrack[$uKey] = true;
        }

        return implode('<br>', $urlLog);
    }
519
520
    /**
521
     * Returns true if input processing instruction is among registered ones.
522
     *
523
     * @param string $piString PI to test
524
     * @param array $incomingProcInstructions Processing instructions
525
     * @return boolean
526
     */
527 5
    public function drawURLs_PIfilter($piString, array $incomingProcInstructions)
    {
        // With no incoming instructions there is nothing to filter on: accept.
        if ($incomingProcInstructions === []) {
            return true;
        }

        // Accept as soon as one incoming instruction is an item of the comma list $piString.
        $isRegistered = false;
        foreach ($incomingProcInstructions as $instruction) {
            if (GeneralUtility::inList($piString, $instruction)) {
                $isRegistered = true;
                break;
            }
        }

        return $isRegistered;
    }
540
541 5
    /**
     * Returns the page TSconfig for the given page, or for the mount point page when
     * $this->MP is set, after passing it through the "getPageTSconfigForId" hooks.
     *
     * @param int $id Page id
     */
    public function getPageTSconfigForId($id): array
    {
        if (! $this->MP) {
            $pageTSconfig = BackendUtility::getPagesTSconfig($id);
        } else {
            // NOTE(review): $this->MP is documented as bool but used as a "<x>-<mountPointId>"
            // string here — confirm the intended format. The string cast keeps explode()
            // from throwing under strict_types, and a missing second part falls back to
            // page id 0 instead of an undefined-offset notice.
            $mountPointParts = explode('-', (string) $this->MP);
            $mountPointId = (int) ($mountPointParts[1] ?? 0);
            $pageTSconfig = BackendUtility::getPagesTSconfig($mountPointId);
        }

        // Call a hook to alter configuration. The TSconfig is handed over by reference so
        // hook functions can modify it in place.
        $hooks = $GLOBALS['TYPO3_CONF_VARS']['EXTCONF']['crawler']['getPageTSconfigForId'] ?? null;
        if (is_array($hooks)) {
            $params = [
                'pageId' => $id,
                'pageTSConfig' => &$pageTSconfig,
            ];
            foreach ($hooks as $userFunc) {
                GeneralUtility::callUserFunction($userFunc, $params, $this);
            }
        }

        return $pageTSconfig;
    }
563
564
    /**
565
     * This method returns an array of configurations.
566
     * Adds no urls!
567
     */
568 4
    public function getUrlsForPageId(int $pageId): array
    {
        // Page TSconfig parameter sets are collected first; configuration records only add
        // parameter sets whose key is not defined yet.
        $res = $this->collectParamSetsFromPageTsConfig($pageId);
        $res = $this->collectParamSetsFromConfigurationRecords($pageId, $res);

        // Registered "processUrls" hooks receive the result by reference and may alter it.
        foreach ($GLOBALS['TYPO3_CONF_VARS']['EXTCONF']['crawler']['processUrls'] ?? [] as $func) {
            $params = [
                'res' => &$res,
            ];
            GeneralUtility::callUserFunction($func, $params, $this);
        }

        return $res;
    }

    /**
     * Builds the configurations defined in page TSconfig
     * (tx_crawler.crawlerCfg.paramSets) that apply to the given page.
     *
     * @return array Configurations keyed by parameter set name
     */
    private function collectParamSetsFromPageTsConfig(int $pageId): array
    {
        $res = [];

        // Get page TSconfig for page ID and fetch the crawler configuration from it
        $pageTSconfig = $this->getPageTSconfigForId($pageId);
        $crawlerCfg = $pageTSconfig['tx_crawler.']['crawlerCfg.']['paramSets.'] ?? [];

        foreach ($crawlerCfg as $key => $values) {
            // Only the "<name>." sub-arrays start a configuration; the plain "<name>"
            // entries hold the parameter string that is read further down.
            if (! is_array($values)) {
                continue;
            }
            $key = str_replace('.', '', $key);

            // Sub configuration for a single configuration string:
            $subCfg = (array) ($crawlerCfg[$key . '.'] ?? []);
            $subCfg['key'] = $key;

            // Normalise the filter list by trimming whitespace around each item.
            $procInstrFilter = (string) ($subCfg['procInstrFilter'] ?? '');
            if ($procInstrFilter !== '') {
                $subCfg['procInstrFilter'] = implode(',', GeneralUtility::trimExplode(',', $procInstrFilter));
            }

            // process configuration if it is not page-specific or if the specific page is the current page:
            // TODO: Check if $pidOnlyList can be kept as Array instead of imploded
            $pidsOnly = (string) ($subCfg['pidsOnly'] ?? '');
            $pidOnlyList = implode(',', GeneralUtility::trimExplode(',', $pidsOnly, true));
            if ($pidsOnly !== '' && ! GeneralUtility::inList($pidOnlyList, (string) $pageId)) {
                continue;
            }

            // Explode, process etc.:
            $paramParsed = GeneralUtility::explodeUrl2Array($crawlerCfg[$key] ?? '');
            $paramExpanded = $this->expandParameters($paramParsed, $pageId);

            // recognize MP value
            $startUrl = '?id=' . $pageId;
            if ($this->MP) {
                $startUrl .= '&MP=' . $this->MP;
            }

            $res[$key] = [
                'subCfg' => $subCfg,
                'paramParsed' => $paramParsed,
                'paramExpanded' => $paramExpanded,
                'origin' => 'pagets',
                'URLs' => $this->compileUrls($paramExpanded, [$startUrl]),
            ];
        }

        return $res;
    }

    /**
     * Adds the configurations from tx_crawler_configuration records found up the rootline
     * of the given page. Keys already present in $res are left untouched.
     *
     * @param array $res Configurations collected so far, keyed by parameter set name
     * @return array $res extended by the applicable configuration records
     */
    private function collectParamSetsFromConfigurationRecords(int $pageId, array $res): array
    {
        $crawlerConfigurations = $this->configurationRepository->getCrawlerConfigurationRecordsFromRootLine($pageId);

        foreach ($crawlerConfigurations as $configurationRecord) {
            // check access to the configuration record
            $hasAccess = empty($configurationRecord['begroups'])
                || $this->getBackendUser()->isAdmin()
                || $this->hasGroupAccess($this->getBackendUser()->user['usergroup_cached_list'], $configurationRecord['begroups']);
            if (! $hasAccess) {
                continue;
            }

            // process configuration if it is not page-specific or if the specific page is the current page:
            // TODO: Check if $pidOnlyList can be kept as Array instead of imploded
            // The string cast avoids a TypeError in a strict_types string comparison when the column is NULL.
            $pidsOnly = (string) ($configurationRecord['pidsonly'] ?? '');
            $pidOnlyList = implode(',', GeneralUtility::trimExplode(',', $pidsOnly, true));
            if ($pidsOnly !== '' && ! GeneralUtility::inList($pidOnlyList, (string) $pageId)) {
                continue;
            }

            $key = $configurationRecord['name'];

            // don't overwrite previously defined paramSets
            if (isset($res[$key])) {
                continue;
            }

            /* @var $TSparserObject TypoScriptParser */
            $TSparserObject = GeneralUtility::makeInstance(TypoScriptParser::class);
            $TSparserObject->parse($configurationRecord['processing_instruction_parameters_ts']);

            $subCfg = [
                'procInstrFilter' => $configurationRecord['processing_instruction_filter'],
                'procInstrParams.' => $TSparserObject->setup,
                'baseUrl' => $configurationRecord['base_url'],
                'force_ssl' => (int) $configurationRecord['force_ssl'],
                'userGroups' => $configurationRecord['fegroups'],
                'exclude' => $configurationRecord['exclude'],
                'key' => $key,
            ];

            // Skip the record when the page is part of its expanded exclude list.
            if (in_array($pageId, $this->expandExcludeString($subCfg['exclude']), true)) {
                continue;
            }

            $paramParsed = GeneralUtility::explodeUrl2Array($configurationRecord['configuration']);
            $paramExpanded = $this->expandParameters($paramParsed, $pageId);

            $res[$key] = [
                'subCfg' => $subCfg,
                'paramParsed' => $paramParsed,
                'paramExpanded' => $paramExpanded,
                'URLs' => $this->compileUrls($paramExpanded, ['?id=' . $pageId]),
                'origin' => 'tx_crawler_configuration_' . $configurationRecord['uid'],
            ];
        }

        return $res;
    }
662
663
    /**
     * Collect the names of all crawler configurations that apply to a page branch.
     *
     * The result combines
     * - the keys of the "tx_crawler.crawlerCfg.paramSets." sub-arrays found in the
     *   page TSconfig of $rootid (a single trailing dot is stripped), and
     * - the "name" of every tx_crawler_configuration record whose pid is either on
     *   the root line of $rootid or in the page tree below it (down to $depth).
     *
     * Names are returned in discovery order and are not de-duplicated.
     *
     * TODO: Write Functional Tests
     *
     * @param int $rootid Uid of the page the branch starts at
     * @param int $depth Number of tree levels below $rootid to include
     * @return array List of configuration names
     */
    public function getConfigurationsForBranch(int $rootid, int $depth): array
    {
        $names = [];

        // 1) Configurations defined through page TSconfig: only sub-arrays count.
        $tsConfig = $this->getPageTSconfigForId($rootid);
        $paramSets = $tsConfig['tx_crawler.']['crawlerCfg.']['paramSets.'] ?? [];
        foreach ($paramSets as $setKey => $setValue) {
            if (is_array($setValue)) {
                $endsWithDot = substr($setKey, -1) === '.';
                $names[] = $endsWithDot ? substr($setKey, 0, -1) : $setKey;
            }
        }

        // 2) Page ids to search for configuration records: root line first ...
        $pageIds = [];
        foreach (BackendUtility::BEgetRootLine($rootid) as $rootLinePage) {
            $pageIds[] = $rootLinePage['uid'];
        }

        // ... followed by the pages of the tree below $rootid, filtered by the
        // backend user's PAGE_SHOW permission clause.
        /* @var PageTreeView $pageTree */
        $pageTree = GeneralUtility::makeInstance(PageTreeView::class);
        $permissionClause = $this->getBackendUser()->getPagePermsClause(Permission::PAGE_SHOW);
        if (empty($permissionClause)) {
            $pageTree->init('');
        } else {
            $pageTree->init('AND ' . $permissionClause);
        }
        $pageTree->getTree($rootid, $depth, '');
        foreach ($pageTree->tree as $treeEntry) {
            $pageIds[] = $treeEntry['row']['uid'];
        }

        // 3) Names of all configuration records stored on one of those pages.
        $queryBuilder = $this->getQueryBuilder('tx_crawler_configuration');
        $pidConstraint = $queryBuilder->expr()->in(
            'pid',
            $queryBuilder->createNamedParameter($pageIds, Connection::PARAM_INT_ARRAY)
        );
        $result = $queryBuilder
            ->select('name')
            ->from('tx_crawler_configuration')
            ->where($pidConstraint)
            ->execute();

        while ($record = $result->fetch()) {
            $names[] = $record['name'];
        }

        return $names;
    }
706
707
    /**
     * Decide whether a user may access an item, based on group membership.
     * (e.g. get the group list of the current logged in user from $GLOBALS['TSFE']->gr_list)
     *
     * An empty access list means the item is not restricted at all. Otherwise the
     * user needs to be a member of at least one of the listed groups.
     *
     * @param string $groupList Comma-separated list of (fe_)group UIDs from a user
     * @param string $accessList Comma-separated list of (fe_)group UIDs of the item to access
     * @return bool TRUE if at least one of the users group UIDs is in the access list or the access list is empty
     * @see \TYPO3\CMS\Frontend\Page\PageRepository::getMultipleGroupsWhereClause()
     */
    public function hasGroupAccess($groupList, $accessList)
    {
        $accessGranted = empty($accessList);

        if (! $accessGranted) {
            foreach (GeneralUtility::intExplode(',', $groupList) as $userGroupUid) {
                if (GeneralUtility::inList($accessList, $userGroupUid)) {
                    // One matching group is enough.
                    $accessGranted = true;
                    break;
                }
            }
        }

        return $accessGranted;
    }
728
729
    /**
     * Will expand the parameters configuration to individual values. This follows a certain syntax of the value of each parameter.
     * Syntax of values:
     * - Basically: If the value is wrapped in [...] it will be expanded according to the following syntax, otherwise the value is taken literally
     * - Configuration is split by "|" and the parts are processed individually and finally added together
     * - For each configuration part:
     *         - "[int]-[int]" = Integer range, will be expanded to all values in between, values included, starting from low to high (max. 1000). Example "1-34" or "-40--30"
     *         - "_TABLE:[TCA table name];[_PID:[optional page id, default is current page]];[_ENABLELANG:1]" = Look up of table records from PID, filtering out deleted records. Example "_TABLE:tt_content; _PID:123"
     *           Further optional sub-parts, separated by ";":
     *             _RECURSIVE:[depth]  also look up records on sub pages of the PID (default 0 = PID only)
     *             _PIDFIELD:[field]   field compared against the page id(s) (default "pid")
     *             _FIELD:[field]      field whose values are used (default "uid"; any other field must exist in TCA columns)
     *             _WHERE:[sql]        additional raw WHERE constraint, passed to the query as-is
     *             _ADDTABLE:[sql]     raw value set as the FROM part of the query
     *             _ENABLELANG:1       picks only original records without their language overlays
     *         - Default: Literal value
     * - After every part the "expandParameters" hooks registered in
     *   $GLOBALS['TYPO3_CONF_VARS']['SC_OPTIONS']['crawler/class.tx_crawler_lib.php'] are called.
     *
     * NOTE(review): _WHERE and _ADDTABLE are inserted into the SQL statement unescaped; this
     * assumes the configuration comes from trusted backend users only - confirm.
     *
     * @param array $paramArray Array with key (GET var name) and values (value of GET var which is configuration for expansion)
     * @param integer $pid Current page ID
     * @return array Same keys as the input, each value replaced by the list of its expanded values
     *
     * TODO: Write Functional Tests
     */
    public function expandParameters($paramArray, $pid)
    {
        // Traverse parameter names:
        foreach ($paramArray as $p => $v) {
            $v = trim($v);

            // Anything not encapsulated in square brackets is a literal and becomes the only value.
            if (strpos($v, '[') !== 0 || substr($v, -1) !== ']') {
                $paramArray[$p] = [$v];
                continue;
            }

            // Find the value inside the brackets and reset the paramArray value as an array.
            $v = substr($v, 1, -1);
            $paramArray[$p] = [];

            // Explode parts and traverse them:
            foreach (explode('|', $v) as $pV) {
                // Look for integer range: (fx. 1-34 or -40--30 // reads minus 40 to minus 30)
                if (preg_match('/^(-?[0-9]+)\s*-\s*(-?[0-9]+)$/', trim($pV), $reg)) {
                    $reg = $this->swapIfFirstIsLargerThanSecond($reg);

                    // Traverse range, add values. The brake limits the size of the range.
                    $runAwayBrake = 1000;
                    for ($a = $reg[1]; $a <= $reg[2]; $a++) {
                        $paramArray[$p][] = $a;
                        $runAwayBrake--;
                        if ($runAwayBrake <= 0) {
                            break;
                        }
                    }
                } elseif (strpos(trim($pV), '_TABLE:') === 0) {
                    // Parse "key:value" sub-parts. A sub-part without ":" yields a null value
                    // instead of reading an undefined offset.
                    $subpartParams = [];
                    foreach (GeneralUtility::trimExplode(';', $pV) as $spV) {
                        [$pKey, $pVal] = array_pad(GeneralUtility::trimExplode(':', $spV), 2, null);
                        $subpartParams[$pKey] = $pVal;
                    }

                    $recordValues = $this->fetchTableParameterValues($subpartParams, $pid);
                    if ($recordValues !== null) {
                        $paramArray[$p] = array_merge($paramArray[$p], $recordValues);
                    }
                } else {
                    // Just add value:
                    $paramArray[$p][] = $pV;
                }

                // Hook for processing own expandParameters place holder
                $hooks = $GLOBALS['TYPO3_CONF_VARS']['SC_OPTIONS']['crawler/class.tx_crawler_lib.php']['expandParameters'] ?? null;
                if (is_array($hooks)) {
                    $_params = [
                        'pObj' => $this,
                        'paramArray' => &$paramArray,
                        'currentKey' => $p,
                        'currentValue' => $pV,
                        'pid' => $pid,
                    ];
                    foreach ($hooks as $_funcRef) {
                        GeneralUtility::callUserFunction($_funcRef, $_params, $this);
                    }
                }
            }

            // Make unique set of values and sort array by key:
            $paramArray[$p] = array_unique($paramArray[$p]);
            ksort($paramArray);
        }

        return $paramArray;
    }

    /**
     * Look up the record values described by the sub-parts of a "_TABLE:..." expression.
     *
     * @param array $subpartParams Parsed sub-parts (_TABLE, _PID, _RECURSIVE, _PIDFIELD, _FIELD, _WHERE, _ADDTABLE, _ENABLELANG)
     * @param integer $pid Current page ID, used when no _PID is given
     * @return array|null Distinct values of the selected field, or NULL if the table is not
     *                    configured in TCA or the requested field is not a configured column
     */
    private function fetchTableParameterValues(array $subpartParams, $pid): ?array
    {
        $table = $subpartParams['_TABLE'] ?? '';
        if (! isset($GLOBALS['TCA'][$table])) {
            return null;
        }

        $lookUpPid = isset($subpartParams['_PID']) ? intval($subpartParams['_PID']) : intval($pid);
        $recursiveDepth = isset($subpartParams['_RECURSIVE']) ? intval($subpartParams['_RECURSIVE']) : 0;
        $pidField = isset($subpartParams['_PIDFIELD']) ? trim($subpartParams['_PIDFIELD']) : 'pid';
        $where = $subpartParams['_WHERE'] ?? '';
        $addTable = $subpartParams['_ADDTABLE'] ?? '';
        $fieldName = ! empty($subpartParams['_FIELD']) ? $subpartParams['_FIELD'] : 'uid';

        // Only "uid" or a column configured in TCA may be selected.
        if ($fieldName !== 'uid' && empty($GLOBALS['TCA'][$table]['columns'][$fieldName])) {
            return null;
        }

        $queryBuilder = $this->getQueryBuilder($table);

        if ($recursiveDepth > 0) {
            /** @var QueryGenerator $queryGenerator */
            $queryGenerator = GeneralUtility::makeInstance(QueryGenerator::class);
            $pidList = $queryGenerator->getTreeList($lookUpPid, $recursiveDepth, 0, 1);
            $pidArray = GeneralUtility::intExplode(',', $pidList);
        } else {
            $pidArray = [(string) $lookUpPid];
        }

        // Only the "deleted" restriction is applied.
        $queryBuilder->getRestrictions()
            ->removeAll()
            ->add(GeneralUtility::makeInstance(DeletedRestriction::class));

        $queryBuilder
            ->select($fieldName)
            ->from($table)
            ->where(
                $queryBuilder->expr()->in($pidField, $queryBuilder->createNamedParameter($pidArray, Connection::PARAM_INT_ARRAY)),
                $where
            );

        if (! empty($addTable)) {
            // TODO: Check if this works as intended!
            $queryBuilder->add('from', $addTable);
        }

        $transOrigPointerField = $GLOBALS['TCA'][$table]['ctrl']['transOrigPointerField'] ?? null;
        if (! empty($subpartParams['_ENABLELANG']) && $transOrigPointerField) {
            // Keep only records that do not point to a translation original.
            $queryBuilder->andWhere(
                $queryBuilder->expr()->lte(
                    $transOrigPointerField,
                    0
                )
            );
        }

        $statement = $queryBuilder->execute();

        // Index by field value so every value is returned only once.
        $rows = [];
        while ($row = $statement->fetch()) {
            $rows[$row[$fieldName]] = $row;
        }

        return array_keys($rows);
    }
876
877
    /**
     * Compiling URLs from parameter array (output of expandParameters())
     * The number of URLs will be the multiplication of the number of parameter values for each key,
     * capped at $this->maximumUrlsToCompile per recursion level.
     *
     * Every value of the first parameter is appended as "&name=value" (both raw-url-encoded)
     * to every URL accumulated so far; empty values leave the URL unchanged. The method then
     * recurses with the remaining parameters.
     *
     * @param array $paramArray Output of expandParameters(): Array with keys (GET var names) and for each an array of values
     * @param array $urls URLs accumulated in this array (for recursion)
     * @return array
     */
    public function compileUrls($paramArray, array $urls)
    {
        if (empty($paramArray)) {
            return $urls;
        }

        // Take the first parameter off the stack. unset() is used instead of array_shift(),
        // because array_shift() renumbers integer keys and would thereby rename the
        // remaining numeric GET variables.
        $valueSet = reset($paramArray);
        $varName = key($paramArray);
        unset($paramArray[$varName]);

        // Integer array keys must be cast, rawurlencode() only accepts strings under strict_types.
        $encodedVarName = rawurlencode((string) $varName);

        // Traverse value set:
        $newUrls = [];
        foreach ($urls as $url) {
            foreach ($valueSet as $val) {
                // Stop both loops once the limit is reached; leaving only the inner loop
                // would add one more URL for every remaining base URL.
                if (count($newUrls) >= $this->maximumUrlsToCompile) {
                    break 2;
                }

                $value = (string) $val;
                $newUrls[] = $url . ($value !== '' ? '&' . $encodedVarName . '=' . rawurlencode($value) : '');
            }
        }

        return $this->compileUrls($paramArray, $newUrls);
    }
908
909
    /************************************
910
     *
911
     * Crawler log
912
     *
913
     ************************************/
914
915
    /**
     * Return array of records from crawler queue for input page ID
     *
     * The entries are ordered by "scheduled", newest first.
     *
     * @param integer $id Page ID for which to look up log entries.
     * @param QueueFilter $queueFilter "pending" selects entries with exec_time = 0, "finished" entries with exec_time > 0, anything else selects all entries
     * @param boolean $doFlush If TRUE, $this->queueRepository->flushQueue($queueFilter) is called before the entries are read
     * @param boolean $doFullFlush Not used by this method
     * @param integer $itemsPerPage Limit the amount of entries per page default is 10; a value <= 0 disables the limit
     * @return array
     */
    public function getLogEntriesForPageId($id, QueueFilter $queueFilter, $doFlush = false, $doFullFlush = false, $itemsPerPage = 10)
    {
        $queryBuilder = GeneralUtility::makeInstance(ConnectionPool::class)->getQueryBuilderForTable($this->tableName);
        $queryBuilder
            ->select('*')
            ->from($this->tableName)
            ->where(
                $queryBuilder->expr()->eq('page_id', $queryBuilder->createNamedParameter($id, PDO::PARAM_INT))
            )
            ->orderBy('scheduled', 'DESC');

        // The filter object is compared loosely against the string labels.
        // NOTE(review): presumably relies on QueueFilter being convertible to string - confirm.
        switch ($queueFilter) {
            case 'pending':
                $queryBuilder->andWhere($queryBuilder->expr()->eq('exec_time', 0));
                break;
            case 'finished':
                $queryBuilder->andWhere($queryBuilder->expr()->gt('exec_time', 0));
                break;
        }

        if ($doFlush) {
            $this->queueRepository->flushQueue($queueFilter);
        }
        if ($itemsPerPage > 0) {
            $queryBuilder
                ->setMaxResults((int) $itemsPerPage);
        }

        return $queryBuilder->execute()->fetchAll();
    }
961
962
    /**
     * Return array of records from crawler queue for input set ID
     *
     * Without $doFlush the matching entries are returned, ordered by "scheduled" (newest first).
     * With $doFlush the queue is flushed instead and an empty array is returned.
     *
     * @param int $set_id Set ID for which to look up log entries.
     * @param string $filter Filter: "all" => all entries, "pending" => all that is not yet run, "finished" => all complete ones
     * @param bool $doFlush If TRUE, then entries selected at DELETED(!) instead of selected!
     * @param bool $doFullFlush If TRUE (together with $doFlush), flushQueue() is called with an empty condition instead of the set/filter condition
     * @param int $itemsPerPage Limit the amount of entries per page default is 10
     * @return array
     *
     * @deprecated
     */
    public function getLogEntriesForSetId(int $set_id, string $filter = '', bool $doFlush = false, bool $doFullFlush = false, int $itemsPerPage = 10)
    {
        $connectionPool = GeneralUtility::makeInstance(ConnectionPool::class);

        // SELECT statement used when the entries are read.
        $queryBuilder = $connectionPool->getQueryBuilderForTable($this->tableName);
        $queryBuilder
            ->select('*')
            ->from($this->tableName)
            ->where(
                $queryBuilder->expr()->eq('set_id', $queryBuilder->createNamedParameter($set_id, PDO::PARAM_INT))
            )
            ->orderBy('scheduled', 'DESC');

        // AND-combined condition handed to flushQueue(); it mirrors the filter of the SELECT.
        $expressionBuilder = GeneralUtility::makeInstance(ConnectionPool::class)
            ->getConnectionForTable($this->tableName)
            ->getExpressionBuilder();
        $flushCondition = $expressionBuilder->andX();

        if ($filter === 'pending') {
            $queryBuilder->andWhere($queryBuilder->expr()->eq('exec_time', 0));
            $flushCondition->add($expressionBuilder->eq('exec_time', 0));
        } elseif ($filter === 'finished') {
            $queryBuilder->andWhere($queryBuilder->expr()->gt('exec_time', 0));
            $flushCondition->add($expressionBuilder->gt('exec_time', 0));
        }

        if ($doFlush) {
            $addWhere = $flushCondition->add($expressionBuilder->eq('set_id', (int) $set_id));
            if ($doFullFlush) {
                $this->flushQueue('');
            } else {
                $this->flushQueue($addWhere);
            }

            return [];
        }

        if ($itemsPerPage > 0) {
            $queryBuilder->setMaxResults((int) $itemsPerPage);
        }

        return $queryBuilder->execute()->fetchAll();
    }
1014
1015
    /**
     * Adding call back entries to log (called from hooks typically, see indexed search class "class.crawler.php"
     *
     * The call back reference is stored in the parameters under the key "_CALLBACKOBJ";
     * the entry is inserted into tx_crawler_queue as not yet executed (exec_time = 0).
     *
     * @param integer $setId Set ID
     * @param array $params Parameters to pass to call back function; anything that is not an array is replaced by an empty array
     * @param string $callBack Call back object reference, eg. 'EXT:indexed_search/class.crawler.php:&tx_indexedsearch_crawler'
     * @param integer $page_id Page ID to attach it to
     * @param integer $schedule Time at which to activate; 0 means the current time
     */
    public function addQueueEntry_callBack($setId, $params, $callBack, $page_id = 0, $schedule = 0): void
    {
        $callBackParameters = is_array($params) ? $params : [];
        $callBackParameters['_CALLBACKOBJ'] = $callBack;

        $scheduledTime = (int) $schedule;
        if (! $scheduledTime) {
            $scheduledTime = $this->getCurrentTime();
        }

        $queueEntry = [
            'page_id' => (int) $page_id,
            'parameters' => json_encode($callBackParameters),
            'scheduled' => $scheduledTime,
            'exec_time' => 0,
            'set_id' => (int) $setId,
            'result_data' => '',
        ];

        $connection = GeneralUtility::makeInstance(ConnectionPool::class)->getConnectionForTable('tx_crawler_queue');
        $connection->insert('tx_crawler_queue', $queueEntry);
    }
1044
1045
    /************************************
1046
     *
1047
     * URL setting
1048
     *
1049
     ************************************/
1050
1051
    /**
     * Setting a URL for crawling:
     *
     * Builds the queue entry for the URL. Depending on $this->registerQueueEntriesInternallyOnly
     * the entry is either only collected in $this->queueEntries, or inserted into
     * tx_crawler_queue unless an equal entry is reported by the duplication check.
     * A signal is emitted for both the "added" and the "duplicate" case.
     *
     * @param integer $id Page ID
     * @param string $url Complete URL
     * @param array $subCfg Sub configuration array (from TS config); the keys "userGroups", "procInstrFilter", "procInstrParams." and "key" are read and may be missing
     * @param integer $tstamp Scheduled-time
     * @param string $configurationHash (optional) configuration hash
     * @param bool $skipInnerDuplicationCheck (optional) skip inner duplication check
     * @return bool TRUE only if the URL was inserted into the database queue
     */
    public function addUrl(
        $id,
        $url,
        array $subCfg,
        $tstamp,
        $configurationHash = '',
        $skipInnerDuplicationCheck = false
    ) {
        $urlAdded = false;
        $rows = [];

        // Creating parameters:
        $parameters = [
            'url' => $url,
        ];

        // fe user group simulation:
        $uGs = implode(',', array_unique(GeneralUtility::intExplode(',', (string) ($subCfg['userGroups'] ?? ''), true)));
        if ($uGs) {
            $parameters['feUserGroupList'] = $uGs;
        }

        // Setting processing instructions
        $parameters['procInstructions'] = GeneralUtility::trimExplode(',', (string) ($subCfg['procInstrFilter'] ?? ''));
        if (is_array($subCfg['procInstrParams.'] ?? null)) {
            $parameters['procInstrParams'] = $subCfg['procInstrParams.'];
        }

        // Compile value array:
        $parameters_serialized = json_encode($parameters);
        $fieldArray = [
            'page_id' => (int) $id,
            'parameters' => $parameters_serialized,
            'parameters_hash' => GeneralUtility::shortMD5($parameters_serialized),
            'configuration_hash' => $configurationHash,
            'scheduled' => $tstamp,
            'exec_time' => 0,
            'set_id' => (int) $this->setID,
            'result_data' => '',
            'configuration' => $subCfg['key'] ?? null,
        ];

        if ($this->registerQueueEntriesInternallyOnly) {
            // the entries will only be registered and not stored to the database
            $this->queueEntries[] = $fieldArray;

            return $urlAdded;
        }

        if (! $skipInnerDuplicationCheck) {
            // check if there is already an equal entry
            $rows = $this->queueRepository->getDuplicateQueueItemsIfExists(
                (bool) ($this->extensionSettings['enableTimeslot'] ?? false),
                $tstamp,
                $this->getCurrentTime(),
                $fieldArray['page_id'],
                $fieldArray['parameters_hash']
            );
        }

        if (empty($rows)) {
            $connectionForCrawlerQueue = GeneralUtility::makeInstance(ConnectionPool::class)->getConnectionForTable('tx_crawler_queue');
            $connectionForCrawlerQueue->insert(
                'tx_crawler_queue',
                $fieldArray
            );
            $uid = $connectionForCrawlerQueue->lastInsertId('tx_crawler_queue', 'qid');
            $rows[] = $uid;
            $urlAdded = true;

            $signalPayload = ['uid' => $uid, 'fieldArray' => $fieldArray];
            SignalSlotUtility::emitSignal(
                self::class,
                SignalSlotUtility::SIGNAL_URL_ADDED_TO_QUEUE,
                $signalPayload
            );
        } else {
            $signalPayload = ['rows' => $rows, 'fieldArray' => $fieldArray];
            SignalSlotUtility::emitSignal(
                self::class,
                SignalSlotUtility::SIGNAL_DUPLICATE_URL_IN_QUEUE,
                $signalPayload
            );
        }

        return $urlAdded;
    }
1147
1148
    /**
     * Provide the current system time as a Unix timestamp.
     *
     * @return int Value of time()
     */
    public function getCurrentTime()
    {
        $currentTimestamp = time();

        return $currentTimestamp;
    }
1157
1158
    /************************************
1159
     *
1160
     * URL reading
1161
     *
1162
     ************************************/
1163
1164
    /**
     * Read URL for single queue entry
     *
     * The entry is locked by setting its exec_time, executed through $this->queueExecutor,
     * and finally updated with the JSON-encoded result. Signals are emitted before the
     * execution and before the result is stored.
     *
     * @param integer $queueId
     * @param boolean $force If set, will process even if exec_time has been set!
     * @return integer|null Bit mask containing self::CLI_STATUS_POLLABLE_PROCESSED when a registered
     *                      "pollSuccess" instruction reported success, 0 otherwise;
     *                      NULL if no matching queue entry was found
     */
    public function readUrl($queueId, $force = false)
    {
        $queryBuilder = GeneralUtility::makeInstance(ConnectionPool::class)->getQueryBuilderForTable($this->tableName);
        $ret = 0;
        $this->logger->debug('crawler-readurl start ' . microtime(true));
        // Get entry:
        $queryBuilder
            ->select('*')
            ->from('tx_crawler_queue')
            ->where(
                $queryBuilder->expr()->eq('qid', $queryBuilder->createNamedParameter($queueId, PDO::PARAM_INT))
            );
        if (! $force) {
            $queryBuilder
                ->andWhere('exec_time = 0')
                ->andWhere('process_scheduled > 0');
        }
        $queueRec = $queryBuilder->execute()->fetch();

        if (! is_array($queueRec)) {
            return;
        }

        SignalSlotUtility::emitSignal(
            self::class,
            SignalSlotUtility::SIGNAL_QUEUEITEM_PREPROCESS,
            [$queueId, &$queueRec]
        );

        // Set exec_time to lock record:
        $field_array = ['exec_time' => $this->getCurrentTime()];

        if (isset($this->processID)) {
            // if multiprocessing is used we need to store the id of the process which has handled this entry
            $field_array['process_id_completed'] = $this->processID;
        }

        GeneralUtility::makeInstance(ConnectionPool::class)->getConnectionForTable('tx_crawler_queue')
            ->update(
                'tx_crawler_queue',
                $field_array,
                ['qid' => (int) $queueId]
            );

        $result = $this->queueExecutor->executeQueueItem($queueRec, $this);

        // Decode the returned content for the poll check below. The executor result is not
        // guaranteed to be an array with a "content" entry, so it is read defensively.
        $content = is_array($result) ? ($result['content'] ?? null) : null;
        $resultData = null;
        if ($content !== null) {
            /** @var JsonCompatibilityConverter $jsonCompatibilityConverter */
            $jsonCompatibilityConverter = GeneralUtility::makeInstance(JsonCompatibilityConverter::class);
            $resultData = $jsonCompatibilityConverter->convert($content);
        }

        // atm there's no need to point to specific pollable extensions
        $pollables = $GLOBALS['TYPO3_CONF_VARS']['EXTCONF']['crawler']['pollSuccess'] ?? null;
        // Only decoded array data can carry processing instructions; indexing a string or
        // boolean here would fail on PHP 8.
        $procInstructions = is_array($resultData) ? ($resultData['parameters']['procInstructions'] ?? null) : null;
        if (is_array($pollables) && is_array($procInstructions)) {
            foreach ($pollables as $pollable) {
                // only check the success value if the instruction is running
                // it is important to name the pollSuccess key same as the procInstructions key
                if (in_array($pollable, $procInstructions, true) && ! empty($resultData['success'][$pollable])) {
                    $ret |= self::CLI_STATUS_POLLABLE_PROCESSED;
                }
            }
        }

        // Set result in log which also denotes the end of the processing of this entry.
        $field_array = ['result_data' => json_encode($result)];

        SignalSlotUtility::emitSignal(
            self::class,
            SignalSlotUtility::SIGNAL_QUEUEITEM_POSTPROCESS,
            [$queueId, &$field_array]
        );

        GeneralUtility::makeInstance(ConnectionPool::class)->getConnectionForTable('tx_crawler_queue')
            ->update(
                'tx_crawler_queue',
                $field_array,
                ['qid' => (int) $queueId]
            );

        $this->logger->debug('crawler-readurl stop ' . microtime(true));
        return $ret;
    }
1261
1262
    /**
     * Read URL for not-yet-inserted log-entry
     *
     * The entry is inserted with exec_time set to the current time, executed through
     * $this->queueExecutor and then updated with the JSON-encoded result. The
     * post-process signal is emitted before the result is written.
     *
     * @param array $field_array Queue field array,
     *
     * @return array|bool|mixed|string The value returned by the queue executor
     */
    public function readUrlFromArray($field_array)
    {
        $queueTable = $this->tableName;

        // Set exec_time to lock record:
        $field_array['exec_time'] = $this->getCurrentTime();

        $connection = GeneralUtility::makeInstance(ConnectionPool::class)->getConnectionForTable($queueTable);
        $connection->insert($queueTable, $field_array);
        $queueId = $connection->lastInsertId($queueTable, 'qid');

        $result = $this->queueExecutor->executeQueueItem($field_array, $this);

        // Set result in log which also denotes the end of the processing of this entry.
        // The fields are passed to the signal by reference and written afterwards.
        $resultFields = ['result_data' => json_encode($result)];
        $signalArguments = [$queueId, &$resultFields];
        SignalSlotUtility::emitSignal(
            self::class,
            SignalSlotUtility::SIGNAL_QUEUEITEM_POSTPROCESS,
            $signalArguments
        );

        $connection->update($queueTable, $resultFields, ['qid' => $queueId]);

        return $result;
    }
1298
1299
    /*****************************
1300
     *
1301
     * Compiling URLs to crawl - tools
1302
     *
1303
     *****************************/
1304
1305
    /**
     * Builds the page tree below a root page and compiles the URL rows for every page in it.
     *
     * The request settings are stored on the controller, where drawURLs_addRowsForPage() reads
     * them; the duplicate tracker and the download URL list are reset on every call.
     *
     * @param int $id Root page id to start from.
     * @param int $depth Depth of tree, 0=only id-page, 1= on sublevel, 99 = infinite
     * @param int $scheduledTime Unix Time when the URL is timed to be visited when put in queue
     * @param int $reqMinute Number of requests per minute (creates the interleave between requests)
     * @param bool $submitCrawlUrls If set, submits the URLs to queue in database (real crawling)
     * @param bool $downloadCrawlUrls If set (and submitcrawlUrls is false) will fill $downloadUrls with entries)
     * @param array $incomingProcInstructions Array of processing instructions
     * @param array $configurationSelection Array of configuration keys
     * @return string Concatenated HTML table rows, as produced by drawURLs_addRowsForPage()
     */
    public function getPageTreeAndUrls(
        $id,
        $depth,
        $scheduledTime,
        $reqMinute,
        $submitCrawlUrls,
        $downloadCrawlUrls,
        array $incomingProcInstructions,
        array $configurationSelection
    ) {
        $this->scheduledTime = $scheduledTime;
        $this->reqMinute = $reqMinute;
        $this->submitCrawlUrls = $submitCrawlUrls;
        $this->downloadCrawlUrls = $downloadCrawlUrls;
        $this->incomingProcInstructions = $incomingProcInstructions;
        $this->incomingConfigurationSelection = $configurationSelection;

        $this->duplicateTrack = [];
        $this->downloadUrls = [];

        // Drawing tree:
        /* @var PageTreeView $tree */
        $tree = GeneralUtility::makeInstance(PageTreeView::class);
        $perms_clause = $this->getBackendUser()->getPagePermsClause(Permission::PAGE_SHOW);
        $tree->init('AND ' . $perms_clause);

        $pageInfo = BackendUtility::readPageAccess($id, $perms_clause);
        if (is_array($pageInfo)) {
            // Set root row:
            $tree->tree[] = [
                'row' => $pageInfo,
                'HTML' => $this->iconFactory->getIconForRecord('pages', $pageInfo, Icon::SIZE_SMALL),
            ];
        }

        // Get branch beneath:
        if ($depth) {
            $tree->getTree($id, $depth, '');
        }

        // Traverse page tree:
        $code = '';

        foreach ($tree->tree as $data) {
            $this->MP = false;

            // Recognize mount points. Database drivers may hand the column back as a numeric
            // string, so it is normalised before the strict comparison with the int constant.
            if ((int) $data['row']['doktype'] === PageRepository::DOKTYPE_MOUNTPOINT) {
                // NOTE(review): the [0] access below assumes getPage() returns a list of rows
                // rather than a single row - confirm against the repository in use.
                $mountpage = $this->pageRepository->getPage($data['row']['uid']);

                // fetch mounted pages
                $this->MP = $mountpage[0]['mount_pid'] . '-' . $data['row']['uid'];

                $mountTree = GeneralUtility::makeInstance(PageTreeView::class);
                $mountTree->init('AND ' . $perms_clause);
                $mountTree->getTree((int) $mountpage[0]['mount_pid'], $depth);

                foreach ($mountTree->tree as $mountData) {
                    $code .= $this->drawURLs_addRowsForPage(
                        $mountData['row'],
                        $mountData['HTML'] . BackendUtility::getRecordTitle('pages', $mountData['row'], true)
                    );
                }

                // replace page when mount_pid_ol is enabled
                if ($mountpage[0]['mount_pid_ol']) {
                    $data['row']['uid'] = $mountpage[0]['mount_pid'];
                } else {
                    // if the mount_pid_ol is not set the MP must not be used for the mountpoint page
                    $this->MP = false;
                }
            }

            $code .= $this->drawURLs_addRowsForPage(
                $data['row'],
                $data['HTML'] . BackendUtility::getRecordTitle('pages', $data['row'], true)
            );
        }

        return $code;
    }
1397
1398
    /**
     * Expands an exclude string into the list of page ids it covers.
     *
     * The string is a comma separated list of parts, each either "pid" (that page only),
     * "pid+" (the page and its subtree down to depth 99) or "pid+depth" (the page and its
     * subtree down to the given depth). Results are kept in static caches per exclude
     * string and per pid/depth combination.
     *
     * @param string $excludeString Exclude string
     * @return array List of unique integer page ids
     */
    public function expandExcludeString($excludeString)
    {
        // internal static caches;
        static $expandedExcludeStringCache;
        static $treeCache;

        if (empty($expandedExcludeStringCache[$excludeString])) {
            $pidList = [];

            if (! empty($excludeString)) {
                /** @var PageTreeView $tree */
                $tree = GeneralUtility::makeInstance(PageTreeView::class);
                $tree->init('AND ' . $this->getBackendUser()->getPagePermsClause(Permission::PAGE_SHOW));

                $excludeParts = GeneralUtility::trimExplode(',', $excludeString);

                foreach ($excludeParts as $excludePart) {
                    // A part without "+" yields a single segment, so the depth segment is optional.
                    $segments = GeneralUtility::trimExplode('+', $excludePart);
                    $pid = (int) $segments[0];
                    $depthSegment = $segments[1] ?? '';

                    if (empty($depthSegment)) {
                        // default is "page only" = "depth=0"; a bare "+" means the whole subtree
                        $depth = strpos($excludePart, '+') !== false ? 99 : 0;
                    } else {
                        $depth = (int) $depthSegment;
                    }

                    $pidList[] = $pid;

                    if ($depth > 0) {
                        if (empty($treeCache[$pid][$depth])) {
                            $tree->reset();
                            $tree->getTree($pid, $depth);
                            $treeCache[$pid][$depth] = $tree->tree;
                        }

                        foreach ($treeCache[$pid][$depth] as $data) {
                            $pidList[] = (int) $data['row']['uid'];
                        }
                    }
                }
            }

            $expandedExcludeStringCache[$excludeString] = array_unique($pidList);
        }

        return $expandedExcludeStringCache[$excludeString];
    }
1449
1450
    /**
     * Create the rows for display of the page tree
     * For each page a number of rows are shown displaying GET variable configuration
     *
     * @param array $pageRow Page record the rows are built for
     * @param string $pageTitle Title cell content; inserted as-is, so it must already be safe HTML
     * @return string HTML table rows for the page
     */
    public function drawURLs_addRowsForPage(array $pageRow, string $pageTitle): string
    {
        $skipMessage = '';

        // Get list of configurations
        $configurations = $this->getUrlsForPageRow($pageRow, $skipMessage);

        if (! empty($this->incomingConfigurationSelection)) {
            // remove configuration that does not match the current selection
            foreach ($configurations as $confKey => $confArray) {
                // Numeric configuration keys are integers as array keys; compare them as strings.
                if (! in_array((string) $confKey, $this->incomingConfigurationSelection, true)) {
                    unset($configurations[$confKey]);
                }
            }
        }

        // Traverse parameter combinations:
        $c = 0;
        $content = '';
        if (! empty($configurations)) {
            foreach ($configurations as $confKey => $confArray) {

                // Title column:
                if (! $c) {
                    $titleClm = '<td rowspan="' . count($configurations) . '">' . $pageTitle . '</td>';
                } else {
                    $titleClm = '';
                }

                // expandExcludeString() returns integer ids, so the uid has to be an integer
                // as well for the strict lookup to match.
                $excludedPageIds = $this->expandExcludeString($confArray['subCfg']['exclude'] ?? '');
                if (! in_array((int) $pageRow['uid'], $excludedPageIds, true)) {

                    // URL list:
                    $urlList = $this->urlListFromUrlArray(
                        $confArray,
                        $pageRow,
                        $this->scheduledTime,
                        $this->reqMinute,
                        $this->submitCrawlUrls,
                        $this->downloadCrawlUrls,
                        $this->duplicateTrack,
                        $this->downloadUrls,
                        // if empty the urls won't be filtered by processing instructions
                        $this->incomingProcInstructions
                    );

                    // Expanded parameters:
                    $paramExpanded = '';
                    $calcAccu = [];
                    $calcRes = 1;
                    foreach ($confArray['paramExpanded'] as $gVar => $gVal) {
                        $paramExpanded .= '
                            <tr>
                                <td>' . htmlspecialchars('&' . $gVar . '=') . '<br/>' .
                            '(' . count($gVal) . ')' .
                            '</td>
                                <td nowrap="nowrap">' . nl2br(htmlspecialchars(implode(chr(10), $gVal))) . '</td>
                            </tr>
                        ';
                        $calcRes *= count($gVal);
                        $calcAccu[] = count($gVal);
                    }
                    $paramExpanded = '<table>' . $paramExpanded . '</table>';
                    $paramExpanded .= 'Comb: ' . implode('*', $calcAccu) . '=' . $calcRes;

                    // Options (both are optional configuration values and end up in HTML)
                    $optionValues = '';
                    if (! empty($confArray['subCfg']['userGroups'])) {
                        $optionValues .= 'User Groups: ' . htmlspecialchars((string) $confArray['subCfg']['userGroups']) . '<br/>';
                    }
                    if (! empty($confArray['subCfg']['procInstrFilter'])) {
                        $optionValues .= 'ProcInstr: ' . htmlspecialchars((string) $confArray['subCfg']['procInstrFilter']) . '<br/>';
                    }

                    // Compile row:
                    $content .= '
                        <tr>
                            ' . $titleClm . '
                            <td>' . htmlspecialchars((string) $confKey) . '</td>
                            <td>' . nl2br(htmlspecialchars(rawurldecode(trim(str_replace('&', chr(10) . '&', GeneralUtility::implodeArrayForUrl('', $confArray['paramParsed'])))))) . '</td>
                            <td>' . $paramExpanded . '</td>
                            <td nowrap="nowrap">' . $urlList . '</td>
                            <td nowrap="nowrap">' . $optionValues . '</td>
                            <td nowrap="nowrap">' . DebugUtility::viewArray($confArray['subCfg']['procInstrParams.'] ?? null) . '</td>
                        </tr>';
                } else {
                    $content .= '<tr>
                            ' . $titleClm . '
                            <td>' . htmlspecialchars((string) $confKey) . '</td>
                            <td colspan="5"><em>No entries</em> (Page is excluded in this configuration)</td>
                        </tr>';
                }

                $c++;
            }
        } else {
            $message = ! empty($skipMessage) ? ' (' . $skipMessage . ')' : '';

            // Compile row:
            $content .= '
                <tr>
                    <td>' . $pageTitle . '</td>
                    <td colspan="6"><em>No entries</em>' . $message . '</td>
                </tr>';
        }

        return $content;
    }
1561
1562
    /*****************************
1563
     *
1564
     * CLI functions
1565
     *
1566
     *****************************/
1567
1568
    /**
     * Running the functionality of the CLI (crawling URLs from queue)
     *
     * @param int $countInARun Passed to the queue repository when fetching the records to crawl
     * @param int $sleepTime Pause after each processed entry, handed to usleep()
     * @param int $sleepAfterFinish Pause after the whole batch, handed to sleep()
     * @return int Bitmask built from the CLI_STATUS_* constants
     */
    public function CLI_run(int $countInARun, int $sleepTime, int $sleepAfterFinish): int
    {
        $status = 0;
        $processedEntries = 0;

        // First, run hooks:
        $this->CLI_runHooks();

        // Clean up the queue
        $this->queueRepository->cleanupQueue();

        // Select entries:
        $queueRows = $this->queueRepository->fetchRecordsToBeCrawled($countInARun);

        if (empty($queueRows)) {
            $this->CLI_debug('Nothing within queue which needs to be processed (' . $this->CLI_buildProcessId() . ')');

            return $status;
        }

        $queueIds = [];
        foreach ($queueRows as $queueRow) {
            $queueIds[] = $queueRow['qid'];
        }

        $processId = $this->CLI_buildProcessId();

        // Remember how many entries were assigned so the processed amount can be determined later.
        $assignedRows = $this->queueRepository->updateProcessIdAndSchedulerForQueueIds($queueIds, $processId);
        $this->processRepository->updateProcessAssignItemsCount($assignedRows, $processId);

        if ($assignedRows !== count($queueIds)) {
            $this->CLI_debug('Nothing processed due to multi-process collision (' . $this->CLI_buildProcessId() . ')');

            return $status | self::CLI_STATUS_ABORTED;
        }

        foreach ($queueRows as $queueRow) {
            $status |= $this->readUrl($queueRow['qid']);
            $processedEntries++;

            // Just to relax the system
            usleep($sleepTime);

            // If the crawler got disabled in the meantime, leave immediately;
            // the process is deliberately NOT marked as ended.
            if ($this->crawler->isDisabled()) {
                return $status | self::CLI_STATUS_ABORTED;
            }

            if (! $this->processRepository->isProcessActive($this->CLI_buildProcessId())) {
                // possible timeout
                $this->CLI_debug('conflict / timeout (' . $this->CLI_buildProcessId() . ')');
                $status |= self::CLI_STATUS_ABORTED;
                break;
            }
        }

        sleep($sleepAfterFinish);

        $this->CLI_debug('Rows: ' . $processedEntries . ' (' . $this->CLI_buildProcessId() . ')');

        if ($processedEntries > 0) {
            $status |= self::CLI_STATUS_PROCESSED;
        }

        return $status;
    }
1638
1639
    /**
     * Instantiates every class registered under the "cli_hooks" key of the crawler
     * extension configuration and calls its crawler_init() method with this controller.
     */
    public function CLI_runHooks(): void
    {
        $registeredHooks = $GLOBALS['TYPO3_CONF_VARS']['EXTCONF']['crawler']['cli_hooks'] ?? [];

        foreach ($registeredHooks as $hookReference) {
            $hook = GeneralUtility::makeInstance($hookReference);
            if (! is_object($hook)) {
                continue;
            }

            $hook->crawler_init($this);
        }
    }
1651
1652
    /**
     * Try to acquire a new process with the given id
     * also performs some auto-cleanup for orphan processes
     *
     * @param string $id identification string for the process
     * @return boolean true when a process record was inserted, false when the system
     *                 process id is unavailable or the process limit is reached
     * @todo preemption might not be the most elegant way to clean up
     */
    public function CLI_checkAndAcquireNewProcess($id)
    {
        $systemPid = getmypid();
        if (! $systemPid) {
            return false;
        }

        $runningProcesses = 0;
        $expiredProcessIds = [];

        $activeProcesses = $this->processRepository->findAllActive();
        $now = $this->getCurrentTime();

        /** @var Process $activeProcess */
        foreach ($activeProcesses as $activeProcess) {
            if ($activeProcess->getTtl() >= $now) {
                $runningProcesses++;
                continue;
            }

            // The time to live has passed: collect the process for release further down.
            $expiredProcessIds[] = $activeProcess->getProcessId();
        }

        $processLimit = (int) $this->extensionSettings['processLimit'];
        $slotAvailable = $runningProcesses < $processLimit;

        if ($slotAvailable) {
            // Fewer running processes than allowed: register a new one.
            $this->CLI_debug('add process ' . $this->CLI_buildProcessId() . ' (' . ($runningProcesses + 1) . '/' . $processLimit . ')');

            GeneralUtility::makeInstance(ConnectionPool::class)
                ->getConnectionForTable('tx_crawler_process')
                ->insert(
                    'tx_crawler_process',
                    [
                        'process_id' => $id,
                        'active' => 1,
                        'ttl' => $now + (int) $this->extensionSettings['processMaxRunTime'],
                        'system_process_id' => $systemPid,
                    ]
                );
        } else {
            $this->CLI_debug('Processlimit reached (' . $runningProcesses . '/' . $processLimit . ')');
        }

        $this->processRepository->deleteProcessesMarkedAsDeleted();
        $this->CLI_releaseProcesses($expiredProcessIds);

        return $slotAvailable;
    }
1706
1707
    /**
     * Release a process and the required resources
     *
     * Besides releasing the given ids through the process and queue repositories, two
     * clean-up statements are executed first: queue entries assigned to inactive processes
     * are detached, and the system process id of inactive processes is reset.
     *
     * @param mixed $releaseIds string with a single process-id or array with multiple process-ids
     * @return boolean false when there is nothing to release, true otherwise
     */
    public function CLI_releaseProcesses($releaseIds)
    {
        $queryBuilder = GeneralUtility::makeInstance(ConnectionPool::class)->getQueryBuilderForTable($this->tableName);

        // Accept a single id as well as a list of ids.
        if (! is_array($releaseIds)) {
            $releaseIds = [$releaseIds];
        }

        if (empty($releaseIds)) {
            //nothing to release
            return false;
        }

        // some kind of 2nd chance algo - this way you need at least 2 processes to have a real cleanup
        // this ensures that a single process can't mess up the entire process table

        // Detach queue entries from inactive processes: every queue row whose process_id belongs
        // to a process with active = 0 gets process_scheduled = 0 and an empty process_id.

        $queryBuilder
            ->update($this->tableName, 'q')
            ->where(
                'q.process_id IN(SELECT p.process_id FROM tx_crawler_process as p WHERE p.active = 0)'
            )
            ->set('q.process_scheduled', 0)
            ->set('q.process_id', '')
            ->execute();

        // FIXME: Not entirely sure that this is equivalent to the previous version
        // The same builder instance is reused for the next statement, so the SET part
        // of the statement above is cleared first.
        $queryBuilder->resetQueryPart('set');

        // Reset system_process_id on inactive processes that are still referenced by
        // queue entries with exec_time = 0.
        $queryBuilder
            ->update('tx_crawler_process')
            ->where(
                $queryBuilder->expr()->eq('active', 0),
                'process_id IN(SELECT q.process_id FROM tx_crawler_queue as q WHERE q.exec_time = 0)'
            )
            ->set('system_process_id', 0)
            ->execute();

        // Finally release the requested processes themselves and their queue entries.
        $this->processRepository->markRequestedProcessesAsNotActive($releaseIds);
        $this->queueRepository->unsetProcessScheduledAndProcessIdForQueueEntries($releaseIds);

        return true;
    }
1757
1758
    /**
     * Create a unique Id for the current process
     *
     * The id is derived from the current microtime on the first call and reused afterwards.
     *
     * @return string the ID
     */
    public function CLI_buildProcessId()
    {
        if (! $this->processID) {
            // microtime(true) is a float; cast it explicitly so the hashing helper receives
            // a string regardless of how strictly its parameter is typed.
            $this->processID = GeneralUtility::shortMD5((string) microtime(true));
        }
        return $this->processID;
    }
1770
1771
    /**
     * Echoes a message followed by a newline, but only when the "processDebug"
     * extension setting evaluates to a non-zero integer.
     *
     * @param string $msg the message
     * @deprecated
     * @codeCoverageIgnore
     */
    public function CLI_debug($msg): void
    {
        $debugEnabled = (int) $this->extensionSettings['processDebug'] !== 0;
        if (! $debugEnabled) {
            return;
        }

        echo $msg . "\n";
        flush();
    }
1785
1786
    /**
     * Cleans up entries that stayed for too long in the queue. These are:
     * - processed entries older than the "cleanUpProcessedAge" setting (in days)
     * - scheduled entries older than the "cleanUpScheduledAge" setting (in days)
     *
     * @deprecated
     */
    public function cleanUpOldQueueEntries(): void
    {
        // 24*60*60 Seconds in 24 hours
        $maxProcessedAge = $this->extensionSettings['cleanUpProcessedAge'] * 86400;
        $maxScheduledAge = $this->extensionSettings['cleanUpScheduledAge'] * 86400;

        $currentTimestamp = time();
        $processedBefore = $currentTimestamp - $maxProcessedAge;
        $scheduledBefore = $currentTimestamp - $maxScheduledAge;

        $this->flushQueue(
            '(exec_time<>0 AND exec_time<' . $processedBefore . ') OR scheduled<=' . $scheduledBefore
        );
    }
1803
1804
    /**
     * Removes queue entries
     *
     * Before the rows are deleted, a flush signal is emitted once per distinct set_id
     * with the matching qid/set_id rows as payload.
     *
     * @param string $where SQL related filter for the entries which should be removed;
     *                      an empty filter removes everything
     *
     * @deprecated
     */
    protected function flushQueue($where = ''): void
    {
        $filter = (string) $where !== '' ? $where : '1=1';

        $builder = $this->getQueryBuilder($this->tableName);

        $setIdRows = $builder
            ->selectLiteral('DISTINCT set_id')
            ->from($this->tableName)
            ->where($filter)
            ->execute()
            ->fetchAll();

        foreach (is_array($setIdRows) ? $setIdRows : [] as $setIdRow) {
            $entriesOfSet = $builder
                ->select('qid', 'set_id')
                ->from($this->tableName)
                ->where(
                    $filter,
                    $builder->expr()->eq('set_id', $setIdRow['set_id'])
                )
                ->execute()
                ->fetchAll();

            $signalArguments = ['subSet' => $entriesOfSet];
            SignalSlotUtility::emitSignal(
                self::class,
                SignalSlotUtility::SIGNAL_QUEUE_ENTRY_FLUSH,
                $signalArguments
            );
        }

        $builder
            ->delete($this->tableName)
            ->where($filter)
            ->execute();
    }
1849
1850
    /**
     * Looks up unprocessed, unassigned queue entries with the same page id and parameter hash.
     *
     * For a timestamp that is not in the future, every entry scheduled up to now counts
     * (with the timeslot setting enabled, entries scheduled within 100 seconds around now
     * count as well). For a timestamp in the future only entries with exactly that
     * scheduled time count.
     *
     * @param int $tstamp
     * @param array $fieldArray Must provide "page_id" and "parameters_hash"
     *
     * @return array List of qid values of the duplicates
     * @deprecated
     */
    protected function getDuplicateRowsIfExist($tstamp, $fieldArray)
    {
        $duplicateIds = [];

        $now = $this->getCurrentTime();

        $query = GeneralUtility::makeInstance(ConnectionPool::class)->getQueryBuilderForTable($this->tableName);
        $query
            ->select('qid')
            ->from('tx_crawler_queue');

        if ($tstamp > $now) {
            // An entry with a timestamp in the future needs to have the same schedule time.
            $query->where(
                $query->expr()->eq('scheduled', $tstamp)
            );
        } elseif ($tstamp <= $now) {
            // The entry is scheduled with "now".
            $scheduledUntilNow = $query->expr()->lte('scheduled', $now);

            if ($this->extensionSettings['enableTimeslot']) {
                $slotStart = $now - 100;
                $slotEnd = $now + 100;
                $query
                    ->where('scheduled BETWEEN ' . $slotStart . ' AND ' . $slotEnd)
                    ->orWhere($scheduledUntilNow);
            } else {
                $query->where($scheduledUntilNow);
            }
        }

        $pageIdParameter = $query->createNamedParameter($fieldArray['page_id'], PDO::PARAM_INT);
        $hashParameter = $query->createNamedParameter($fieldArray['parameters_hash'], PDO::PARAM_STR);

        $query
            ->andWhere('NOT exec_time')
            ->andWhere('NOT process_id')
            ->andWhere($query->expr()->eq('page_id', $pageIdParameter))
            ->andWhere($query->expr()->eq('parameters_hash', $hashParameter));

        $statement = $query->execute();

        while ($queueRow = $statement->fetch()) {
            $duplicateIds[] = $queueRow['qid'];
        }

        return $duplicateIds;
    }
1911
1912
    /**
     * Returns a md5 hash generated from a serialized configuration array.
     *
     * The "paramExpanded" and "URLs" entries are left out of the hash.
     *
     * @return string
     */
    protected function getConfigurationHash(array $configuration)
    {
        $hashRelevantConfiguration = array_diff_key(
            $configuration,
            ['paramExpanded' => true, 'URLs' => true]
        );

        return md5(serialize($hashRelevantConfiguration));
    }
1923
1924
    /**
     * Thin wrapper that delegates building the URL for a page and query string to UrlService.
     *
     * @param int $httpsOrHttp see tx_crawler_configuration.force_ssl
     * @throws SiteNotFoundException
     * @throws InvalidRouteArgumentsException
     *
     * @deprecated Using CrawlerController::getUrlFromPageAndQueryParameters() is deprecated since 9.1.1 and will be removed in v11.x, please use UrlService->getUrlFromPageAndQueryParameters() instead.
     * @codeCoverageIgnore
     */
    protected function getUrlFromPageAndQueryParameters(int $pageId, string $queryString, ?string $alternativeBaseUrl, int $httpsOrHttp): UriInterface
    {
        return (new UrlService())->getUrlFromPageAndQueryParameters(
            $pageId,
            $queryString,
            $alternativeBaseUrl,
            $httpsOrHttp
        );
    }
1940
1941 1
    /**
     * Makes sure the value at index 1 is not larger than the value at index 2,
     * swapping the two when necessary. All other entries are left untouched.
     */
    protected function swapIfFirstIsLargerThanSecond(array $reg): array
    {
        if ($reg[1] <= $reg[2]) {
            return $reg;
        }

        [$reg[1], $reg[2]] = [$reg[2], $reg[1]];

        return $reg;
    }
1952
1953
    /**
     * Returns the backend user, taking it from $GLOBALS['BE_USER'] on first use.
     *
     * @return BackendUserAuthentication
     */
    private function getBackendUser()
    {
        // Make sure the _cli_ user is loaded
        Bootstrap::initializeBackendAuthentication();

        if ($this->backendUser !== null) {
            return $this->backendUser;
        }

        $this->backendUser = $GLOBALS['BE_USER'];

        return $this->backendUser;
    }
1965
1966
    /**
     * Fetches a query builder for the given table from the connection pool.
     *
     * @return QueryBuilder
     */
    private function getQueryBuilder(string $table)
    {
        $connectionPool = GeneralUtility::makeInstance(ConnectionPool::class);

        return $connectionPool->getQueryBuilderForTable($table);
    }
1975
}
1976