Passed
Push — feature/switch-to-configuratio... ( 74ae66 )
by Tomas Norre
07:20
created

CrawlerController::checkIfPageShouldBeSkipped()   F

Complexity

Conditions 13
Paths 360

Size

Total Lines 53
Code Lines 29

Duplication

Lines 0
Ratio 0 %

Code Coverage

Tests 28
CRAP Score 13

Importance

Changes 1
Bugs 0 Features 0
Metric Value
eloc 29
nc 360
nop 1
dl 0
loc 53
c 1
b 0
f 0
cc 13
ccs 28
cts 28
cp 1
crap 13
rs 3.7833

How to fix   Long Method    Complexity   

Long Method

Small methods make your code easier to understand, in particular if combined with a good name. Besides, if your method is small, finding a good name is usually much easier.

For example, if you find yourself adding comments to a method's body, this is usually a good sign to extract the commented part to a new method, and use the comment as a starting point when coming up with a good name for this new method.

Commonly applied refactorings include:

1
<?php
2
3
declare(strict_types=1);
4
5
namespace AOE\Crawler\Controller;
6
7
/***************************************************************
8
 *  Copyright notice
9
 *
10
 *  (c) 2020 AOE GmbH <[email protected]>
11
 *
12
 *  All rights reserved
13
 *
14
 *  This script is part of the TYPO3 project. The TYPO3 project is
15
 *  free software; you can redistribute it and/or modify
16
 *  it under the terms of the GNU General Public License as published by
17
 *  the Free Software Foundation; either version 3 of the License, or
18
 *  (at your option) any later version.
19
 *
20
 *  The GNU General Public License can be found at
21
 *  http://www.gnu.org/copyleft/gpl.html.
22
 *
23
 *  This script is distributed in the hope that it will be useful,
24
 *  but WITHOUT ANY WARRANTY; without even the implied warranty of
25
 *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
26
 *  GNU General Public License for more details.
27
 *
28
 *  This copyright notice MUST APPEAR in all copies of the script!
29
 ***************************************************************/
30
31
use AOE\Crawler\Configuration\ExtensionConfigurationProvider;
32
use AOE\Crawler\Converter\JsonCompatibilityConverter;
33
use AOE\Crawler\Crawler;
34
use AOE\Crawler\CrawlStrategy\CrawlStrategyFactory;
35
use AOE\Crawler\Domain\Model\Configuration;
36
use AOE\Crawler\Domain\Model\Process;
37
use AOE\Crawler\Domain\Model\Queue;
38
use AOE\Crawler\Domain\Repository\ConfigurationRepository;
39
use AOE\Crawler\Domain\Repository\ProcessRepository;
40
use AOE\Crawler\Domain\Repository\QueueRepository;
41
use AOE\Crawler\QueueExecutor;
42
use AOE\Crawler\Service\UrlService;
43
use AOE\Crawler\Utility\SignalSlotUtility;
44
use AOE\Crawler\Value\QueueFilter;
45
use Psr\Http\Message\UriInterface;
46
use Psr\Log\LoggerAwareInterface;
47
use Psr\Log\LoggerAwareTrait;
48
use TYPO3\CMS\Backend\Tree\View\PageTreeView;
49
use TYPO3\CMS\Backend\Utility\BackendUtility;
50
use TYPO3\CMS\Core\Authentication\BackendUserAuthentication;
51
use TYPO3\CMS\Core\Compatibility\PublicMethodDeprecationTrait;
52
use TYPO3\CMS\Core\Compatibility\PublicPropertyDeprecationTrait;
53
use TYPO3\CMS\Core\Core\Bootstrap;
54
use TYPO3\CMS\Core\Core\Environment;
55
use TYPO3\CMS\Core\Database\Connection;
56
use TYPO3\CMS\Core\Database\ConnectionPool;
57
use TYPO3\CMS\Core\Database\Query\Restriction\DeletedRestriction;
58
use TYPO3\CMS\Core\Imaging\Icon;
59
use TYPO3\CMS\Core\Imaging\IconFactory;
60
use TYPO3\CMS\Core\Site\Entity\Site;
61
use TYPO3\CMS\Core\Type\Bitmask\Permission;
62
use TYPO3\CMS\Core\TypoScript\Parser\TypoScriptParser;
63
use TYPO3\CMS\Core\Utility\DebugUtility;
64
use TYPO3\CMS\Core\Utility\GeneralUtility;
65
use TYPO3\CMS\Core\Utility\MathUtility;
66
use TYPO3\CMS\Extbase\Object\ObjectManager;
67
use TYPO3\CMS\Frontend\Page\PageRepository;
68
69
/**
70
 * Class CrawlerController
71
 *
72
 * @package AOE\Crawler\Controller
73
 */
74
class CrawlerController implements LoggerAwareInterface
75
{
76
    use LoggerAwareTrait;
77
    use PublicMethodDeprecationTrait;
78
    use PublicPropertyDeprecationTrait;
79
80
    public const CLI_STATUS_NOTHING_PROCCESSED = 0;
81
82
    //queue not empty
83
    public const CLI_STATUS_REMAIN = 1;
84
85
    //(some) queue items were processed
86
    public const CLI_STATUS_PROCESSED = 2;
87
88
    //instance didn't finish
89
    public const CLI_STATUS_ABORTED = 4;
90
91
    public const CLI_STATUS_POLLABLE_PROCESSED = 8;
92
93
    /**
94
     * @var integer
95
     */
96
    public $setID = 0;
97
98
    /**
99
     * @var string
100
     */
101
    public $processID = '';
102
103
    /**
104
     * @var array
105
     */
106
    public $duplicateTrack = [];
107
108
    /**
109
     * @var array
110
     */
111
    public $downloadUrls = [];
112
113
    /**
114
     * @var array
115
     */
116
    public $incomingProcInstructions = [];
117
118
    /**
119
     * @var array
120
     */
121
    public $incomingConfigurationSelection = [];
122
123
    /**
124
     * @var bool
125
     */
126
    public $registerQueueEntriesInternallyOnly = false;
127
128
    /**
129
     * @var array
130
     */
131
    public $queueEntries = [];
132
133
    /**
134
     * @var array
135
     */
136
    public $urlList = [];
137
138
    /**
139
     * @var array
140
     */
141
    public $extensionSettings = [];
142
143
    /**
144
     * Mount Point
145
     *
146
     * @var bool
147
     * Todo: Check what this is used for and adjust the type hint or code, as bool doesn't match the current code.
148
     */
149
    public $MP = false;
150
151
    /**
152
     * @var string
153
     * @deprecated
154
     */
155
    protected $processFilename;
156
157
    /**
158
     * Holds the internal access mode; can be 'gui', 'cli' or 'cli_im'
159
     *
160
     * @var string
161
     * @deprecated
162
     */
163
    protected $accessMode;
164
165
    /**
166
     * @var QueueRepository
167
     */
168
    protected $queueRepository;
169
170
    /**
171
     * @var ProcessRepository
172
     */
173
    protected $processRepository;
174
175
    /**
176
     * @var ConfigurationRepository
177
     */
178
    protected $configurationRepository;
179
180
    /**
181
     * @var string
182
     */
183
    protected $tableName = 'tx_crawler_queue';
184
185
    /**
186
     * @var QueueExecutor
187
     */
188
    protected $queueExecutor;
189
190
    /**
191
     * @var int
192
     */
193
    protected $maximumUrlsToCompile = 10000;
194
195
    /**
196
     * @var IconFactory
197
     */
198
    protected $iconFactory;
199
200
    /**
201
     * @var string[]
202
     */
203
    private $deprecatedPublicMethods = [
0 ignored issues
show
introduced by
The private property $deprecatedPublicMethods is not used, and could be removed.
Loading history...
204
        'cleanUpOldQueueEntries' => 'Using CrawlerController::cleanUpOldQueueEntries() is deprecated since 9.0.1 and will be removed in v11.x, please use QueueRepository->cleanUpOldQueueEntries() instead.',
205
        'CLI_debug' => 'Using CrawlerController->CLI_debug() is deprecated since 9.1.3 and will be removed in v11.x',
206
        'getAccessMode' => 'Using CrawlerController->getAccessMode() is deprecated since 9.1.3 and will be removed in v11.x',
207
        'getLogEntriesForSetId' => 'Using crawlerController::getLogEntriesForSetId() is deprecated since 9.0.1 and will be removed in v11.x',
208
        'flushQueue' => 'Using CrawlerController::flushQueue() is deprecated since 9.0.1 and will be removed in v11.x, please use QueueRepository->flushQueue() instead.',
209
        'setAccessMode' => 'Using CrawlerController->setAccessMode() is deprecated since 9.1.3 and will be removed in v11.x',
210
        'getDisabled' => 'Using CrawlerController->getDisabled() is deprecated since 9.1.3 and will be removed in v11.x, please use Crawler->isDisabled() instead',
211
        'setDisabled' => 'Using CrawlerController->setDisabled() is deprecated since 9.1.3 and will be removed in v11.x, please use Crawler->setDisabled() instead',
212
        'getProcessFilename' => 'Using CrawlerController->getProcessFilename() is deprecated since 9.1.3 and will be removed in v11.x',
213
        'setProcessFilename' => 'Using CrawlerController->setProcessFilename() is deprecated since 9.1.3 and will be removed in v11.x',
214
215
    ];
216
217
    /**
218
     * @var string[]
219
     */
220
    private $deprecatedPublicProperties = [
1 ignored issue
show
introduced by
The private property $deprecatedPublicProperties is not used, and could be removed.
Loading history...
221
        'accessMode' => 'Using CrawlerController->accessMode is deprecated since 9.1.3 and will be removed in v11.x',
222
        'processFilename' => 'Using CrawlerController->processFilename is deprecated since 9.1.3 and will be removed in v11.x',
223
    ];
224
225
    /**
226
     * @var BackendUserAuthentication|null
227
     */
228
    private $backendUser;
229
230
    /**
231
     * @var integer
232
     */
233
    private $scheduledTime = 0;
234
235
    /**
236
     * @var integer
237
     */
238
    private $reqMinute = 0;
239
240
    /**
241
     * @var bool
242
     */
243
    private $submitCrawlUrls = false;
244
245
    /**
246
     * @var bool
247
     */
248
    private $downloadCrawlUrls = false;
249
250
    /**
251
     * @var PageRepository
252
     */
253
    private $pageRepository;
254
255
    /**
256
     * @var Crawler
257
     */
258
    private $crawler;
259
260
    /************************************
261
     *
262
     * Getting URLs based on Page TSconfig
263
     *
264
     ************************************/
265
266 35
    public function __construct()
267
    {
268 35
        $objectManager = GeneralUtility::makeInstance(ObjectManager::class);
269 35
        $crawlStrategyFactory = GeneralUtility::makeInstance(CrawlStrategyFactory::class);
270 35
        $this->queueRepository = $objectManager->get(QueueRepository::class);
271 35
        $this->processRepository = $objectManager->get(ProcessRepository::class);
272 35
        $this->configurationRepository = $objectManager->get(ConfigurationRepository::class);
273 35
        $this->pageRepository = $objectManager->get(PageRepository::class);
274 35
        $this->queueExecutor = GeneralUtility::makeInstance(QueueExecutor::class, $crawlStrategyFactory);
275 35
        $this->iconFactory = GeneralUtility::makeInstance(IconFactory::class);
276 35
        $this->crawler = GeneralUtility::makeInstance(Crawler::class);
277
278 35
        $this->processFilename = Environment::getVarPath() . '/lock/tx_crawler.proc';
0 ignored issues
show
Deprecated Code introduced by
The property AOE\Crawler\Controller\C...oller::$processFilename has been deprecated. ( Ignorable by Annotation )

If this is a false-positive, you can also ignore this issue in your code via the ignore-deprecated  annotation

278
        /** @scrutinizer ignore-deprecated */ $this->processFilename = Environment::getVarPath() . '/lock/tx_crawler.proc';
Loading history...
279
280
        /** @var ExtensionConfigurationProvider $configurationProvider */
281 35
        $configurationProvider = GeneralUtility::makeInstance(ExtensionConfigurationProvider::class);
282 35
        $settings = $configurationProvider->getExtensionConfiguration();
283 35
        $this->extensionSettings = is_array($settings) ? $settings : [];
0 ignored issues
show
introduced by
The condition is_array($settings) is always true.
Loading history...
284
285
        // set defaults:
286 35
        if (MathUtility::convertToPositiveInteger($this->extensionSettings['countInARun']) === 0) {
287
            $this->extensionSettings['countInARun'] = 100;
288
        }
289
290 35
        $this->extensionSettings['processLimit'] = MathUtility::forceIntegerInRange($this->extensionSettings['processLimit'], 1, 99, 1);
291 35
        $this->maximumUrlsToCompile = MathUtility::forceIntegerInRange($this->extensionSettings['maxCompileUrls'], 1, 1000000000, 10000);
292 35
    }
293
294
    /**
295
     * Method to get the accessMode; can be gui, cli or cli_im
296
     *
297
     * @return string
298
     * @deprecated
299
     */
300 1
    public function getAccessMode()
301
    {
302 1
        return $this->accessMode;
0 ignored issues
show
Deprecated Code introduced by
The property AOE\Crawler\Controller\C...Controller::$accessMode has been deprecated. ( Ignorable by Annotation )

If this is a false-positive, you can also ignore this issue in your code via the ignore-deprecated  annotation

302
        return /** @scrutinizer ignore-deprecated */ $this->accessMode;
Loading history...
303
    }
304
305
    /**
306
     * @param string $accessMode
307
     * @deprecated
308
     */
309 1
    public function setAccessMode($accessMode): void
310
    {
311 1
        $this->accessMode = $accessMode;
0 ignored issues
show
Deprecated Code introduced by
The property AOE\Crawler\Controller\C...Controller::$accessMode has been deprecated. ( Ignorable by Annotation )

If this is a false-positive, you can also ignore this issue in your code via the ignore-deprecated  annotation

311
        /** @scrutinizer ignore-deprecated */ $this->accessMode = $accessMode;
Loading history...
312 1
    }
313
314
    /**
315
     * Set disabled status to prevent processes from being processed
316
     *
317
     * @param bool $disabled (optional, defaults to true)
318
     * @deprecated
319
     */
320 2
    public function setDisabled($disabled = true): void
321
    {
322 2
        if ($disabled) {
323 1
            GeneralUtility::writeFile($this->processFilename, '');
0 ignored issues
show
Deprecated Code introduced by
The property AOE\Crawler\Controller\C...oller::$processFilename has been deprecated. ( Ignorable by Annotation )

If this is a false-positive, you can also ignore this issue in your code via the ignore-deprecated  annotation

323
            GeneralUtility::writeFile(/** @scrutinizer ignore-deprecated */ $this->processFilename, '');
Loading history...
324
        } else {
325 1
            if (is_file($this->processFilename)) {
0 ignored issues
show
Deprecated Code introduced by
The property AOE\Crawler\Controller\C...oller::$processFilename has been deprecated. ( Ignorable by Annotation )

If this is a false-positive, you can also ignore this issue in your code via the ignore-deprecated  annotation

325
            if (is_file(/** @scrutinizer ignore-deprecated */ $this->processFilename)) {
Loading history...
326 1
                unlink($this->processFilename);
0 ignored issues
show
Deprecated Code introduced by
The property AOE\Crawler\Controller\C...oller::$processFilename has been deprecated. ( Ignorable by Annotation )

If this is a false-positive, you can also ignore this issue in your code via the ignore-deprecated  annotation

326
                unlink(/** @scrutinizer ignore-deprecated */ $this->processFilename);
Loading history...
327
            }
328
        }
329 2
    }
330
331
    /**
332
     * Get disable status
333
     *
334
     * @return bool true if disabled
335
     * @deprecated
336
     */
337 2
    public function getDisabled()
338
    {
339 2
        return is_file($this->processFilename);
0 ignored issues
show
Deprecated Code introduced by
The property AOE\Crawler\Controller\C...oller::$processFilename has been deprecated. ( Ignorable by Annotation )

If this is a false-positive, you can also ignore this issue in your code via the ignore-deprecated  annotation

339
        return is_file(/** @scrutinizer ignore-deprecated */ $this->processFilename);
Loading history...
340
    }
341
342
    /**
343
     * @param string $filenameWithPath
344
     * @deprecated
345
     */
346 3
    public function setProcessFilename($filenameWithPath): void
347
    {
348 3
        $this->processFilename = $filenameWithPath;
0 ignored issues
show
Deprecated Code introduced by
The property AOE\Crawler\Controller\C...oller::$processFilename has been deprecated. ( Ignorable by Annotation )

If this is a false-positive, you can also ignore this issue in your code via the ignore-deprecated  annotation

348
        /** @scrutinizer ignore-deprecated */ $this->processFilename = $filenameWithPath;
Loading history...
349 3
    }
350
351
    /**
352
     * @return string
353
     * @deprecated
354
     */
355 1
    public function getProcessFilename()
356
    {
357 1
        return $this->processFilename;
0 ignored issues
show
Deprecated Code introduced by
The property AOE\Crawler\Controller\C...oller::$processFilename has been deprecated. ( Ignorable by Annotation )

If this is a false-positive, you can also ignore this issue in your code via the ignore-deprecated  annotation

357
        return /** @scrutinizer ignore-deprecated */ $this->processFilename;
Loading history...
358
    }
359
360
    /**
361
     * Sets the extension settings (unserialized counterpart of $TYPO3_CONF_VARS['EXT']['extConf']['crawler']).
362
     */
363 14
    public function setExtensionSettings(array $extensionSettings): void
364
    {
365 14
        $this->extensionSettings = $extensionSettings;
366 14
    }
367
368
    /**
369
     * Check if the given page should be crawled
370
     *
371
     * @return false|string false if the page should be crawled (not excluded), a skip message string if it should be skipped
372
     */
373 11
    public function checkIfPageShouldBeSkipped(array $pageRow)
374
    {
375 11
        $skipPage = false;
376
        // message will be overwritten later
377 11
        $skipMessage = 'Skipped';
378
379
        // if page is hidden
380 11
        if (! $this->extensionSettings['crawlHiddenPages']) {
381 11
            if ($pageRow['hidden']) {
382 1
                $skipPage = true;
383 1
                $skipMessage = 'Because page is hidden';
384
            }
385
        }
386
387 11
        if (! $skipPage) {
388 10
            if (GeneralUtility::inList('3,4,199,254,255', $pageRow['doktype'])) {
389 3
                $skipPage = true;
390 3
                $skipMessage = 'Because doktype is not allowed';
391
            }
392
        }
393
394 11
        if (! $skipPage) {
395 7
            foreach ($GLOBALS['TYPO3_CONF_VARS']['EXTCONF']['crawler']['excludeDoktype'] ?? [] as $key => $doktypeList) {
396 1
                if (GeneralUtility::inList($doktypeList, $pageRow['doktype'])) {
397 1
                    $skipPage = true;
398 1
                    $skipMessage = 'Doktype was excluded by "' . $key . '"';
399 1
                    break;
400
                }
401
            }
402
        }
403
404 11
        if (! $skipPage) {
405
            // veto hook
406 6
            foreach ($GLOBALS['TYPO3_CONF_VARS']['EXTCONF']['crawler']['pageVeto'] ?? [] as $key => $func) {
407
                $params = [
408 2
                    'pageRow' => $pageRow,
409
                ];
410
                // expects "false" if page is ok and "true" or a skipMessage if this page should _not_ be crawled
411 2
                $veto = GeneralUtility::callUserFunction($func, $params, $this);
412 2
                if ($veto !== false) {
413 2
                    $skipPage = true;
414 2
                    if (is_string($veto)) {
415 1
                        $skipMessage = $veto;
416
                    } else {
417 1
                        $skipMessage = 'Veto from hook "' . htmlspecialchars($key) . '"';
418
                    }
419
                    // no need to execute other hooks if a previous one returned a veto
420 2
                    break;
421
                }
422
            }
423
        }
424
425 11
        return $skipPage ? $skipMessage : false;
426
    }
427
428
    /**
429
     * Wrapper method for getUrlsForPageId()
430
     * It returns an array of configurations and no urls!
431
     *
432
     * @param array $pageRow Page record with at least dok-type and uid columns.
433
     * @param string $skipMessage
434
     * @return array
435
     * @see getUrlsForPageId()
436
     */
437 5
    public function getUrlsForPageRow(array $pageRow, &$skipMessage = '')
438
    {
439 5
        $message = $this->checkIfPageShouldBeSkipped($pageRow);
440 5
        if ($message === false) {
441 4
            $res = $this->getUrlsForPageId($pageRow['uid']);
442 4
            $skipMessage = '';
443
        } else {
444 1
            $skipMessage = $message;
445 1
            $res = [];
446
        }
447
448 5
        return $res;
449
    }
450
451
    /**
452
     * Creates a list of URLs from input array (and submits them to queue if asked for)
453
     * See Web > Info module script + "indexed_search"'s crawler hook-client using this!
454
     *
455
     * @param array $vv Information about URLs from pageRow to crawl.
456
     * @param array $pageRow Page row
457
     * @param int $scheduledTime Unix time to schedule indexing to, typically time()
458
     * @param int $reqMinute Number of requests per minute (creates the interleave between requests)
459
     * @param bool $submitCrawlUrls If set, submits the URLs to queue
460
     * @param bool $downloadCrawlUrls If set (and $submitCrawlUrls is false), $downloadUrls will be filled with entries
461
     * @param array $duplicateTrack Array which is passed by reference and contains an id per url to ensure we will not crawl duplicates
462
     * @param array $downloadUrls Array which will be filled with URLS for download if flag is set.
463
     * @param array $incomingProcInstructions Array of processing instructions
464
     * @return string List of URLs (meant for display in backend module)
465
     */
466 3
    public function urlListFromUrlArray(
467
        array $vv,
468
        array $pageRow,
469
        $scheduledTime,
470
        $reqMinute,
471
        $submitCrawlUrls,
472
        $downloadCrawlUrls,
473
        array &$duplicateTrack,
474
        array &$downloadUrls,
475
        array $incomingProcInstructions
476
    ) {
477 3
        if (! is_array($vv['URLs'])) {
478
            return 'ERROR - no URL generated';
479
        }
480 3
        $urlLog = [];
481 3
        $pageId = (int) $pageRow['uid'];
482 3
        $configurationHash = $this->getConfigurationHash($vv);
483 3
        $skipInnerCheck = $this->queueRepository->noUnprocessedQueueEntriesForPageWithConfigurationHashExist($pageId, $configurationHash);
484
485 3
        $urlService = new UrlService();
486
487 3
        foreach ($vv['URLs'] as $urlQuery) {
488 3
            if (! $this->drawURLs_PIfilter($vv['subCfg']['procInstrFilter'], $incomingProcInstructions)) {
489
                continue;
490
            }
491 3
            $url = (string) $urlService->getUrlFromPageAndQueryParameters(
492 3
                $pageId,
493 3
                $urlQuery,
494 3
                $vv['subCfg']['baseUrl'] ?? null,
495 3
                $vv['subCfg']['force_ssl'] ?? 0
496
            );
497
498
            // Create key by which to determine unique-ness:
499 3
            $uKey = $url . '|' . $vv['subCfg']['userGroups'] . '|' . $vv['subCfg']['procInstrFilter'];
500
501 3
            if (isset($duplicateTrack[$uKey])) {
502
                //if the url key is registered just display it and do not resubmit it
503
                $urlLog[] = '<em><span class="text-muted">' . htmlspecialchars($url) . '</span></em>';
504
            } else {
505
                // Scheduled time:
506 3
                $schTime = $scheduledTime + round(count($duplicateTrack) * (60 / $reqMinute));
507 3
                $schTime = intval($schTime / 60) * 60;
508 3
                $formattedDate = BackendUtility::datetime($schTime);
509 3
                $this->urlList[] = '[' . $formattedDate . '] ' . $url;
510 3
                $urlList = '[' . $formattedDate . '] ' . htmlspecialchars($url);
511
512
                // Submit for crawling!
513 3
                if ($submitCrawlUrls) {
514 3
                    $added = $this->addUrl(
515 3
                        $pageId,
516 3
                        $url,
517 3
                        $vv['subCfg'],
518 3
                        $scheduledTime,
519 3
                        $configurationHash,
520 3
                        $skipInnerCheck
521
                    );
522 3
                    if ($added === false) {
523 3
                        $urlList .= ' (URL already existed)';
524
                    }
525
                } elseif ($downloadCrawlUrls) {
526
                    $downloadUrls[$url] = $url;
527
                }
528 3
                $urlLog[] = $urlList;
529
            }
530 3
            $duplicateTrack[$uKey] = true;
531
        }
532
533 3
        return implode('<br>', $urlLog);
534
    }
535
536
    /**
537
     * Returns true if input processing instruction is among registered ones.
538
     *
539
     * @param string $piString PI to test
540
     * @param array $incomingProcInstructions Processing instructions
541
     * @return boolean
542
     */
543 5
    public function drawURLs_PIfilter($piString, array $incomingProcInstructions)
544
    {
545 5
        if (empty($incomingProcInstructions)) {
546 1
            return true;
547
        }
548
549 4
        foreach ($incomingProcInstructions as $pi) {
550 4
            if (GeneralUtility::inList($piString, $pi)) {
551 2
                return true;
552
            }
553
        }
554 2
        return false;
555
    }
556
557 4
    public function getPageTSconfigForId($id): array
558
    {
559 4
        if (! $this->MP) {
560 4
            $pageTSconfig = BackendUtility::getPagesTSconfig($id);
0 ignored issues
show
Deprecated Code introduced by
The function TYPO3\CMS\Backend\Utilit...ity::getPagesTSconfig() has been deprecated. ( Ignorable by Annotation )

If this is a false-positive, you can also ignore this issue in your code via the ignore-deprecated  annotation

560
            $pageTSconfig = /** @scrutinizer ignore-deprecated */ BackendUtility::getPagesTSconfig($id);
Loading history...
561
        } else {
562
            // TODO: Please check, this makes no sense to split a boolean value.
563
            [, $mountPointId] = explode('-', $this->MP);
0 ignored issues
show
Bug introduced by
$this->MP of type true is incompatible with the type string expected by parameter $string of explode(). ( Ignorable by Annotation )

If this is a false-positive, you can also ignore this issue in your code via the ignore-type  annotation

563
            [, $mountPointId] = explode('-', /** @scrutinizer ignore-type */ $this->MP);
Loading history...
564
            $pageTSconfig = BackendUtility::getPagesTSconfig($mountPointId);
0 ignored issues
show
Deprecated Code introduced by
The function TYPO3\CMS\Backend\Utilit...ity::getPagesTSconfig() has been deprecated. ( Ignorable by Annotation )

If this is a false-positive, you can also ignore this issue in your code via the ignore-deprecated  annotation

564
            $pageTSconfig = /** @scrutinizer ignore-deprecated */ BackendUtility::getPagesTSconfig($mountPointId);
Loading history...
Bug introduced by
$mountPointId of type string is incompatible with the type integer expected by parameter $id of TYPO3\CMS\Backend\Utilit...ity::getPagesTSconfig(). ( Ignorable by Annotation )

If this is a false-positive, you can also ignore this issue in your code via the ignore-type  annotation

564
            $pageTSconfig = BackendUtility::getPagesTSconfig(/** @scrutinizer ignore-type */ $mountPointId);
Loading history...
565
        }
566
567
        // Call a hook to alter configuration
568 4
        if (is_array($GLOBALS['TYPO3_CONF_VARS']['EXTCONF']['crawler']['getPageTSconfigForId'])) {
569
            $params = [
570
                'pageId' => $id,
571
                'pageTSConfig' => &$pageTSconfig,
572
            ];
573
            foreach ($GLOBALS['TYPO3_CONF_VARS']['EXTCONF']['crawler']['getPageTSconfigForId'] as $userFunc) {
574
                GeneralUtility::callUserFunction($userFunc, $params, $this);
575
            }
576
        }
577 4
        return $pageTSconfig;
578
    }
579
580
    /**
581
     * This method returns an array of configurations.
582
     * Adds no urls!
583
     */
584 3
    public function getUrlsForPageId(int $pageId): array
585
    {
586
        // Get page TSconfig for page ID
587 3
        $pageTSconfig = $this->getPageTSconfigForId($pageId);
588
589 3
        $res = [];
590
591
        // Fetch Crawler Configuration from pageTSconfig
592 3
        $crawlerCfg = $pageTSconfig['tx_crawler.']['crawlerCfg.']['paramSets.'] ?? [];
593 3
        foreach ($crawlerCfg as $key => $values) {
594 3
            if (! is_array($values)) {
595 3
                continue;
596
            }
597 3
            $key = str_replace('.', '', $key);
598
            // Sub configuration for a single configuration string:
599 3
            $subCfg = (array) $crawlerCfg[$key . '.'];
600 3
            $subCfg['key'] = $key;
601
602 3
            if (strcmp($subCfg['procInstrFilter'] ?? '', '')) {
603 3
                $subCfg['procInstrFilter'] = implode(',', GeneralUtility::trimExplode(',', $subCfg['procInstrFilter']));
604
            }
605 3
            $pidOnlyList = implode(',', GeneralUtility::trimExplode(',', $subCfg['pidsOnly'], true));
606
607
            // process configuration if it is not page-specific or if the specific page is the current page:
608
            // TODO: Check if $pidOnlyList can be kept as Array instead of imploded
609 3
            if (! strcmp((string) $subCfg['pidsOnly'], '') || GeneralUtility::inList($pidOnlyList, strval($pageId))) {
610
611
                // Explode, process etc.:
612 3
                $res[$key] = [];
613 3
                $res[$key]['subCfg'] = $subCfg;
614 3
                $res[$key]['paramParsed'] = GeneralUtility::explodeUrl2Array($crawlerCfg[$key]);
615 3
                $res[$key]['paramExpanded'] = $this->expandParameters($res[$key]['paramParsed'], $pageId);
616 3
                $res[$key]['origin'] = 'pagets';
617
618
                // recognize MP value
619 3
                if (! $this->MP) {
620 3
                    $res[$key]['URLs'] = $this->compileUrls($res[$key]['paramExpanded'], ['?id=' . $pageId]);
621
                } else {
622
                    $res[$key]['URLs'] = $this->compileUrls($res[$key]['paramExpanded'], ['?id=' . $pageId . '&MP=' . $this->MP]);
0 ignored issues
show
Bug introduced by
Are you sure $this->MP of type true can be used in concatenation? ( Ignorable by Annotation )

If this is a false-positive, you can also ignore this issue in your code via the ignore-type  annotation

622
                    $res[$key]['URLs'] = $this->compileUrls($res[$key]['paramExpanded'], ['?id=' . $pageId . '&MP=' . /** @scrutinizer ignore-type */ $this->MP]);
Loading history...
623
                }
624
            }
625
        }
626
627
        // Get configuration from tx_crawler_configuration records up the rootline
628 3
        $crawlerConfigurations = $this->configurationRepository->getCrawlerConfigurationRecordsFromRootLine($pageId);
629
        /** @var Configuration $configurationRecord */
630 3
        foreach ($crawlerConfigurations as $configurationRecord) {
631
632
            // check access to the configuration record
633
            if (empty($configurationRecord->getBeGroups()) || $this->getBackendUser()->isAdmin() || $this->hasGroupAccess($this->getBackendUser()->user['usergroup_cached_list'], $configurationRecord->getBeGroups())) {
634
                $pidOnlyList = implode(',', GeneralUtility::trimExplode(',', $configurationRecord->getPidsOnly(), true));
635
636
                // process configuration if it is not page-specific or if the specific page is the current page:
637
                // TODO: Check if $pidOnlyList can be kept as Array instead of imploded
638
                if (! strcmp($configurationRecord->getPidsOnly(), '') || GeneralUtility::inList($pidOnlyList, strval($pageId))) {
639
                    $key = $configurationRecord->getName();
640
641
                    // don't overwrite previously defined paramSets
642
                    if (! isset($res[$key])) {
643
644
                        /* @var $TSparserObject \TYPO3\CMS\Core\TypoScript\Parser\TypoScriptParser */
645
                        $TSparserObject = GeneralUtility::makeInstance(TypoScriptParser::class);
646
                        $TSparserObject->parse($configurationRecord->getProcessingInstructionParameters());
647
648
                        $subCfg = [
649
                            'procInstrFilter' => $configurationRecord->getProcessingInstructionFilter(),
650
                            'procInstrParams.' => $TSparserObject->setup,
651
                            'baseUrl' => $configurationRecord->getBaseUrl(),
652
                            'force_ssl' => (int) $configurationRecord->isForceSsl(),
653
                            'userGroups' => $configurationRecord->getFeGroups(),
654
                            'exclude' => $configurationRecord->getExclude(),
655
                            'key' => $key,
656
                        ];
657
658
                        if (! in_array($pageId, $this->expandExcludeString($subCfg['exclude']), true)) {
659
                            $res[$key] = [];
660
                            $res[$key]['subCfg'] = $subCfg;
661
                            $res[$key]['paramParsed'] = GeneralUtility::explodeUrl2Array($configurationRecord->getConfiguration());
662
                            $res[$key]['paramExpanded'] = $this->expandParameters($res[$key]['paramParsed'], $pageId);
663
                            $res[$key]['URLs'] = $this->compileUrls($res[$key]['paramExpanded'], ['?id=' . $pageId]);
664
                            $res[$key]['origin'] = 'tx_crawler_configuration_' . $configurationRecord->getUid();
665
                        }
666
                    }
667
                }
668
            }
669
        }
670
671 3
        foreach ($GLOBALS['TYPO3_CONF_VARS']['EXTCONF']['crawler']['processUrls'] ?? [] as $func) {
672
            $params = [
673
                'res' => &$res,
674
            ];
675
            GeneralUtility::callUserFunction($func, $params, $this);
676
        }
677 3
        return $res;
678
    }
679
680
    /**
     * Collect the names of all crawler configurations that apply to a page branch.
     *
     * Names come from two sources: the "paramSets." entries in the page TSconfig of
     * the root page, and tx_crawler_configuration records stored on any page of the
     * root line of $rootid or of the page tree below it (down to $depth levels).
     *
     * TODO: Write Functional Tests
     *
     * @param int $rootid Page uid the branch starts at
     * @param int $depth Number of tree levels below $rootid to include
     * @return array List of configuration names (may contain duplicates)
     */
    public function getConfigurationsForBranch(int $rootid, int $depth): array
    {
        $names = [];

        // Page TSconfig parameter sets: only array entries ("name.") are real sets.
        $tsConfig = $this->getPageTSconfigForId($rootid);
        foreach ($tsConfig['tx_crawler.']['crawlerCfg.']['paramSets.'] ?? [] as $setKey => $setValue) {
            if (is_array($setValue)) {
                // Strip the trailing dot of the TypoScript array key.
                $names[] = substr($setKey, -1) === '.' ? substr($setKey, 0, -1) : $setKey;
            }
        }

        // Pages up the root line ...
        $pageIds = [];
        foreach (BackendUtility::BEgetRootLine($rootid) as $rootLinePage) {
            $pageIds[] = $rootLinePage['uid'];
        }

        // ... and pages down the tree that the backend user may see.
        /* @var PageTreeView $pageTree */
        $pageTree = GeneralUtility::makeInstance(PageTreeView::class);
        $permissionClause = $this->getBackendUser()->getPagePermsClause(Permission::PAGE_SHOW);
        $pageTree->init(empty($permissionClause) ? '' : ('AND ' . $permissionClause));
        $pageTree->getTree($rootid, $depth, '');
        foreach ($pageTree->tree as $treeNode) {
            $pageIds[] = $treeNode['row']['uid'];
        }

        // Configuration records stored on any of the collected pages.
        $queryBuilder = $this->getQueryBuilder('tx_crawler_configuration');
        $pidConstraint = $queryBuilder->expr()->in(
            'pid',
            $queryBuilder->createNamedParameter($pageIds, Connection::PARAM_INT_ARRAY)
        );
        $result = $queryBuilder
            ->select('name')
            ->from('tx_crawler_configuration')
            ->where($pidConstraint)
            ->execute();

        while ($record = $result->fetch()) {
            $names[] = $record['name'];
        }

        return $names;
    }
723
724
    /**
     * Check whether a user may access an item, based on group membership.
     *
     * @param string $groupList Comma-separated list of group UIDs the user belongs to
     * @param string $accessList Comma-separated list of group UIDs that are allowed to access the item
     * @return bool TRUE if the access list is empty or at least one of the user's group UIDs is in it
     * @see \TYPO3\CMS\Frontend\Page\PageRepository::getMultipleGroupsWhereClause()
     */
    public function hasGroupAccess($groupList, $accessList)
    {
        // An empty access list means the item is not restricted at all.
        $accessGranted = empty($accessList);

        if (! $accessGranted) {
            $userGroupUids = GeneralUtility::intExplode(',', $groupList);
            foreach ($userGroupUids as $userGroupUid) {
                if (GeneralUtility::inList($accessList, $userGroupUid)) {
                    $accessGranted = true;
                    break;
                }
            }
        }

        return $accessGranted;
    }
745
746
    /**
     * Will expand the parameters configuration to individual values. This follows a certain syntax of the value of each parameter.
     * Syntax of values:
     * - Basically: If the value is wrapped in [...] it will be expanded according to the following syntax, otherwise the value is taken literally
     * - Configuration is split by "|" and the parts are processed individually and finally added together
     * - For each configuration part:
     *         - "[int]-[int]" = Integer range, will be expanded to all values in between, values included, starting from low to high (max. 1000). Example "1-34" or "-40--30"
     *         - "_TABLE:[TCA table name];[_PID:[optional page id, default is current page]];[_ENABLELANG:1]" = Look up of table records from PID, filtering out deleted records. Example "_TABLE:tt_content; _PID:123"
     *        _ENABLELANG:1 picks only original records without their language overlays
     *        Further optional sub-parts read by the code: _RECURSIVE (tree depth below the PID),
     *        _PIDFIELD (field compared against the page ids, default "pid"), _FIELD (field to select,
     *        default "uid"), _WHERE (additional where clause) and _ADDTABLE (additional FROM part)
     *         - Default: Literal value
     *
     * Every sub-part is optional: missing ones fall back to their default instead of
     * triggering "undefined index" notices.
     *
     * @param array $paramArray Array with key (GET var name) and values (value of GET var which is configuration for expansion)
     * @param int $pid Current page ID
     * @return array Same keys as $paramArray, each value replaced by the list of expanded values
     *
     * TODO: Write Functional Tests
     */
    public function expandParameters($paramArray, $pid)
    {
        // Traverse parameter names:
        foreach ($paramArray as $p => $v) {
            $v = trim($v);

            // If value is encapsulated in square brackets it means there are some ranges of values to find, otherwise the value is literal
            if (strpos($v, '[') === 0 && substr($v, -1) === ']') {
                // So, find the value inside brackets and reset the paramArray value as an array.
                $v = substr($v, 1, -1);
                $paramArray[$p] = [];

                // Explode parts and traverse them:
                $parts = explode('|', $v);
                foreach ($parts as $pV) {
                    // Look for integer range: (fx. 1-34 or -40--30 // reads minus 40 to minus 30)
                    if (preg_match('/^(-?[0-9]+)\s*-\s*(-?[0-9]+)$/', trim($pV), $reg)) {
                        $reg = $this->swapIfFirstIsLargerThanSecond($reg);

                        // Traverse range, add values:
                        // Limit to size of range!
                        $runAwayBrake = 1000;
                        for ($a = $reg[1]; $a <= $reg[2]; $a++) {
                            $paramArray[$p][] = $a;
                            $runAwayBrake--;
                            if ($runAwayBrake <= 0) {
                                break;
                            }
                        }
                    } elseif (strpos(trim($pV), '_TABLE:') === 0) {
                        // Parse parameters ("_KEY:value" pairs separated by ";"):
                        $subparts = GeneralUtility::trimExplode(';', $pV);
                        $subpartParams = [];
                        foreach ($subparts as $spV) {
                            // Pad so that a sub-part without ":" yields a NULL value instead of an undefined offset.
                            [$pKey, $pVal] = array_pad(GeneralUtility::trimExplode(':', $spV), 2, null);
                            $subpartParams[$pKey] = $pVal;
                        }

                        $table = $subpartParams['_TABLE'];

                        // Table exists:
                        if (isset($GLOBALS['TCA'][$table])) {
                            $lookUpPid = isset($subpartParams['_PID']) ? intval($subpartParams['_PID']) : intval($pid);
                            $recursiveDepth = isset($subpartParams['_RECURSIVE']) ? intval($subpartParams['_RECURSIVE']) : 0;
                            $pidField = isset($subpartParams['_PIDFIELD']) ? trim($subpartParams['_PIDFIELD']) : 'pid';
                            // NOTE(review): _WHERE and _ADDTABLE are handed to the query builder as given;
                            // they must only ever come from trusted configuration.
                            $where = $subpartParams['_WHERE'] ?? '';
                            $addTable = $subpartParams['_ADDTABLE'] ?? '';

                            $fieldName = ! empty($subpartParams['_FIELD']) ? $subpartParams['_FIELD'] : 'uid';
                            // Only query fields that are "uid" or configured as a TCA column of the table.
                            if ($fieldName === 'uid' || ! empty($GLOBALS['TCA'][$table]['columns'][$fieldName])) {
                                $queryBuilder = $this->getQueryBuilder($table);

                                if ($recursiveDepth > 0) {
                                    /** @var \TYPO3\CMS\Core\Database\QueryGenerator $queryGenerator */
                                    $queryGenerator = GeneralUtility::makeInstance(\TYPO3\CMS\Core\Database\QueryGenerator::class);
                                    $pidList = $queryGenerator->getTreeList($lookUpPid, $recursiveDepth, 0, 1);
                                    $pidArray = GeneralUtility::intExplode(',', $pidList);
                                } else {
                                    $pidArray = [(string) $lookUpPid];
                                }

                                // Drop the default restrictions, keep only the "not deleted" one.
                                $queryBuilder->getRestrictions()
                                    ->removeAll()
                                    ->add(GeneralUtility::makeInstance(DeletedRestriction::class));

                                $queryBuilder
                                    ->select($fieldName)
                                    ->from($table)
                                    ->where(
                                        $queryBuilder->expr()->in($pidField, $queryBuilder->createNamedParameter($pidArray, Connection::PARAM_INT_ARRAY)),
                                        $where
                                    );

                                if (! empty($addTable)) {
                                    // TODO: Check if this works as intended!
                                    $queryBuilder->add('from', $addTable);
                                }
                                $transOrigPointerField = $GLOBALS['TCA'][$table]['ctrl']['transOrigPointerField'] ?? '';

                                // _ENABLELANG: restrict to records that are not a translation of another record.
                                if (! empty($subpartParams['_ENABLELANG']) && $transOrigPointerField) {
                                    $queryBuilder->andWhere(
                                        $queryBuilder->expr()->lte(
                                            $transOrigPointerField,
                                            0
                                        )
                                    );
                                }

                                $statement = $queryBuilder->execute();

                                // Index by field value so every value is collected only once.
                                $rows = [];
                                while ($row = $statement->fetch()) {
                                    $rows[$row[$fieldName]] = $row;
                                }

                                $paramArray[$p] = array_merge($paramArray[$p], array_keys($rows));
                            }
                        }
                    } else {
                        // Just add value:
                        $paramArray[$p][] = $pV;
                    }

                    // Hook for processing own expandParameters place holder
                    $expandParametersHooks = $GLOBALS['TYPO3_CONF_VARS']['SC_OPTIONS']['crawler/class.tx_crawler_lib.php']['expandParameters'] ?? null;
                    if (is_array($expandParametersHooks)) {
                        $_params = [
                            'pObj' => &$this,
                            'paramArray' => &$paramArray,
                            'currentKey' => $p,
                            'currentValue' => $pV,
                            'pid' => $pid,
                        ];
                        foreach ($expandParametersHooks as $_funcRef) {
                            GeneralUtility::callUserFunction($_funcRef, $_params, $this);
                        }
                    }
                }

                // Make unique set of values and sort array by key:
                $paramArray[$p] = array_unique($paramArray[$p]);
                ksort($paramArray);
            } else {
                // Set the literal value as only value in array:
                $paramArray[$p] = [$v];
            }
        }

        return $paramArray;
    }
893
894
    /**
     * Compiling URLs from parameter array (output of expandParameters())
     * The number of URLs will be the multiplication of the number of parameter values for each key,
     * capped at $this->maximumUrlsToCompile per recursion level.
     *
     * Each call consumes the first parameter of $paramArray, appends "&name=value" for each
     * of its values to each URL in $urls (empty values append nothing) and recurses with the
     * remaining parameters.
     *
     * @param array $paramArray Output of expandParameters(): Array with keys (GET var names) and for each an array of values
     * @param array $urls URLs accumulated in this array (for recursion)
     * @return array
     */
    public function compileUrls($paramArray, array $urls)
    {
        if (empty($paramArray)) {
            return $urls;
        }

        // Take the first parameter off the stack. unset() is used instead of array_shift()
        // because array_shift() would re-index the remaining numeric parameter names.
        reset($paramArray);
        $varName = key($paramArray);
        $valueSet = $paramArray[$varName];
        unset($paramArray[$varName]);

        // Array keys may be integers; rawurlencode() requires a string under strict_types.
        $encodedVarName = rawurlencode((string) $varName);

        // Traverse value set:
        $newUrls = [];
        foreach ($urls as $url) {
            foreach ($valueSet as $val) {
                // Leave BOTH loops once the limit is reached; breaking only the inner loop
                // would keep adding one more URL for every remaining base URL.
                if (count($newUrls) >= $this->maximumUrlsToCompile) {
                    break 2;
                }

                $val = (string) $val;
                $newUrls[] = $url . ($val !== '' ? '&' . $encodedVarName . '=' . rawurlencode($val) : '');
            }
        }

        return $this->compileUrls($paramArray, $newUrls);
    }
925
926
    /************************************
927
     *
928
     * Crawler log
929
     *
930
     ************************************/
931
932
    /**
     * Return array of records from crawler queue for input page ID
     *
     * @param int $id Page ID for which to look up log entries.
     * @param QueueFilter $queueFilter Restricts the result: "pending" selects entries with exec_time = 0,
     *                                 "finished" entries with exec_time > 0, anything else selects all
     * @param bool $doFlush If TRUE, the queue is flushed through the queue repository (using $queueFilter)
     *                      before the entries are selected
     * @param bool $doFullFlush Currently without effect: both values result in the same flush call
     * @param int $itemsPerPage Limit the amount of entries per page default is 10; values <= 0 disable the limit
     * @return array
     */
    public function getLogEntriesForPageId($id, QueueFilter $queueFilter, $doFlush = false, $doFullFlush = false, $itemsPerPage = 10)
    {
        $queryBuilder = GeneralUtility::makeInstance(ConnectionPool::class)->getQueryBuilderForTable($this->tableName);
        $queryBuilder
            ->select('*')
            ->from($this->tableName)
            ->where(
                $queryBuilder->expr()->eq('page_id', $queryBuilder->createNamedParameter($id, \PDO::PARAM_INT))
            )
            ->orderBy('scheduled', 'DESC');

        // Loose comparison of the filter object against the filter names.
        // NOTE(review): presumably relies on QueueFilter being convertible to string - confirm.
        switch ($queueFilter) {
            case 'pending':
                $queryBuilder->andWhere($queryBuilder->expr()->eq('exec_time', 0));
                break;
            case 'finished':
                $queryBuilder->andWhere($queryBuilder->expr()->gt('exec_time', 0));
                break;
        }

        if ($doFlush) {
            $this->queueRepository->flushQueue($queueFilter);
        }

        if ($itemsPerPage > 0) {
            $queryBuilder->setMaxResults((int) $itemsPerPage);
        }

        return $queryBuilder->execute()->fetchAll();
    }
982
983
    /**
     * Return array of records from crawler queue for input set ID
     *
     * @param int $set_id Set ID for which to look up log entries.
     * @param string $filter Filter: "all" => all entries, "pending" => all that is not yet run, "finished" => all complete ones
     * @param bool $doFlush If TRUE, the entries are flushed instead of selected and an empty array is returned
     * @param bool $doFullFlush If TRUE (together with $doFlush), the queue is flushed without any condition;
     *                          otherwise only entries matching $filter and $set_id are flushed
     * @param int $itemsPerPage Limit the amount of entries per page default is 10; values <= 0 disable the limit
     * @return array
     *
     * @deprecated
     */
    public function getLogEntriesForSetId(int $set_id, string $filter = '', bool $doFlush = false, bool $doFullFlush = false, int $itemsPerPage = 10)
    {
        $queryBuilder = GeneralUtility::makeInstance(ConnectionPool::class)->getQueryBuilderForTable($this->tableName);
        $queryBuilder
            ->select('*')
            ->from($this->tableName)
            ->where(
                $queryBuilder->expr()->eq('set_id', $queryBuilder->createNamedParameter($set_id, \PDO::PARAM_INT))
            )
            ->orderBy('scheduled', 'DESC');

        $expressionBuilder = GeneralUtility::makeInstance(ConnectionPool::class)
            ->getConnectionForTable($this->tableName)
            ->getExpressionBuilder();

        // $flushCondition collects the same constraints as the select query, AND-combined,
        // so that a flush affects exactly the entries the select would have returned.
        $flushCondition = $expressionBuilder->andX();
        switch ($filter) {
            case 'pending':
                $queryBuilder->andWhere($queryBuilder->expr()->eq('exec_time', 0));
                $flushCondition->add($expressionBuilder->eq('exec_time', 0));
                break;
            case 'finished':
                $queryBuilder->andWhere($queryBuilder->expr()->gt('exec_time', 0));
                $flushCondition->add($expressionBuilder->gt('exec_time', 0));
                break;
        }

        if ($doFlush) {
            $addWhere = $flushCondition->add($expressionBuilder->eq('set_id', $set_id));
            $this->flushQueue($doFullFlush ? '' : $addWhere);
            return [];
        }

        if ($itemsPerPage > 0) {
            $queryBuilder->setMaxResults($itemsPerPage);
        }

        return $queryBuilder->execute()->fetchAll();
    }
1035
1036
    /**
     * Adding call back entries to log (called from hooks typically, see indexed search class "class.crawler.php"
     *
     * Inserts one row into tx_crawler_queue whose JSON-encoded parameters carry the
     * call back reference under the key "_CALLBACKOBJ".
     *
     * @param int $setId Set ID
     * @param array $params Parameters to pass to call back function (anything that is not an array is replaced by an empty array)
     * @param string $callBack Call back object reference, eg. 'EXT:indexed_search/class.crawler.php:&tx_indexedsearch_crawler'
     * @param int $page_id Page ID to attach it to
     * @param int $schedule Time at which to activate; 0 means "now"
     */
    public function addQueueEntry_callBack($setId, $params, $callBack, $page_id = 0, $schedule = 0): void
    {
        $callBackParameters = is_array($params) ? $params : [];
        $callBackParameters['_CALLBACKOBJ'] = $callBack;

        $queueRow = [
            'page_id' => (int) $page_id,
            'parameters' => json_encode($callBackParameters),
            // Fall back to the current time when no schedule time is given.
            'scheduled' => ((int) $schedule) ?: $this->getCurrentTime(),
            'exec_time' => 0,
            'set_id' => (int) $setId,
            'result_data' => '',
        ];

        $queueConnection = GeneralUtility::makeInstance(ConnectionPool::class)
            ->getConnectionForTable('tx_crawler_queue');
        $queueConnection->insert('tx_crawler_queue', $queueRow);
    }
1065
1066
    /************************************
1067
     *
1068
     * URL setting
1069
     *
1070
     ************************************/
1071
1072
    /**
     * Setting a URL for crawling:
     *
     * Builds the queue entry for the URL and either registers it internally only
     * (when $this->registerQueueEntriesInternallyOnly is set) or inserts it into
     * tx_crawler_queue unless an equal entry already exists. A signal is emitted for
     * both the "added" and the "duplicate" case.
     *
     * @param int $id Page ID
     * @param string $url Complete URL
     * @param Configuration|array $subCfg Sub configuration array (from TS config); arrays are converted
     *                                    into a Configuration object, a missing or empty "name" is replaced by a placeholder
     * @param int $tstamp Scheduled-time
     * @param string $configurationHash (optional) configuration hash
     * @param bool $skipInnerDuplicationCheck (optional) skip inner duplication check
     * @return bool TRUE only if a new row was inserted into the queue table
     */
    public function addUrl(
        $id,
        $url,
        $subCfg,
        $tstamp,
        $configurationHash = '',
        $skipInnerDuplicationCheck = false
    ) {
        $urlAdded = false;
        $rows = [];

        // Creating parameters:
        $parameters = [
            'url' => $url,
        ];

        // Creates a Configuration Object from array
        if (is_array($subCfg)) {
            // empty() also covers a missing "name" key without raising an undefined index notice.
            if (empty($subCfg['name'])) {
                $subCfg['name'] = 'Config without name';
            }
            $subCfg = Configuration::fromArray($subCfg);
        }

        // fe user group simulation:
        $uGs = implode(',', array_unique(GeneralUtility::intExplode(',', $subCfg->getFeGroups(), true)));
        if ($uGs) {
            $parameters['feUserGroupList'] = $uGs;
        }

        // Setting processing instructions
        $parameters['procInstructions'] = GeneralUtility::trimExplode(',', $subCfg->getProcessingInstructionFilter());
        if (is_array($subCfg->getProcessingInstructionParameters())) {
            $parameters['procInstrParams'] = $subCfg->getProcessingInstructionParameters();
        }

        // Compile value array:
        $parameters_serialized = json_encode($parameters);
        $fieldArray = [
            'page_id' => (int) $id,
            'parameters' => $parameters_serialized,
            'parameters_hash' => GeneralUtility::shortMD5($parameters_serialized),
            'configuration_hash' => $configurationHash,
            'scheduled' => $tstamp,
            'exec_time' => 0,
            'set_id' => (int) $this->setID,
            'result_data' => '',
            'configuration' => $subCfg,
        ];

        if ($this->registerQueueEntriesInternallyOnly) {
            // the entries will only be registered and not stored to the database
            $this->queueEntries[] = $fieldArray;

            return $urlAdded;
        }

        if (! $skipInnerDuplicationCheck) {
            // check if there is already an equal entry
            $rows = $this->getDuplicateRowsIfExist($tstamp, $fieldArray);
        }

        if (empty($rows)) {
            $connectionForCrawlerQueue = GeneralUtility::makeInstance(ConnectionPool::class)->getConnectionForTable('tx_crawler_queue');
            $connectionForCrawlerQueue->insert(
                'tx_crawler_queue',
                $fieldArray
            );
            $uid = $connectionForCrawlerQueue->lastInsertId('tx_crawler_queue', 'qid');

            $urlAdded = true;

            $signalPayload = ['uid' => $uid, 'fieldArray' => $fieldArray];
            SignalSlotUtility::emitSignal(
                self::class,
                SignalSlotUtility::SIGNAL_URL_ADDED_TO_QUEUE,
                $signalPayload
            );
        } else {
            $signalPayload = ['rows' => $rows, 'fieldArray' => $fieldArray];
            SignalSlotUtility::emitSignal(
                self::class,
                SignalSlotUtility::SIGNAL_DUPLICATE_URL_IN_QUEUE,
                $signalPayload
            );
        }

        return $urlAdded;
    }
1171
1172
    /**
     * Returns the current system time
     *
     * @return int Current Unix timestamp as reported by time()
     */
    public function getCurrentTime()
    {
        $currentTimestamp = time();

        return $currentTimestamp;
    }
1181
1182
    /************************************
1183
     *
1184
     * URL reading
1185
     *
1186
     ************************************/
1187
1188
    /**
     * Read URL for single queue entry
     *
     * Loads the queue record, marks it as being executed (exec_time), runs it through
     * the queue executor and stores the JSON-encoded result in result_data. Signals are
     * emitted before and after processing.
     *
     * @param int $queueId
     * @param bool $force If set, will process even if exec_time has been set!
     * @return int|null Bitmask, CLI_STATUS_POLLABLE_PROCESSED is set when a registered "pollSuccess"
     *                  instruction reported success; NULL if no matching queue record was found
     */
    public function readUrl($queueId, $force = false)
    {
        $queryBuilder = GeneralUtility::makeInstance(ConnectionPool::class)->getQueryBuilderForTable($this->tableName);
        $ret = 0;
        $this->logger->debug('crawler-readurl start ' . microtime(true));
        // Get entry:
        $queryBuilder
            ->select('*')
            ->from('tx_crawler_queue')
            ->where(
                $queryBuilder->expr()->eq('qid', $queryBuilder->createNamedParameter($queueId, \PDO::PARAM_INT))
            );
        if (! $force) {
            // Only entries that have not been executed yet and have a process scheduled.
            $queryBuilder
                ->andWhere('exec_time = 0')
                ->andWhere('process_scheduled > 0');
        }
        $queueRec = $queryBuilder->execute()->fetch();

        if (! is_array($queueRec)) {
            return null;
        }

        SignalSlotUtility::emitSignal(
            self::class,
            SignalSlotUtility::SIGNAL_QUEUEITEM_PREPROCESS,
            [$queueId, &$queueRec]
        );

        // Set exec_time to lock record:
        $field_array = ['exec_time' => $this->getCurrentTime()];

        if (isset($this->processID)) {
            // if multiprocessing is used we need to store the id of the process which has handled this entry
            $field_array['process_id_completed'] = $this->processID;
        }

        GeneralUtility::makeInstance(ConnectionPool::class)->getConnectionForTable('tx_crawler_queue')
            ->update(
                'tx_crawler_queue',
                $field_array,
                ['qid' => (int) $queueId]
            );

        $result = $this->queueExecutor->executeQueueItem($queueRec, $this);

        // Decode the executor's content. It stays NULL when there is no content, so that the
        // pollSuccess check below never indexes into a non-array value.
        $resultData = null;
        if (($result['content'] ?? null) !== null) {
            /** @var JsonCompatibilityConverter $jsonCompatibilityConverter */
            $jsonCompatibilityConverter = GeneralUtility::makeInstance(JsonCompatibilityConverter::class);
            $resultData = $jsonCompatibilityConverter->convert($result['content']);
        }

        // atm there's no need to point to specific pollable extensions
        $pollables = $GLOBALS['TYPO3_CONF_VARS']['EXTCONF']['crawler']['pollSuccess'] ?? null;
        $procInstructions = is_array($resultData) ? ($resultData['parameters']['procInstructions'] ?? null) : null;
        if (is_array($pollables) && is_array($procInstructions)) {
            foreach ($pollables as $pollable) {
                // only check the success value if the instruction is running
                // it is important to name the pollSuccess key same as the procInstructions key
                if (in_array($pollable, $procInstructions, true) && ! empty($resultData['success'][$pollable])) {
                    $ret |= self::CLI_STATUS_POLLABLE_PROCESSED;
                }
            }
        }

        // Set result in log which also denotes the end of the processing of this entry.
        $field_array = ['result_data' => json_encode($result)];

        SignalSlotUtility::emitSignal(
            self::class,
            SignalSlotUtility::SIGNAL_QUEUEITEM_POSTPROCESS,
            [$queueId, &$field_array]
        );

        GeneralUtility::makeInstance(ConnectionPool::class)->getConnectionForTable('tx_crawler_queue')
            ->update(
                'tx_crawler_queue',
                $field_array,
                ['qid' => (int) $queueId]
            );

        $this->logger->debug('crawler-readurl stop ' . microtime(true));
        return $ret;
    }
1285
1286
    /**
     * Read URL for not-yet-inserted log-entry
     *
     * Inserts the given entry into the queue table with exec_time set to the current
     * time, executes it right away and writes the JSON-encoded result back to the new row.
     *
     * @param array $field_array Queue field array,
     *
     * @return array|bool|mixed|string Result as returned by the queue executor
     */
    public function readUrlFromArray($field_array)
    {
        $queueTable = $this->tableName;
        $queueConnection = GeneralUtility::makeInstance(ConnectionPool::class)->getConnectionForTable($queueTable);

        // Set exec_time to lock record:
        $field_array['exec_time'] = $this->getCurrentTime();
        $queueConnection->insert($queueTable, $field_array);
        $insertedQueueId = $queueConnection->lastInsertId($queueTable, 'qid');

        $executionResult = $this->queueExecutor->executeQueueItem($field_array, $this);

        // Set result in log which also denotes the end of the processing of this entry.
        // Passed by reference so that signal slots can adjust what gets stored.
        $resultFields = ['result_data' => json_encode($executionResult)];
        SignalSlotUtility::emitSignal(
            self::class,
            SignalSlotUtility::SIGNAL_QUEUEITEM_POSTPROCESS,
            [$insertedQueueId, &$resultFields]
        );

        $queueConnection->update($queueTable, $resultFields, ['qid' => $insertedQueueId]);

        return $executionResult;
    }
1322
1323
    /*****************************
1324
     *
1325
     * Compiling URLs to crawl - tools
1326
     *
1327
     *****************************/
1328
1329
    /**
     * Walks the page tree below a root page and renders the URL rows for every page in it.
     *
     * The arguments are stored on the controller, the duplicate tracking and the list of
     * download URLs are reset, and drawURLs_addRowsForPage() is called for each tree entry.
     * For mount point pages the rows of the mounted subtree are rendered first.
     *
     * @param int $id Root page id to start from.
     * @param int $depth Depth of tree, 0=only id-page, 1= on sublevel, 99 = infinite
     * @param int $scheduledTime Unix Time when the URL is timed to be visited when put in queue
     * @param int $reqMinute Number of requests per minute (creates the interleave between requests)
     * @param bool $submitCrawlUrls If set, submits the URLs to queue in database (real crawling)
     * @param bool $downloadCrawlUrls If set (and submitcrawlUrls is false) will fill $downloadUrls with entries)
     * @param array $incomingProcInstructions Array of processing instructions
     * @param array $configurationSelection Array of configuration keys
     * @return string Concatenated output of drawURLs_addRowsForPage() for all visited pages
     */
    public function getPageTreeAndUrls(
        $id,
        $depth,
        $scheduledTime,
        $reqMinute,
        $submitCrawlUrls,
        $downloadCrawlUrls,
        array $incomingProcInstructions,
        array $configurationSelection
    ) {
        // Keep the request settings on the instance.
        $this->scheduledTime = $scheduledTime;
        $this->reqMinute = $reqMinute;
        $this->submitCrawlUrls = $submitCrawlUrls;
        $this->downloadCrawlUrls = $downloadCrawlUrls;
        $this->incomingProcInstructions = $incomingProcInstructions;
        $this->incomingConfigurationSelection = $configurationSelection;

        // Start every run with clean tracking state.
        $this->duplicateTrack = [];
        $this->downloadUrls = [];

        /* @var PageTreeView $pageTree */
        $pageTree = GeneralUtility::makeInstance(PageTreeView::class);
        $permsClause = $this->getBackendUser()->getPagePermsClause(Permission::PAGE_SHOW);
        $pageTree->init('AND ' . $permsClause);

        // The root page itself becomes the first tree entry when it is accessible.
        $rootRow = BackendUtility::readPageAccess($id, $permsClause);
        if (is_array($rootRow)) {
            $pageTree->tree[] = [
                'row' => $rootRow,
                'HTML' => $this->iconFactory->getIconForRecord('pages', $rootRow, Icon::SIZE_SMALL),
            ];
        }

        // Add the branch beneath the root page.
        if ($depth) {
            $pageTree->getTree($id, $depth, '');
        }

        $renderedRows = [];

        foreach ($pageTree->tree as $treeEntry) {
            $this->MP = false;

            if ($treeEntry['row']['doktype'] === PageRepository::DOKTYPE_MOUNTPOINT) {
                $mountPage = $this->pageRepository->getPage($treeEntry['row']['uid']);

                // MP is set while the mounted pages are rendered.
                $this->MP = $mountPage[0]['mount_pid'] . '-' . $treeEntry['row']['uid'];

                $mountedTree = GeneralUtility::makeInstance(PageTreeView::class);
                $mountedTree->init('AND ' . $permsClause);
                $mountedTree->getTree($mountPage[0]['mount_pid'], $depth);

                foreach ($mountedTree->tree as $mountedEntry) {
                    $renderedRows[] = $this->drawURLs_addRowsForPage(
                        $mountedEntry['row'],
                        $mountedEntry['HTML'] . BackendUtility::getRecordTitle('pages', $mountedEntry['row'], true)
                    );
                }

                if (! $mountPage[0]['mount_pid_ol']) {
                    // if the mount_pid_ol is not set the MP must not be used for the mountpoint page
                    $this->MP = false;
                } else {
                    // replace page when mount_pid_ol is enabled
                    $treeEntry['row']['uid'] = $mountPage[0]['mount_pid'];
                }
            }

            $renderedRows[] = $this->drawURLs_addRowsForPage(
                $treeEntry['row'],
                $treeEntry['HTML'] . BackendUtility::getRecordTitle('pages', $treeEntry['row'], true)
            );
        }

        return implode('', $renderedRows);
    }
1421
1422
    /**
     * Expands exclude string
     *
     * The string is a comma separated list of entries of the form "pid" or "pid+depth".
     * A plain "pid" contributes only that page id. "pid+depth" additionally contributes the
     * uids of the tree fetched for that pid and depth. An entry that contains "+" but has an
     * empty depth (empty() also treats "0" as empty) is expanded with a depth of 99.
     * Results are memoised in static variables: per exclude string, and per pid and depth.
     *
     * @param string $excludeString Exclude string
     * @return array Unique page ids (integers) as returned by array_unique()
     */
    public function expandExcludeString($excludeString)
    {
        // internal static caches;
        static $expandedExcludeStringCache;
        static $treeCache;

        if (empty($expandedExcludeStringCache[$excludeString])) {
            $pidList = [];

            if (! empty($excludeString)) {
                /** @var PageTreeView $tree */
                $tree = GeneralUtility::makeInstance(PageTreeView::class);
                $tree->init('AND ' . $this->getBackendUser()->getPagePermsClause(Permission::PAGE_SHOW));

                $excludeParts = GeneralUtility::trimExplode(',', $excludeString);

                foreach ($excludeParts as $excludePart) {
                    // Pad to two elements: an entry without "+" has no depth part, and
                    // destructuring a one-element array would raise an undefined offset.
                    [$pid, $depth] = array_pad(GeneralUtility::trimExplode('+', $excludePart), 2, '');

                    // default is "page only" = "depth=0"
                    if (empty($depth)) {
                        $depth = strpos($excludePart, '+') !== false ? 99 : 0;
                    }

                    // getTree() documents integers; cast once and use the integers everywhere below.
                    $pid = (int) $pid;
                    $depth = (int) $depth;

                    $pidList[] = $pid;

                    if ($depth > 0) {
                        if (empty($treeCache[$pid][$depth])) {
                            $tree->reset();
                            $tree->getTree($pid, $depth);
                            $treeCache[$pid][$depth] = $tree->tree;
                        }

                        foreach ($treeCache[$pid][$depth] as $data) {
                            $pidList[] = (int) $data['row']['uid'];
                        }
                    }
                }
            }

            $expandedExcludeStringCache[$excludeString] = array_unique($pidList);
        }

        return $expandedExcludeStringCache[$excludeString];
    }
1473
1474
    /**
     * Create the rows for display of the page tree
     * For each page a number of rows are shown displaying GET variable configuration
     *
     * @param array $pageRow Page record; its "uid" is checked against the exclude list of each configuration
     * @param string $pageTitle Content of the title cell; it is inserted as-is and not escaped here
     * @return string Table rows (<tr>...</tr>) for the page
     */
    public function drawURLs_addRowsForPage(array $pageRow, string $pageTitle): string
    {
        // NOTE(review): read again further down, so getUrlsForPageRow() presumably fills it by reference - confirm.
        $skipMessage = '';

        // Get list of configurations
        $configurations = $this->getUrlsForPageRow($pageRow, $skipMessage);

        if (! empty($this->incomingConfigurationSelection)) {
            // remove configuration that does not match the current selection
            foreach ($configurations as $confKey => $confArray) {
                if (! in_array($confKey, $this->incomingConfigurationSelection, true)) {
                    unset($configurations[$confKey]);
                }
            }
        }

        // expandExcludeString() returns integers, so the strict comparison below needs an
        // integer uid as well; a uid delivered as string would otherwise never match.
        $pageUid = (int) ($pageRow['uid'] ?? 0);

        // Traverse parameter combinations:
        $isFirstRow = true;
        $content = '';
        if (! empty($configurations)) {
            foreach ($configurations as $confKey => $confArray) {
                // Array keys that look like integers are integers in PHP; htmlspecialchars()
                // only accepts strings under strict_types.
                $confKeyLabel = htmlspecialchars((string) $confKey);

                // Title column: only the first row carries the (row-spanning) title cell
                if ($isFirstRow) {
                    $titleClm = '<td rowspan="' . count($configurations) . '">' . $pageTitle . '</td>';
                } else {
                    $titleClm = '';
                }

                $excludedPageIds = $this->expandExcludeString($confArray['subCfg']['exclude'] ?? '');

                if (! in_array($pageUid, $excludedPageIds, true)) {
                    // URL list:
                    $urlList = $this->urlListFromUrlArray(
                        $confArray,
                        $pageRow,
                        $this->scheduledTime,
                        $this->reqMinute,
                        $this->submitCrawlUrls,
                        $this->downloadCrawlUrls,
                        $this->duplicateTrack,
                        $this->downloadUrls,
                        // if empty the urls won't be filtered by processing instructions
                        $this->incomingProcInstructions
                    );

                    // Expanded parameters:
                    $paramExpanded = '';
                    $calcAccu = [];
                    $calcRes = 1;
                    foreach ($confArray['paramExpanded'] as $gVar => $gVal) {
                        $paramExpanded .= '
                            <tr>
                                <td>' . htmlspecialchars('&' . $gVar . '=') . '<br/>' .
                            '(' . count($gVal) . ')' .
                            '</td>
                                <td nowrap="nowrap">' . nl2br(htmlspecialchars(implode(chr(10), $gVal))) . '</td>
                            </tr>
                        ';
                        $calcRes *= count($gVal);
                        $calcAccu[] = count($gVal);
                    }
                    $paramExpanded = '<table>' . $paramExpanded . '</table>';
                    $paramExpanded .= 'Comb: ' . implode('*', $calcAccu) . '=' . $calcRes;

                    // Options (both settings are optional)
                    $optionValues = '';
                    if (! empty($confArray['subCfg']['userGroups'])) {
                        $optionValues .= 'User Groups: ' . $confArray['subCfg']['userGroups'] . '<br/>';
                    }
                    if (! empty($confArray['subCfg']['procInstrFilter'])) {
                        $optionValues .= 'ProcInstr: ' . $confArray['subCfg']['procInstrFilter'] . '<br/>';
                    }

                    // Compile row:
                    $content .= '
                        <tr>
                            ' . $titleClm . '
                            <td>' . $confKeyLabel . '</td>
                            <td>' . nl2br(htmlspecialchars(rawurldecode(trim(str_replace('&', chr(10) . '&', GeneralUtility::implodeArrayForUrl('', $confArray['paramParsed'])))))) . '</td>
                            <td>' . $paramExpanded . '</td>
                            <td nowrap="nowrap">' . $urlList . '</td>
                            <td nowrap="nowrap">' . $optionValues . '</td>
                            <td nowrap="nowrap">' . DebugUtility::viewArray($confArray['subCfg']['procInstrParams.'] ?? null) . '</td>
                        </tr>';
                } else {
                    $content .= '<tr>
                            ' . $titleClm . '
                            <td>' . $confKeyLabel . '</td>
                            <td colspan="5"><em>No entries</em> (Page is excluded in this configuration)</td>
                        </tr>';
                }

                $isFirstRow = false;
            }
        } else {
            $message = ! empty($skipMessage) ? ' (' . $skipMessage . ')' : '';

            // Compile row:
            $content .= '
                <tr>
                    <td>' . $pageTitle . '</td>
                    <td colspan="6"><em>No entries</em>' . $message . '</td>
                </tr>';
        }

        return $content;
    }
1585
1586
    /*****************************
1587
     *
1588
     * CLI functions
1589
     *
1590
     *****************************/
1591
1592
    /**
     * Running the functionality of the CLI (crawling URLs from queue)
     *
     * @param int $countInARun Passed to fetchRecordsToBeCrawled() when selecting the entries for this run
     * @param int $sleepTime Pause after each processed entry, passed to usleep()
     * @param int $sleepAfterFinish Pause after the run, passed to sleep()
     * @return int Bit mask built from the readUrl() results and the CLI_STATUS_* constants
     */
    public function CLI_run(int $countInARun, int $sleepTime, int $sleepAfterFinish): int
    {
        $status = 0;
        $processedItems = 0;

        // First, run hooks:
        $this->CLI_runHooks();

        // Clean up the queue
        $this->queueRepository->cleanupQueue();

        // Select entries:
        $queueRows = $this->queueRepository->fetchRecordsToBeCrawled($countInARun);

        if (empty($queueRows)) {
            $this->CLI_debug('Nothing within queue which needs to be processed (' . $this->CLI_buildProcessId() . ')');

            return $status;
        }

        $queueIds = [];
        foreach ($queueRows as $queueRow) {
            $queueIds[] = $queueRow['qid'];
        }

        $processId = $this->CLI_buildProcessId();

        // save the number of assigned queue entries to determine how many have been processed later
        $assignedItems = $this->queueRepository->updateProcessIdAndSchedulerForQueueIds($queueIds, $processId);
        $this->processRepository->updateProcessAssignItemsCount($assignedItems, $processId);

        if ($assignedItems !== count($queueIds)) {
            $this->CLI_debug('Nothing processed due to multi-process collision (' . $this->CLI_buildProcessId() . ')');

            return $status | self::CLI_STATUS_ABORTED;
        }

        foreach ($queueRows as $queueRow) {
            $status |= $this->readUrl($queueRow['qid']);
            $processedItems++;

            // Just to relax the system
            usleep($sleepTime);

            // if the cli has been disabled since the start we return right away;
            // the process is NOT marked as ended.
            if ($this->crawler->isDisabled()) {
                return $status | self::CLI_STATUS_ABORTED;
            }

            if (! $this->processRepository->isProcessActive($this->CLI_buildProcessId())) {
                // possible timeout
                $this->CLI_debug('conflict / timeout (' . $this->CLI_buildProcessId() . ')');
                $status |= self::CLI_STATUS_ABORTED;
                break;
            }
        }

        sleep($sleepAfterFinish);

        $this->CLI_debug('Rows: ' . $processedItems . ' (' . $this->CLI_buildProcessId() . ')');

        if ($processedItems > 0) {
            $status |= self::CLI_STATUS_PROCESSED;
        }

        return $status;
    }
1662
1663
    /**
     * Activate hooks
     *
     * Instantiates every class registered in
     * $GLOBALS['TYPO3_CONF_VARS']['EXTCONF']['crawler']['cli_hooks'] and calls its
     * crawler_init() method with this controller.
     */
    public function CLI_runHooks(): void
    {
        $hookClasses = $GLOBALS['TYPO3_CONF_VARS']['EXTCONF']['crawler']['cli_hooks'] ?? [];

        foreach ($hookClasses as $hookClass) {
            $hook = GeneralUtility::makeInstance($hookClass);
            if (! is_object($hook)) {
                continue;
            }

            $hook->crawler_init($this);
        }
    }
1675
1676
    /**
     * Try to acquire a new process with the given id
     * also performs some auto-cleanup for orphan processes
     *
     * Active processes whose ttl is lower than the current time are treated as orphans and
     * released at the end; the others count against the "processLimit" extension setting.
     *
     * @param string $id identification string for the process
     * @return bool true if a process record was inserted, false if the limit is reached
     *              or the system process id could not be determined
     * @todo preemption might not be the most elegant way to clean up
     */
    public function CLI_checkAndAcquireNewProcess($id)
    {
        $systemProcessId = getmypid();
        if (! $systemProcessId) {
            return false;
        }

        $runningProcesses = 0;
        $orphanProcessIds = [];

        $activeProcesses = $this->processRepository->findAllActive();
        $now = $this->getCurrentTime();

        /** @var Process $activeProcess */
        foreach ($activeProcesses as $activeProcess) {
            if ($activeProcess->getTtl() < $now) {
                $orphanProcessIds[] = $activeProcess->getProcessId();
                continue;
            }

            $runningProcesses++;
        }

        $processLimit = (int) $this->extensionSettings['processLimit'];

        // if there are less than allowed active processes then add a new one
        $acquired = $runningProcesses < $processLimit;

        if ($acquired) {
            $this->CLI_debug('add process ' . $this->CLI_buildProcessId() . ' (' . ($runningProcesses + 1) . '/' . $processLimit . ')');

            $processRecord = [
                'process_id' => $id,
                'active' => 1,
                'ttl' => $now + (int) $this->extensionSettings['processMaxRunTime'],
                'system_process_id' => $systemProcessId,
            ];
            GeneralUtility::makeInstance(ConnectionPool::class)
                ->getConnectionForTable('tx_crawler_process')
                ->insert('tx_crawler_process', $processRecord);
        } else {
            $this->CLI_debug('Processlimit reached (' . $runningProcesses . '/' . $processLimit . ')');
        }

        // The cleanup runs regardless of whether a process was acquired.
        $this->processRepository->deleteProcessesMarkedAsDeleted();
        $this->CLI_releaseProcesses($orphanProcessIds);

        return $acquired;
    }
1730
1731
    /**
     * Release a process and the required resources
     *
     * @param mixed $releaseIds string with a single process-id or array with multiple process-ids
     * @return bool false if there was nothing to release, true otherwise
     */
    public function CLI_releaseProcesses($releaseIds)
    {
        $queryBuilder = GeneralUtility::makeInstance(ConnectionPool::class)->getQueryBuilderForTable($this->tableName);

        $releaseIds = is_array($releaseIds) ? $releaseIds : [$releaseIds];

        if (count($releaseIds) === 0) {
            // nothing to release
            return false;
        }

        // some kind of 2nd chance algo - this way you need at least 2 processes to have a real cleanup
        // this ensures that a single process can't mess up the entire process table

        // Detach queue entries from processes that are no longer active.
        $queryBuilder
            ->update($this->tableName, 'q')
            ->set('q.process_scheduled', 0)
            ->set('q.process_id', '')
            ->where(
                'q.process_id IN(SELECT p.process_id FROM tx_crawler_process as p WHERE p.active = 0)'
            )
            ->execute();

        // FIXME: Not entirely sure that this is equivalent to the previous version
        // The same builder is used again, so the SET part of the first statement is dropped first.
        $queryBuilder->resetQueryPart('set');

        // Clear the system process id of inactive processes that are referenced by
        // queue entries with exec_time = 0.
        $queryBuilder
            ->update('tx_crawler_process')
            ->set('system_process_id', 0)
            ->where(
                $queryBuilder->expr()->eq('active', 0),
                'process_id IN(SELECT q.process_id FROM tx_crawler_queue as q WHERE q.exec_time = 0)'
            )
            ->execute();

        $this->processRepository->markRequestedProcessesAsNotActive($releaseIds);
        $this->queueRepository->unsetProcessScheduledAndProcessIdForQueueEntries($releaseIds);

        return true;
    }
1781
1782
    /**
     * Create a unique Id for the current process
     *
     * The id is generated from microtime() on the first call and kept in $this->processID,
     * so later calls return the same value.
     *
     * @return string the ID
     */
    public function CLI_buildProcessId()
    {
        if ($this->processID) {
            return $this->processID;
        }

        $this->processID = GeneralUtility::shortMD5(microtime(true));

        return $this->processID;
    }
1794
1795
    /**
     * Prints a message to the stdout (only if debug-mode is enabled)
     *
     * Debug mode is the "processDebug" extension setting.
     *
     * @param string $msg the message
     * @deprecated
     */
    public function CLI_debug($msg): void
    {
        $debugEnabled = (int) $this->extensionSettings['processDebug'];
        if (! $debugEnabled) {
            return;
        }

        echo $msg . "\n";
        flush();
    }
1808
1809
    /**
     * Cleans up entries that stayed for too long in the queue. These are:
     * - processed entries (exec_time set) older than "cleanUpProcessedAge" days
     * - entries whose scheduled time is at least "cleanUpScheduledAge" days old
     *
     * Both ages are extension settings given in days.
     *
     * @deprecated
     */
    public function cleanUpOldQueueEntries(): void
    {
        // 24*60*60 Seconds in 24 hours
        $secondsPerDay = 86400;
        $processedAgeInSeconds = $this->extensionSettings['cleanUpProcessedAge'] * $secondsPerDay;
        $scheduledAgeInSeconds = $this->extensionSettings['cleanUpScheduledAge'] * $secondsPerDay;

        $now = time();
        $processedBefore = $now - $processedAgeInSeconds;
        $scheduledBefore = $now - $scheduledAgeInSeconds;

        $this->flushQueue(
            '(exec_time<>0 AND exec_time<' . $processedBefore . ') OR scheduled<=' . $scheduledBefore
        );
    }
1826
1827
    /**
     * Removes queue entries
     *
     * Before deleting, the matching entries are grouped by set_id and a
     * SIGNAL_QUEUE_ENTRY_FLUSH signal is emitted once per set with the affected rows.
     *
     * @param string $where SQL related filter for the entries which should be removed;
     *                      an empty filter removes all entries
     *
     * @deprecated
     */
    protected function flushQueue($where = ''): void
    {
        $realWhere = (string) $where !== '' ? $where : '1=1';

        // The same builder instance is used for all statements below.
        $queryBuilder = $this->getQueryBuilder($this->tableName);

        $groups = $queryBuilder
            ->selectLiteral('DISTINCT set_id')
            ->from($this->tableName)
            ->where($realWhere)
            ->execute()
            ->fetchAll();
        $setGroups = is_array($groups) ? $groups : [];

        foreach ($setGroups as $setGroup) {
            $matchesSet = $queryBuilder->expr()->eq('set_id', $setGroup['set_id']);

            $subSet = $queryBuilder
                ->select('qid', 'set_id')
                ->from($this->tableName)
                ->where($realWhere, $matchesSet)
                ->execute()
                ->fetchAll();

            $payLoad = ['subSet' => $subSet];
            SignalSlotUtility::emitSignal(
                self::class,
                SignalSlotUtility::SIGNAL_QUEUE_ENTRY_FLUSH,
                $payLoad
            );
        }

        $queryBuilder
            ->delete($this->tableName)
            ->where($realWhere)
            ->execute();
    }
1872
1873
    /**
     * This method determines duplicates for a queue entry with the same parameters and this timestamp.
     * If the timestamp is not in the future, it looks for entries scheduled up to the current time
     * (with "enableTimeslot" set, additionally for entries scheduled within +/- 100 seconds of it).
     * If the timestamp is in the future, the queued entry must have exactly the same timestamp.
     * The conditions on exec_time, process_id, page_id and parameters_hash are added in both cases.
     *
     * @param int $tstamp
     * @param array $fieldArray Must provide "page_id" and "parameters_hash"
     *
     * @return array qid values of the matching queue entries
     */
    protected function getDuplicateRowsIfExist($tstamp, $fieldArray)
    {
        $currentTime = $this->getCurrentTime();

        $queryBuilder = GeneralUtility::makeInstance(ConnectionPool::class)->getQueryBuilderForTable($this->tableName);
        $queryBuilder
            ->select('qid')
            ->from('tx_crawler_queue');

        if ($tstamp <= $currentTime) {
            // this entry is scheduled with "now"
            if ($this->extensionSettings['enableTimeslot']) {
                $windowStart = $currentTime - 100;
                $windowEnd = $currentTime + 100;
                $queryBuilder
                    ->where('scheduled BETWEEN ' . $windowStart . ' AND ' . $windowEnd)
                    ->orWhere($queryBuilder->expr()->lte('scheduled', $currentTime));
            } else {
                $queryBuilder->where($queryBuilder->expr()->lte('scheduled', $currentTime));
            }
        } elseif ($tstamp > $currentTime) {
            // entry with a timestamp in the future need to have the same schedule time
            $queryBuilder->where($queryBuilder->expr()->eq('scheduled', $tstamp));
        }

        $pageIdParameter = $queryBuilder->createNamedParameter($fieldArray['page_id'], \PDO::PARAM_INT);
        $hashParameter = $queryBuilder->createNamedParameter($fieldArray['parameters_hash'], \PDO::PARAM_STR);

        $queryBuilder
            ->andWhere('NOT exec_time')
            ->andWhere('NOT process_id')
            ->andWhere($queryBuilder->expr()->eq('page_id', $pageIdParameter))
            ->andWhere($queryBuilder->expr()->eq('parameters_hash', $hashParameter));

        $statement = $queryBuilder->execute();

        $duplicateQueueIds = [];
        while ($row = $statement->fetch()) {
            $duplicateQueueIds[] = $row['qid'];
        }

        return $duplicateQueueIds;
    }
1933
1934
    /**
     * Returns a md5 hash generated from a serialized configuration array.
     *
     * The "paramExpanded" and "URLs" entries are removed before hashing, so they do not
     * influence the result.
     *
     * @return string
     */
    protected function getConfigurationHash(array $configuration)
    {
        unset($configuration['paramExpanded'], $configuration['URLs']);

        $serializedConfiguration = serialize($configuration);

        return md5($serializedConfiguration);
    }
1945
1946
    /**
     * Build a URL from a Page and the Query String. If the page has a Site configuration, it can be built by using
     * the Site instance.
     *
     * Thin wrapper that forwards all arguments to UrlService::getUrlFromPageAndQueryParameters().
     *
     * @param int $httpsOrHttp see tx_crawler_configuration.force_ssl
     * @throws \TYPO3\CMS\Core\Exception\SiteNotFoundException
     * @throws \TYPO3\CMS\Core\Routing\InvalidRouteArgumentsException
     *
     * @deprecated Using CrawlerController::getUrlFromPageAndQueryParameters() is deprecated since 9.1.1 and will be removed in v11.x, please use UrlService->getUrlFromPageAndQueryParameters() instead.
     */
    protected function getUrlFromPageAndQueryParameters(int $pageId, string $queryString, ?string $alternativeBaseUrl, int $httpsOrHttp): UriInterface
    {
        return (new UrlService())->getUrlFromPageAndQueryParameters(
            $pageId,
            $queryString,
            $alternativeBaseUrl,
            $httpsOrHttp
        );
    }
1961
1962 1
    /**
     * Makes sure the element at index 1 is not larger than the element at index 2.
     *
     * @param array $reg Array with (at least) the indexes 1 and 2
     * @return array The array with the values of index 1 and 2 exchanged if the first was larger
     */
    protected function swapIfFirstIsLargerThanSecond(array $reg): array
    {
        if ($reg[1] > $reg[2]) {
            // Swap if first is larger than last:
            [$reg[1], $reg[2]] = [$reg[2], $reg[1]];
        }

        return $reg;
    }
1973
1974
    /**
     * Returns the backend user, taking it from $GLOBALS['BE_USER'] on first use.
     *
     * Backend authentication is initialized on every call before the user is read.
     *
     * @return BackendUserAuthentication
     */
    private function getBackendUser()
    {
        // Make sure the _cli_ user is loaded
        Bootstrap::initializeBackendAuthentication();

        $this->backendUser = $this->backendUser ?? $GLOBALS['BE_USER'];

        return $this->backendUser;
    }
1986
1987
    /**
     * Get querybuilder for given table
     *
     * @param string $table Name of the table the query builder is requested for
     * @return \TYPO3\CMS\Core\Database\Query\QueryBuilder
     */
    private function getQueryBuilder(string $table)
    {
        $connectionPool = GeneralUtility::makeInstance(ConnectionPool::class);

        return $connectionPool->getQueryBuilderForTable($table);
    }
1996
}
1997