Passed
Push — testing/behaviour-104 ( 344e70...67b3c9 )
by Tomas Norre
36:59 queued 19:28
created

CrawlerController::getPageTreeAndUrls()   B

Complexity

Conditions 7
Paths 16

Size

Total Lines 90
Code Lines 48

Duplication

Lines 0
Ratio 0 %

Importance

Changes 1
Bugs 0 Features 0
Metric Value
cc 7
eloc 48
c 1
b 0
f 0
nc 16
nop 8
dl 0
loc 90
rs 8.2012

How to fix   Long Method    Many Parameters   

Long Method

Small methods make your code easier to understand, in particular if combined with a good name. Besides, if your method is small, finding a good name is usually much easier.

For example, if you find yourself adding comments to a method's body, this is usually a good sign to extract the commented part to a new method, and use the comment as a starting point when coming up with a good name for this new method.

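Commonly applied refactorings include Extract Method; when a method has many parameters or temporary variables, Replace Temp with Query, Introduce Parameter Object and Preserve Whole Object also apply.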

Many Parameters

Methods with many parameters are not only hard to understand, but their parameters also often become inconsistent when you need more, or different data.

There are several approaches to avoid long parameter lists, for example Replace Parameter with Method, Preserve Whole Object and Introduce Parameter Object.

1
<?php
2
3
declare(strict_types=1);
4
5
namespace AOE\Crawler\Controller;
6
7
/***************************************************************
8
 *  Copyright notice
9
 *
10
 *  (c) 2020 AOE GmbH <[email protected]>
11
 *
12
 *  All rights reserved
13
 *
14
 *  This script is part of the TYPO3 project. The TYPO3 project is
15
 *  free software; you can redistribute it and/or modify
16
 *  it under the terms of the GNU General Public License as published by
17
 *  the Free Software Foundation; either version 3 of the License, or
18
 *  (at your option) any later version.
19
 *
20
 *  The GNU General Public License can be found at
21
 *  http://www.gnu.org/copyleft/gpl.html.
22
 *
23
 *  This script is distributed in the hope that it will be useful,
24
 *  but WITHOUT ANY WARRANTY; without even the implied warranty of
25
 *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
26
 *  GNU General Public License for more details.
27
 *
28
 *  This copyright notice MUST APPEAR in all copies of the script!
29
 ***************************************************************/
30
31
use AOE\Crawler\Configuration\ExtensionConfigurationProvider;
32
use AOE\Crawler\Converter\JsonCompatibilityConverter;
33
use AOE\Crawler\CrawlStrategy\CrawlStrategyFactory;
34
use AOE\Crawler\Domain\Repository\ConfigurationRepository;
35
use AOE\Crawler\Domain\Repository\ProcessRepository;
36
use AOE\Crawler\Domain\Repository\QueueRepository;
37
use AOE\Crawler\QueueExecutor;
38
use AOE\Crawler\Service\UrlService;
39
use AOE\Crawler\Utility\SignalSlotUtility;
40
use Psr\Http\Message\UriInterface;
41
use Psr\Log\LoggerAwareInterface;
42
use Psr\Log\LoggerAwareTrait;
43
use TYPO3\CMS\Backend\Tree\View\PageTreeView;
44
use TYPO3\CMS\Backend\Utility\BackendUtility;
45
use TYPO3\CMS\Core\Authentication\BackendUserAuthentication;
46
use TYPO3\CMS\Core\Compatibility\PublicMethodDeprecationTrait;
47
use TYPO3\CMS\Core\Core\Bootstrap;
48
use TYPO3\CMS\Core\Core\Environment;
49
use TYPO3\CMS\Core\Database\Connection;
50
use TYPO3\CMS\Core\Database\ConnectionPool;
51
use TYPO3\CMS\Core\Database\Query\Restriction\DeletedRestriction;
52
use TYPO3\CMS\Core\Imaging\Icon;
53
use TYPO3\CMS\Core\Imaging\IconFactory;
54
use TYPO3\CMS\Core\Site\Entity\Site;
55
use TYPO3\CMS\Core\Type\Bitmask\Permission;
56
use TYPO3\CMS\Core\TypoScript\Parser\TypoScriptParser;
57
use TYPO3\CMS\Core\Utility\DebugUtility;
58
use TYPO3\CMS\Core\Utility\GeneralUtility;
59
use TYPO3\CMS\Core\Utility\MathUtility;
60
use TYPO3\CMS\Extbase\Object\ObjectManager;
61
use TYPO3\CMS\Frontend\Page\PageRepository;
62
63
/**
64
 * Class CrawlerController
65
 *
66
 * @package AOE\Crawler\Controller
67
 */
68
class CrawlerController implements LoggerAwareInterface
69
{
70
    use LoggerAwareTrait;
71
    use PublicMethodDeprecationTrait;
72
73
    /*
     * Status values reported by CLI runs.
     */
    public const CLI_STATUS_NOTHING_PROCCESSED = 0;

    public const CLI_STATUS_REMAIN = 1; // queue not empty

    public const CLI_STATUS_PROCESSED = 2; // (some) queue items were processed

    public const CLI_STATUS_ABORTED = 4; // instance didn't finish

    public const CLI_STATUS_POLLABLE_PROCESSED = 8;

    /**
     * @var int
     */
    public $setID = 0;

    /**
     * @var string
     */
    public $processID = '';

    /**
     * @var array
     */
    public $duplicateTrack = [];

    /**
     * @var array
     */
    public $downloadUrls = [];

    /**
     * @var array
     */
    public $incomingProcInstructions = [];

    /**
     * @var array
     */
    public $incomingConfigurationSelection = [];

    /**
     * @var bool
     */
    public $registerQueueEntriesInternallyOnly = false;

    /**
     * @var array
     */
    public $queueEntries = [];

    /**
     * Plain-text "[date] url" entries; urlListFromUrlArray() appends to this list.
     *
     * @var array
     */
    public $urlList = [];

    /**
     * @var array
     */
    public $extensionSettings = [];

    /**
     * Mount Point
     *
     * Falsy when no mount point is in use. Otherwise the code treats it as a string:
     * it is split on "-" in getPageTSconfigForId() and appended as "&MP=" URL
     * parameter in getUrlsForPageId().
     *
     * @var bool|string
     */
    public $MP = false;

    /**
     * Path of the file whose existence marks the crawler as disabled.
     *
     * @var string
     */
    protected $processFilename;

    /**
     * Holds the internal access mode can be 'gui','cli' or 'cli_im'
     *
     * @var string
     */
    protected $accessMode;

    /**
     * @var QueueRepository
     */
    protected $queueRepository;

    /**
     * @var ProcessRepository
     */
    protected $processRepository;

    /**
     * @var ConfigurationRepository
     */
    protected $configurationRepository;

    /**
     * @var string
     */
    protected $tableName = 'tx_crawler_queue';

    /**
     * @var QueueExecutor
     */
    protected $queueExecutor;

    /**
     * @var int
     */
    protected $maximumUrlsToCompile = 10000;

    /**
     * @var IconFactory
     */
    protected $iconFactory;

    /**
     * Deprecation messages keyed by method name.
     *
     * NOTE(review): presumably consumed by PublicMethodDeprecationTrait, which would
     * explain why static analysis reports the property as unused - confirm before removing.
     *
     * @var string[]
     */
    private $deprecatedPublicMethods = [
        'getLogEntriesForSetId' => 'Using crawlerController::getLogEntriesForSetId() is deprecated since 9.0.1 and will be removed in v11.x',
        'flushQueue' => 'Using CrawlerController::flushQueue() is deprecated since 9.0.1 and will be removed in v11.x, please use QueueRepository->flushQueue() instead.',
        'cleanUpOldQueueEntries' => 'Using CrawlerController::cleanUpOldQueueEntries() is deprecated since 9.0.1 and will be removed in v11.x, please use QueueRepository->cleanUpOldQueueEntries() instead.',
    ];

    /**
     * @var BackendUserAuthentication|null
     */
    private $backendUser;

    /**
     * @var int
     */
    private $scheduledTime = 0;

    /**
     * @var int
     */
    private $reqMinute = 0;

    /**
     * @var bool
     */
    private $submitCrawlUrls = false;

    /**
     * @var bool
     */
    private $downloadCrawlUrls = false;
221
222
    /************************************
223
     *
224
     * Getting URLs based on Page TSconfig
225
     *
226
     ************************************/
227
228
    /**
     * Instantiates the repositories, queue executor and icon factory, sets the
     * default process file path and loads the extension settings.
     *
     * Settings that are missing or out of range fall back to defaults:
     * countInARun = 100, processLimit within 1..99 (default 1),
     * maxCompileUrls within 1..1000000000 (default 10000).
     */
    public function __construct()
    {
        $objectManager = GeneralUtility::makeInstance(ObjectManager::class);
        $crawlStrategyFactory = GeneralUtility::makeInstance(CrawlStrategyFactory::class);
        $this->queueRepository = $objectManager->get(QueueRepository::class);
        $this->processRepository = $objectManager->get(ProcessRepository::class);
        $this->configurationRepository = $objectManager->get(ConfigurationRepository::class);
        $this->queueExecutor = GeneralUtility::makeInstance(QueueExecutor::class, $crawlStrategyFactory);
        $this->iconFactory = GeneralUtility::makeInstance(IconFactory::class);

        $this->processFilename = Environment::getVarPath() . '/lock/tx_crawler.proc';

        /** @var ExtensionConfigurationProvider $configurationProvider */
        $configurationProvider = GeneralUtility::makeInstance(ExtensionConfigurationProvider::class);
        $settings = $configurationProvider->getExtensionConfiguration();
        $this->extensionSettings = is_array($settings) ? $settings : [];

        // The settings array may be empty, so every key is read with a fallback
        // instead of triggering an "undefined index" notice.
        if (MathUtility::convertToPositiveInteger($this->extensionSettings['countInARun'] ?? 0) === 0) {
            $this->extensionSettings['countInARun'] = 100;
        }

        $this->extensionSettings['processLimit'] = MathUtility::forceIntegerInRange(
            $this->extensionSettings['processLimit'] ?? 0,
            1,
            99,
            1
        );
        $this->maximumUrlsToCompile = MathUtility::forceIntegerInRange(
            $this->extensionSettings['maxCompileUrls'] ?? 0,
            1,
            1000000000,
            10000
        );
    }
253
254
    /**
     * Returns the current access mode ('gui', 'cli' or 'cli_im').
     *
     * @return string|null null if no access mode has been set yet
     */
    public function getAccessMode()
    {
        return $this->accessMode;
    }
263
264
    /**
     * Stores the access mode that getAccessMode() reports.
     *
     * @param string $accessMode
     */
    public function setAccessMode($accessMode): void
    {
        $this->accessMode = $accessMode;
    }
271
272
    /**
     * Switches the disabled state by creating or removing the process file.
     *
     * @param bool $disabled true (default) writes an empty process file, false removes it if present
     */
    public function setDisabled($disabled = true): void
    {
        if ($disabled) {
            GeneralUtility::writeFile($this->processFilename, '');
            return;
        }

        if (is_file($this->processFilename)) {
            unlink($this->processFilename);
        }
    }
287
288
    /**
     * Reports whether the process file exists.
     *
     * @return bool true if the process file exists, i.e. the crawler is disabled
     */
    public function getDisabled()
    {
        $processFile = $this->processFilename;

        return is_file($processFile);
    }
297
298
    /**
     * Overrides the path of the process file used by setDisabled() / getDisabled().
     *
     * @param string $filenameWithPath
     */
    public function setProcessFilename($filenameWithPath): void
    {
        $this->processFilename = $filenameWithPath;
    }
305
306
    /**
     * Returns the path of the process file used by setDisabled() / getDisabled().
     *
     * @return string
     */
    public function getProcessFilename()
    {
        return $this->processFilename;
    }
313
314
    /**
     * Replaces the extension settings (unserialized pendant of
     * $TYPO3_CONF_VARS['EXT']['extConf']['crawler']).
     *
     * The array is stored as given; the defaults applied in the constructor
     * are not re-applied here.
     */
    public function setExtensionSettings(array $extensionSettings): void
    {
        $this->extensionSettings = $extensionSettings;
    }
321
322
    /**
     * Check if the given page should be crawled.
     *
     * The checks run in this order and the first match wins:
     *  1. the page is hidden and the "crawlHiddenPages" setting is not enabled
     *  2. the doktype is one of the hard-coded excluded values 3, 4, 199, 254, 255
     *  3. the doktype is listed in one of the EXTCONF "excludeDoktype" lists
     *  4. one of the EXTCONF "pageVeto" hooks returns anything other than false
     *
     * @param array $pageRow Page record; the "hidden" and "doktype" columns are evaluated
     * @return false|string false if the page should be crawled, otherwise the reason why it is skipped
     */
    public function checkIfPageShouldBeSkipped(array $pageRow)
    {
        // empty() / ?? keep incomplete settings or page rows from raising notices.
        if (empty($this->extensionSettings['crawlHiddenPages']) && ! empty($pageRow['hidden'])) {
            return 'Because page is hidden';
        }

        $doktype = $pageRow['doktype'] ?? '';

        if (GeneralUtility::inList('3,4,199,254,255', $doktype)) {
            return 'Because doktype is not allowed';
        }

        foreach ($GLOBALS['TYPO3_CONF_VARS']['EXTCONF']['crawler']['excludeDoktype'] ?? [] as $key => $doktypeList) {
            if (GeneralUtility::inList($doktypeList, $doktype)) {
                return 'Doktype was excluded by "' . $key . '"';
            }
        }

        // veto hook
        foreach ($GLOBALS['TYPO3_CONF_VARS']['EXTCONF']['crawler']['pageVeto'] ?? [] as $key => $func) {
            $params = [
                'pageRow' => $pageRow,
            ];
            // expects "false" if page is ok and "true" or a skipMessage if this page should _not_ be crawled
            $veto = GeneralUtility::callUserFunction($func, $params, $this);
            if ($veto === false) {
                continue;
            }

            // no need to execute other hooks if a previous one returned a veto
            if (is_string($veto)) {
                return $veto;
            }

            // Hooks may be registered with numeric keys; under strict_types
            // htmlspecialchars() rejects an int, hence the cast.
            return 'Veto from hook "' . htmlspecialchars((string) $key) . '"';
        }

        return false;
    }
380
381
    /**
     * Wrapper method for getUrlsForPageId()
     * It returns an array of configurations and no urls!
     *
     * @param array $pageRow Page record with at least dok-type and uid columns.
     * @param string $skipMessage Set by reference: the skip reason, or '' if the page is not skipped
     * @return array Configurations for the page, or an empty array if the page is skipped
     * @see getUrlsForPageId()
     */
    public function getUrlsForPageRow(array $pageRow, &$skipMessage = '')
    {
        $message = $this->checkIfPageShouldBeSkipped($pageRow);
        if ($message !== false) {
            $skipMessage = $message;

            return [];
        }

        // getUrlsForPageId() is declared with an int parameter. With strict_types
        // a uid delivered as string would raise a TypeError, so cast it here
        // (urlListFromUrlArray() does the same).
        $res = $this->getUrlsForPageId((int) $pageRow['uid']);
        $skipMessage = '';

        return $res;
    }
403
404
    /**
     * Creates a list of URLs from input array (and submits them to queue if asked for)
     * See Web > Info module script + "indexed_search"'s crawler hook-client using this!
     *
     * @param array $vv Information about URLs from pageRow to crawl.
     * @param array $pageRow Page row
     * @param int $scheduledTime Unix time to schedule indexing to, typically time()
     * @param int $reqMinute Number of requests per minute (creates the interleave between requests); values <= 0 disable the interleave
     * @param bool $submitCrawlUrls If set, submits the URLs to queue
     * @param bool $downloadCrawlUrls If set (and submitcrawlUrls is false) will fill $downloadUrls with entries)
     * @param array $duplicateTrack Array which is passed by reference and contains an id per url to ensure we do not crawl duplicates
     * @param array $downloadUrls Array which will be filled with URLS for download if flag is set.
     * @param array $incomingProcInstructions Array of processing instructions
     * @return string List of URLs (meant for display in backend module)
     */
    public function urlListFromUrlArray(
        array $vv,
        array $pageRow,
        $scheduledTime,
        $reqMinute,
        $submitCrawlUrls,
        $downloadCrawlUrls,
        array &$duplicateTrack,
        array &$downloadUrls,
        array $incomingProcInstructions
    ) {
        if (! is_array($vv['URLs'] ?? null)) {
            return 'ERROR - no URL generated';
        }
        $urlLog = [];
        $pageId = (int) $pageRow['uid'];
        $configurationHash = $this->getConfigurationHash($vv);
        $skipInnerCheck = $this->queueRepository->noUnprocessedQueueEntriesForPageWithConfigurationHashExist($pageId, $configurationHash);

        $urlService = new UrlService();

        // Sub configurations do not necessarily define these keys.
        $procInstrFilter = $vv['subCfg']['procInstrFilter'] ?? '';
        $userGroups = $vv['subCfg']['userGroups'] ?? '';

        // The filter result is the same for every URL of this configuration,
        // so it is evaluated once instead of per iteration.
        if (! $this->drawURLs_PIfilter($procInstrFilter, $incomingProcInstructions)) {
            return '';
        }

        // Guard against a division by zero when no request rate is given.
        $secondsBetweenRequests = $reqMinute > 0 ? 60 / $reqMinute : 0;

        foreach ($vv['URLs'] as $urlQuery) {
            $url = (string) $urlService->getUrlFromPageAndQueryParameters(
                $pageId,
                $urlQuery,
                $vv['subCfg']['baseUrl'] ?? null,
                $vv['subCfg']['force_ssl'] ?? 0
            );

            // Create key by which to determine unique-ness:
            $uKey = $url . '|' . $userGroups . '|' . $procInstrFilter;

            if (isset($duplicateTrack[$uKey])) {
                // if the url key is registered just display it and do not resubmit it
                $urlLog[] = '<em><span class="text-muted">' . htmlspecialchars($url) . '</span></em>';
            } else {
                // Scheduled time, rounded down to the full minute:
                $schTime = $scheduledTime + round(count($duplicateTrack) * $secondsBetweenRequests);
                $schTime = intval($schTime / 60) * 60;
                $formattedDate = BackendUtility::datetime($schTime);
                $this->urlList[] = '[' . $formattedDate . '] ' . $url;
                $urlList = '[' . $formattedDate . '] ' . htmlspecialchars($url);

                // Submit for crawling!
                if ($submitCrawlUrls) {
                    // NOTE(review): the queue entry receives $scheduledTime, not the
                    // interleaved $schTime shown in the list - confirm this is intended.
                    $added = $this->addUrl(
                        $pageId,
                        $url,
                        $vv['subCfg'],
                        $scheduledTime,
                        $configurationHash,
                        $skipInnerCheck
                    );
                    if ($added === false) {
                        $urlList .= ' (URL already existed)';
                    }
                } elseif ($downloadCrawlUrls) {
                    $downloadUrls[$url] = $url;
                }
                $urlLog[] = $urlList;
            }
            $duplicateTrack[$uKey] = true;
        }

        return implode('<br>', $urlLog);
    }
488
489
    /**
     * Tells whether at least one incoming processing instruction is contained
     * in the given comma-separated list. An empty set of incoming instructions
     * always passes.
     *
     * @param string $piString PI to test
     * @param array $incomingProcInstructions Processing instructions
     * @return boolean
     */
    public function drawURLs_PIfilter($piString, array $incomingProcInstructions)
    {
        // Nothing incoming means nothing to filter on.
        $accepted = $incomingProcInstructions === [];

        foreach ($incomingProcInstructions as $instruction) {
            if (GeneralUtility::inList($piString, $instruction)) {
                $accepted = true;
                break;
            }
        }

        return $accepted;
    }
509
510
    /**
     * Returns the page TSconfig for a page and lets registered hooks alter it.
     *
     * Without a mount point ($this->MP is falsy) the TSconfig of $id is loaded.
     * With a mount point, $this->MP is split on "-" and the TSconfig of the
     * second part is loaded instead.
     *
     * @param int $id Page id
     * @return array Page TSconfig, possibly modified by the "getPageTSconfigForId" hooks
     */
    public function getPageTSconfigForId($id): array
    {
        if (! $this->MP) {
            $pageTSconfig = BackendUtility::getPagesTSconfig($id);
        } else {
            // $this->MP is annotated as bool but used as a string here; the casts keep
            // explode() and getPagesTSconfig() working under strict_types, and the
            // fallback avoids an undefined offset when the value contains no "-".
            $mountPointParts = explode('-', (string) $this->MP);
            $mountPointId = (int) ($mountPointParts[1] ?? 0);
            $pageTSconfig = BackendUtility::getPagesTSconfig($mountPointId);
        }

        // Call a hook to alter configuration
        $hooks = $GLOBALS['TYPO3_CONF_VARS']['EXTCONF']['crawler']['getPageTSconfigForId'] ?? null;
        if (is_array($hooks)) {
            $params = [
                'pageId' => $id,
                // passed by reference so that hooks can modify the configuration
                'pageTSConfig' => &$pageTSconfig,
            ];
            foreach ($hooks as $userFunc) {
                GeneralUtility::callUserFunction($userFunc, $params, $this);
            }
        }

        return $pageTSconfig;
    }
532
533
    /**
     * This method returns an array of configurations.
     * Adds no urls!
     *
     * Configurations come from two sources, in this order:
     *  1. "tx_crawler.crawlerCfg.paramSets." in the page TSconfig (origin "pagets")
     *  2. tx_crawler_configuration records found up the rootline; a record never
     *     overwrites a configuration of the same name that is already present
     * Finally the EXTCONF "processUrls" hooks may modify the result by reference.
     *
     * @param int $pageId Page to collect the configurations for
     * @return array Configurations keyed by configuration name, each with the keys
     *               subCfg, paramParsed, paramExpanded, URLs and origin
     */
    public function getUrlsForPageId(int $pageId): array
    {
        // Get page TSconfig for page ID
        $pageTSconfig = $this->getPageTSconfigForId($pageId);

        $res = [];

        // Fetch Crawler Configuration from pageTSconfig
        $crawlerCfg = $pageTSconfig['tx_crawler.']['crawlerCfg.']['paramSets.'] ?? [];
        foreach ($crawlerCfg as $key => $values) {
            if (! is_array($values)) {
                continue;
            }
            $key = str_replace('.', '', $key);
            // Sub configuration for a single configuration string:
            $subCfg = (array) ($crawlerCfg[$key . '.'] ?? []);
            $subCfg['key'] = $key;

            if ((string) ($subCfg['procInstrFilter'] ?? '') !== '') {
                $subCfg['procInstrFilter'] = implode(',', GeneralUtility::trimExplode(',', $subCfg['procInstrFilter']));
            }

            // "pidsOnly" is optional, so read it with a fallback.
            $pidsOnly = (string) ($subCfg['pidsOnly'] ?? '');
            $pidOnlyList = implode(',', GeneralUtility::trimExplode(',', $pidsOnly, true));

            // process configuration if it is not page-specific or if the specific page is the current page:
            // TODO: Check if $pidOnlyList can be kept as Array instead of imploded
            if ($pidsOnly !== '' && ! GeneralUtility::inList($pidOnlyList, (string) $pageId)) {
                continue;
            }

            // Explode, process etc.:
            $paramParsed = GeneralUtility::explodeUrl2Array($crawlerCfg[$key] ?? '');
            $paramExpanded = $this->expandParameters($paramParsed, $pageId);

            // recognize MP value
            $baseQuery = '?id=' . $pageId;
            if ($this->MP) {
                $baseQuery .= '&MP=' . $this->MP;
            }

            $res[$key] = [
                'subCfg' => $subCfg,
                'paramParsed' => $paramParsed,
                'paramExpanded' => $paramExpanded,
                'origin' => 'pagets',
                'URLs' => $this->compileUrls($paramExpanded, [$baseQuery]),
            ];
        }

        // Get configuration from tx_crawler_configuration records up the rootline
        $crawlerConfigurations = $this->configurationRepository->getCrawlerConfigurationRecordsFromRootLine($pageId);
        foreach ($crawlerConfigurations as $configurationRecord) {
            // check access to the configuration record
            $hasAccess = empty($configurationRecord['begroups'])
                || $this->getBackendUser()->isAdmin()
                || $this->hasGroupAccess($this->getBackendUser()->user['usergroup_cached_list'], $configurationRecord['begroups']);
            if (! $hasAccess) {
                continue;
            }

            // Cast first: strcmp()-style checks on a null column would fail under strict_types.
            $pidsOnly = (string) ($configurationRecord['pidsonly'] ?? '');
            $pidOnlyList = implode(',', GeneralUtility::trimExplode(',', $pidsOnly, true));

            // process configuration if it is not page-specific or if the specific page is the current page:
            // TODO: Check if $pidOnlyList can be kept as Array instead of imploded
            if ($pidsOnly !== '' && ! GeneralUtility::inList($pidOnlyList, (string) $pageId)) {
                continue;
            }

            $key = $configurationRecord['name'];

            // don't overwrite previously defined paramSets
            if (isset($res[$key])) {
                continue;
            }

            /* @var $TSparserObject \TYPO3\CMS\Core\TypoScript\Parser\TypoScriptParser */
            $TSparserObject = GeneralUtility::makeInstance(TypoScriptParser::class);
            $TSparserObject->parse($configurationRecord['processing_instruction_parameters_ts']);

            $subCfg = [
                'procInstrFilter' => $configurationRecord['processing_instruction_filter'],
                'procInstrParams.' => $TSparserObject->setup,
                'baseUrl' => $configurationRecord['base_url'],
                'force_ssl' => (int) $configurationRecord['force_ssl'],
                'userGroups' => $configurationRecord['fegroups'],
                'exclude' => $configurationRecord['exclude'],
                'key' => $key,
            ];

            // Pages listed in the record's exclude string get no configuration.
            if (in_array($pageId, $this->expandExcludeString($subCfg['exclude']), true)) {
                continue;
            }

            $paramParsed = GeneralUtility::explodeUrl2Array($configurationRecord['configuration']);
            $paramExpanded = $this->expandParameters($paramParsed, $pageId);

            $res[$key] = [
                'subCfg' => $subCfg,
                'paramParsed' => $paramParsed,
                'paramExpanded' => $paramExpanded,
                'URLs' => $this->compileUrls($paramExpanded, ['?id=' . $pageId]),
                'origin' => 'tx_crawler_configuration_' . $configurationRecord['uid'],
            ];
        }

        foreach ($GLOBALS['TYPO3_CONF_VARS']['EXTCONF']['crawler']['processUrls'] ?? [] as $func) {
            $params = [
                'res' => &$res,
            ];
            GeneralUtility::callUserFunction($func, $params, $this);
        }

        return $res;
    }
631
632
    /**
     * Find all configurations of subpages of a page
     *
     * Collects the names of the page TSconfig paramSets of $rootid and the names of
     * all tx_crawler_configuration records stored on the rootline of $rootid or on
     * pages of the subtree below it (down to $depth levels).
     *
     * TODO: Write Functional Tests
     *
     * @param int $rootid Page id the branch starts at
     * @param int $depth Depth passed to the page tree
     * @return array List of configuration names (may contain duplicates)
     */
    public function getConfigurationsForBranch(int $rootid, int $depth): array
    {
        $configurationsForBranch = [];
        $pageTSconfig = $this->getPageTSconfigForId($rootid);
        $sets = $pageTSconfig['tx_crawler.']['crawlerCfg.']['paramSets.'] ?? [];
        foreach ($sets as $key => $value) {
            if (! is_array($value)) {
                continue;
            }
            // Array keys may be integers; substr() needs a string under strict_types.
            $key = (string) $key;
            $configurationsForBranch[] = substr($key, -1) === '.' ? substr($key, 0, -1) : $key;
        }

        $pids = [];
        $rootLine = BackendUtility::BEgetRootLine($rootid);
        foreach ($rootLine as $node) {
            $pids[] = (int) $node['uid'];
        }

        /* @var PageTreeView $tree */
        $tree = GeneralUtility::makeInstance(PageTreeView::class);
        $perms_clause = $this->getBackendUser()->getPagePermsClause(Permission::PAGE_SHOW);
        $tree->init(empty($perms_clause) ? '' : ('AND ' . $perms_clause));
        $tree->getTree($rootid, $depth, '');
        foreach ($tree->tree as $node) {
            $pids[] = (int) $node['row']['uid'];
        }

        // An empty id list would produce an invalid "pid IN ()" expression.
        if ($pids === []) {
            return $configurationsForBranch;
        }

        $queryBuilder = $this->getQueryBuilder('tx_crawler_configuration');
        $statement = $queryBuilder
            ->select('name')
            ->from('tx_crawler_configuration')
            ->where(
                $queryBuilder->expr()->in('pid', $queryBuilder->createNamedParameter($pids, Connection::PARAM_INT_ARRAY))
            )
            ->execute();

        while ($row = $statement->fetch()) {
            $configurationsForBranch[] = $row['name'];
        }

        return $configurationsForBranch;
    }
675
676
    /**
     * Decides whether a user may access an item based on group membership
     * (e.g. get the group list of the current logged in user from $GLOBALS['TSFE']->gr_list)
     *
     * @param string $groupList Comma-separated list of (fe_)group UIDs from a user
     * @param string $accessList Comma-separated list of (fe_)group UIDs of the item to access
     * @return bool TRUE if the access list is empty or contains at least one of the user's group UIDs
     * @see \TYPO3\CMS\Frontend\Page\PageRepository::getMultipleGroupsWhereClause()
     */
    public function hasGroupAccess($groupList, $accessList)
    {
        // An item without access restrictions is open to everybody.
        $granted = empty($accessList);

        if (! $granted) {
            foreach (GeneralUtility::intExplode(',', $groupList) as $userGroupUid) {
                if (GeneralUtility::inList($accessList, $userGroupUid)) {
                    $granted = true;
                    break;
                }
            }
        }

        return $granted;
    }
697
698
    /**
     * Expands the parameter configuration into individual values. The value of each parameter follows a certain syntax.
     * Syntax of values:
     * - Basically: If the value is wrapped in [...] it will be expanded according to the following syntax, otherwise the value is taken literally
     * - The configuration is split by "|"; the parts are processed individually and their results are added together
     * - For each configuration part:
     *         - "[int]-[int]" = Integer range, will be expanded to all values in between, values included, starting from low to high (max. 1000). Example "1-34" or "-40--30"
     *         - "_TABLE:[TCA table name];[_PID:[optional page id, default is current page]];[_ENABLELANG:1]" = Look up of table records from PID, filtering out deleted records. Example "_TABLE:tt_content; _PID:123"
     *           Further optional keys read by this method: _RECURSIVE, _PIDFIELD, _WHERE, _ADDTABLE and _FIELD (default "uid")
     *           _ENABLELANG:1 picks only original records without their language overlays
     *         - Default: Literal value
     *
     * After every processed part the hooks registered in
     * $GLOBALS['TYPO3_CONF_VARS']['SC_OPTIONS']['crawler/class.tx_crawler_lib.php']['expandParameters'] are called.
     *
     * @param array $paramArray Array with key (GET var name) and values (value of GET var which is configuration for expansion)
     * @param integer $pid Current page ID
     * @return array The input array with every value replaced by the array of its expanded values
     *
     * TODO: Write Functional Tests
     */
    public function expandParameters($paramArray, $pid)
    {
        // Traverse parameter names:
        foreach ($paramArray as $p => $v) {
            // Cast first: trim() rejects non-string scalars while strict_types is enabled
            $v = trim((string) $v);

            // Only a value encapsulated in square brackets describes ranges of values, otherwise the value is literal
            if (strpos($v, '[') !== 0 || substr($v, -1) !== ']') {
                // Set the literal value as only value in array:
                $paramArray[$p] = [$v];
                continue;
            }

            // Take the value inside the brackets and reset the paramArray value as an array.
            // The cast covers PHP 7, where substr() may return false for "[]".
            $v = (string) substr($v, 1, -1);
            $paramArray[$p] = [];

            // Explode parts and traverse them:
            foreach (explode('|', $v) as $pV) {
                $trimmedPart = trim($pV);

                // Look for integer range: (fx. 1-34 or -40--30 // reads minus 40 to minus 30)
                if (preg_match('/^(-?[0-9]+)\s*-\s*(-?[0-9]+)$/', $trimmedPart, $reg)) {
                    $reg = $this->swapIfFirstIsLargerThanSecond($reg);

                    // Traverse range, add values:
                    $runAwayBrake = 1000; // Limit to size of range!
                    for ($a = $reg[1]; $a <= $reg[2]; $a++) {
                        $paramArray[$p][] = $a;
                        $runAwayBrake--;
                        if ($runAwayBrake <= 0) {
                            break;
                        }
                    }
                } elseif (strpos($trimmedPart, '_TABLE:') === 0) {
                    // Parse parameters ("key:value" pairs separated by ";"):
                    $subpartParams = [];
                    foreach (GeneralUtility::trimExplode(';', $pV) as $spV) {
                        // Pad to two elements so a sub-part without ":" yields null instead of an undefined offset
                        [$pKey, $pVal] = array_pad(GeneralUtility::trimExplode(':', $spV), 2, null);
                        $subpartParams[$pKey] = $pVal;
                    }

                    // Table exists:
                    $tableName = (string) ($subpartParams['_TABLE'] ?? '');
                    if (isset($GLOBALS['TCA'][$tableName])) {
                        $lookUpPid = isset($subpartParams['_PID']) ? (int) $subpartParams['_PID'] : (int) $pid;
                        $recursiveDepth = isset($subpartParams['_RECURSIVE']) ? (int) $subpartParams['_RECURSIVE'] : 0;
                        $pidField = isset($subpartParams['_PIDFIELD']) ? trim($subpartParams['_PIDFIELD']) : 'pid';
                        $where = $subpartParams['_WHERE'] ?? '';
                        $addTable = $subpartParams['_ADDTABLE'] ?? '';

                        // _FIELD is optional, fall back to the uid column
                        $fieldName = ! empty($subpartParams['_FIELD']) ? $subpartParams['_FIELD'] : 'uid';

                        // Only "uid" or a column configured in TCA may be selected
                        if ($fieldName === 'uid' || ! empty($GLOBALS['TCA'][$tableName]['columns'][$fieldName])) {
                            $queryBuilder = $this->getQueryBuilder($tableName);

                            if ($recursiveDepth > 0) {
                                /** @var \TYPO3\CMS\Core\Database\QueryGenerator $queryGenerator */
                                $queryGenerator = GeneralUtility::makeInstance(\TYPO3\CMS\Core\Database\QueryGenerator::class);
                                $pidList = $queryGenerator->getTreeList($lookUpPid, $recursiveDepth, 0, 1);
                                $pidArray = GeneralUtility::intExplode(',', $pidList);
                            } else {
                                $pidArray = [(string) $lookUpPid];
                            }

                            // Drop the default restrictions, only deleted records are filtered out
                            $queryBuilder->getRestrictions()
                                ->removeAll()
                                ->add(GeneralUtility::makeInstance(DeletedRestriction::class));

                            $queryBuilder
                                ->select($fieldName)
                                ->from($tableName)
                                ->where(
                                    $queryBuilder->expr()->in($pidField, $queryBuilder->createNamedParameter($pidArray, Connection::PARAM_INT_ARRAY)),
                                    $where
                                );

                            if (! empty($addTable)) {
                                // TODO: Check if this works as intended!
                                $queryBuilder->add('from', $addTable);
                            }

                            // Not every table is translatable, so the ctrl key may be missing
                            $transOrigPointerField = $GLOBALS['TCA'][$tableName]['ctrl']['transOrigPointerField'] ?? '';

                            if (! empty($subpartParams['_ENABLELANG']) && $transOrigPointerField) {
                                $queryBuilder->andWhere(
                                    $queryBuilder->expr()->lte(
                                        $transOrigPointerField,
                                        0
                                    )
                                );
                            }

                            $statement = $queryBuilder->execute();

                            // Index by the selected field so every value is collected only once
                            $rows = [];
                            while ($row = $statement->fetch()) {
                                $rows[$row[$fieldName]] = $row;
                            }

                            $paramArray[$p] = array_merge($paramArray[$p], array_keys($rows));
                        }
                    }
                } else { // Just add value:
                    $paramArray[$p][] = $pV;
                }

                // Hook for processing own expandParameters place holder
                $hooks = $GLOBALS['TYPO3_CONF_VARS']['SC_OPTIONS']['crawler/class.tx_crawler_lib.php']['expandParameters'] ?? null;
                if (is_array($hooks)) {
                    $_params = [
                        'pObj' => &$this,
                        'paramArray' => &$paramArray,
                        'currentKey' => $p,
                        'currentValue' => $pV,
                        'pid' => $pid,
                    ];
                    foreach ($hooks as $_funcRef) {
                        GeneralUtility::callUserFunction($_funcRef, $_params, $this);
                    }
                }
            }

            // Make unique set of values and sort array by key:
            $paramArray[$p] = array_unique($paramArray[$p]);
            ksort($paramArray);
        }

        return $paramArray;
    }
843
844
    /**
     * Compiling URLs from parameter array (output of expandParameters())
     * The number of URLs will be the multiplication of the number of parameter values for each key,
     * capped at $this->maximumUrlsToCompile per recursion level.
     *
     * @param array $paramArray Output of expandParameters(): Array with keys (GET var names) and for each an array of values
     * @param array $urls URLs accumulated in this array (for recursion)
     * @return array The accumulated URLs, each extended by "&name=value" for every non-empty parameter value
     */
    public function compileUrls($paramArray, array $urls)
    {
        if (empty($paramArray)) {
            return $urls;
        }

        // Take the first parameter off the stack. unset() is used instead of array_shift(),
        // because array_shift() renumbers the remaining numeric keys (= GET var names).
        reset($paramArray);
        $varName = key($paramArray);
        $valueSet = $paramArray[$varName];
        unset($paramArray[$varName]);

        // key() returns an int for numeric keys, rawurlencode() needs a string under strict_types
        $encodedVarName = rawurlencode((string) $varName);

        // Traverse value set:
        $newUrls = [];
        foreach ($urls as $url) {
            foreach ($valueSet as $val) {
                // Leave both loops, otherwise every further base URL would add yet another entry
                if (count($newUrls) >= $this->maximumUrlsToCompile) {
                    break 2;
                }

                $val = (string) $val;
                // Empty values do not add a GET parameter, the URL is kept as it is
                $newUrls[] = $url . ($val !== '' ? '&' . $encodedVarName . '=' . rawurlencode($val) : '');
            }
        }

        return $this->compileUrls($paramArray, $newUrls);
    }
875
876
    /************************************
877
     *
878
     * Crawler log
879
     *
880
     ************************************/
881
882
    /**
     * Return array of records from crawler queue for input page ID
     *
     * @param integer $id Page ID for which to look up log entries.
     * @param string $filter Filter: "all" => all entries, "pending" => all that is not yet run, "finished" => all complete ones
     * @param boolean $doFlush If TRUE, the queue is flushed through the queue repository before the entries are read
     * @param boolean $doFullFlush If TRUE (and $doFlush is set), "all" is flushed instead of $filter
     * @param integer $itemsPerPage Limit the amount of entries per page default is 10
     * @return array Queue records of the page, newest "scheduled" first
     */
    public function getLogEntriesForPageId($id, $filter = '', $doFlush = false, $doFullFlush = false, $itemsPerPage = 10)
    {
        $queryBuilder = GeneralUtility::makeInstance(ConnectionPool::class)->getQueryBuilderForTable($this->tableName);
        $queryBuilder
            ->select('*')
            ->from($this->tableName)
            ->where(
                $queryBuilder->expr()->eq('page_id', $queryBuilder->createNamedParameter($id, \PDO::PARAM_INT))
            )
            ->orderBy('scheduled', 'DESC');

        // Any other filter value (e.g. "all") adds no further condition
        switch ($filter) {
            case 'pending':
                $queryBuilder->andWhere($queryBuilder->expr()->eq('exec_time', 0));
                break;
            case 'finished':
                $queryBuilder->andWhere($queryBuilder->expr()->gt('exec_time', 0));
                break;
        }

        if ($doFlush) {
            // NOTE: the page id is not handed to flushQueue(), only the filter is
            $this->queueRepository->flushQueue($doFullFlush ? 'all' : $filter);
        }

        if ($itemsPerPage > 0) {
            $queryBuilder->setMaxResults((int) $itemsPerPage);
        }

        return $queryBuilder->execute()->fetchAll();
    }
933
934
    /**
     * Return array of records from crawler queue for input set ID
     *
     * @param int $set_id Set ID for which to look up log entries.
     * @param string $filter Filter: "all" => all entries, "pending" => all that is not yet run, "finished" => all complete ones
     * @param bool $doFlush If TRUE, the matching entries are flushed instead of selected and an empty array is returned
     * @param bool $doFullFlush If TRUE (and $doFlush is set), flushQueue() is called with an empty condition
     * @param int $itemsPerPage Limit the amount of entries per page default is 10
     * @return array Queue records of the set, newest "scheduled" first; empty when $doFlush is set
     *
     * @deprecated
     */
    public function getLogEntriesForSetId(int $set_id, string $filter = '', bool $doFlush = false, bool $doFullFlush = false, int $itemsPerPage = 10)
    {
        if ($doFlush) {
            // The flush condition is an AND of the optional exec_time filter and the set id
            $expressionBuilder = GeneralUtility::makeInstance(ConnectionPool::class)
                ->getConnectionForTable($this->tableName)
                ->getExpressionBuilder();
            $flushCondition = $expressionBuilder->andX();
            if ($filter === 'pending') {
                $flushCondition->add($expressionBuilder->eq('exec_time', 0));
            } elseif ($filter === 'finished') {
                $flushCondition->add($expressionBuilder->gt('exec_time', 0));
            }
            $flushCondition->add($expressionBuilder->eq('set_id', $set_id));

            $this->flushQueue($doFullFlush ? '' : $flushCondition);

            return [];
        }

        $queryBuilder = GeneralUtility::makeInstance(ConnectionPool::class)->getQueryBuilderForTable($this->tableName);
        $queryBuilder
            ->select('*')
            ->from($this->tableName)
            ->where(
                $queryBuilder->expr()->eq('set_id', $queryBuilder->createNamedParameter($set_id, \PDO::PARAM_INT))
            )
            ->orderBy('scheduled', 'DESC');

        // Any other filter value (e.g. "all") adds no further condition
        if ($filter === 'pending') {
            $queryBuilder->andWhere($queryBuilder->expr()->eq('exec_time', 0));
        } elseif ($filter === 'finished') {
            $queryBuilder->andWhere($queryBuilder->expr()->gt('exec_time', 0));
        }

        if ($itemsPerPage > 0) {
            $queryBuilder->setMaxResults($itemsPerPage);
        }

        return $queryBuilder->execute()->fetchAll();
    }
986
987
    /**
     * Stores a call back entry in the crawler queue (typically called from hooks, see indexed search class "class.crawler.php")
     *
     * @param integer $setId Set ID
     * @param array $params Parameters to pass to call back function; the call back reference is added as "_CALLBACKOBJ"
     * @param string $callBack Call back object reference, eg. 'EXT:indexed_search/class.crawler.php:&tx_indexedsearch_crawler'
     * @param integer $page_id Page ID to attach it to
     * @param integer $schedule Time at which to activate; the current time is used when this is 0
     */
    public function addQueueEntry_callBack($setId, $params, $callBack, $page_id = 0, $schedule = 0): void
    {
        // Anything that is not an array is discarded and replaced by an empty parameter set
        $callBackParameters = is_array($params) ? $params : [];
        $callBackParameters['_CALLBACKOBJ'] = $callBack;

        $scheduledAt = (int) $schedule;
        if (! $scheduledAt) {
            $scheduledAt = $this->getCurrentTime();
        }

        $queueRecord = [
            'page_id' => (int) $page_id,
            'parameters' => json_encode($callBackParameters),
            'scheduled' => $scheduledAt,
            'exec_time' => 0,
            'set_id' => (int) $setId,
            'result_data' => '',
        ];

        $connection = GeneralUtility::makeInstance(ConnectionPool::class)->getConnectionForTable('tx_crawler_queue');
        $connection->insert('tx_crawler_queue', $queueRecord);
    }
1016
1017
    /************************************
1018
     *
1019
     * URL setting
1020
     *
1021
     ************************************/
1022
1023
    /**
     * Setting a URL for crawling:
     * Builds the queue record for the URL and either registers it internally only
     * ($this->registerQueueEntriesInternallyOnly) or inserts it into tx_crawler_queue, unless an equal entry
     * already exists. A signal is emitted for both the insert and the duplicate case.
     *
     * @param integer $id Page ID
     * @param string $url Complete URL
     * @param array $subCfg Sub configuration array (from TS config); the keys "userGroups", "procInstrFilter", "procInstrParams." and "key" are read
     * @param integer $tstamp Scheduled-time
     * @param string $configurationHash (optional) configuration hash
     * @param bool $skipInnerDuplicationCheck (optional) skip inner duplication check
     * @return bool TRUE only if a new record has been inserted into the queue table
     */
    public function addUrl(
        $id,
        $url,
        array $subCfg,
        $tstamp,
        $configurationHash = '',
        $skipInnerDuplicationCheck = false
    ) {
        // Creating parameters:
        $parameters = [
            'url' => $url,
        ];

        // fe user group simulation (the key is optional in the sub configuration):
        $uGs = implode(',', array_unique(GeneralUtility::intExplode(',', (string) ($subCfg['userGroups'] ?? ''), true)));
        if ($uGs) {
            $parameters['feUserGroupList'] = $uGs;
        }

        // Setting processing instructions
        $parameters['procInstructions'] = GeneralUtility::trimExplode(',', (string) ($subCfg['procInstrFilter'] ?? ''));
        if (is_array($subCfg['procInstrParams.'] ?? null)) {
            $parameters['procInstrParams'] = $subCfg['procInstrParams.'];
        }

        // Compile value array:
        $parameters_serialized = json_encode($parameters);
        $fieldArray = [
            'page_id' => (int) $id,
            'parameters' => $parameters_serialized,
            'parameters_hash' => GeneralUtility::shortMD5($parameters_serialized),
            'configuration_hash' => $configurationHash,
            'scheduled' => $tstamp,
            'exec_time' => 0,
            'set_id' => (int) $this->setID,
            'result_data' => '',
            'configuration' => $subCfg['key'] ?? null,
        ];

        if ($this->registerQueueEntriesInternallyOnly) {
            // the entries will only be registered and not stored to the database
            $this->queueEntries[] = $fieldArray;

            return false;
        }

        $rows = [];
        if (! $skipInnerDuplicationCheck) {
            // check if there is already an equal entry
            $rows = $this->getDuplicateRowsIfExist($tstamp, $fieldArray);
        }

        if (! empty($rows)) {
            // Nothing is inserted, listeners are only informed about the duplicate
            $signalPayload = ['rows' => $rows, 'fieldArray' => $fieldArray];
            SignalSlotUtility::emitSignal(
                self::class,
                SignalSlotUtility::SIGNAL_DUPLICATE_URL_IN_QUEUE,
                $signalPayload
            );

            return false;
        }

        $connectionForCrawlerQueue = GeneralUtility::makeInstance(ConnectionPool::class)->getConnectionForTable('tx_crawler_queue');
        $connectionForCrawlerQueue->insert(
            'tx_crawler_queue',
            $fieldArray
        );
        $uid = $connectionForCrawlerQueue->lastInsertId('tx_crawler_queue', 'qid');

        $signalPayload = ['uid' => $uid, 'fieldArray' => $fieldArray];
        SignalSlotUtility::emitSignal(
            self::class,
            SignalSlotUtility::SIGNAL_URL_ADDED_TO_QUEUE,
            $signalPayload
        );

        return true;
    }
1113
1114
    /**
     * Provides the current system time as returned by time()
     *
     * @return int Unix timestamp
     */
    public function getCurrentTime()
    {
        $now = time();

        return $now;
    }
1123
1124
    /************************************
1125
     *
1126
     * URL reading
1127
     *
1128
     ************************************/
1129
1130
    /**
     * Read URL for single queue entry
     * The entry is locked by setting its exec_time, executed through the queue executor and finally
     * updated with the JSON encoded result.
     *
     * @param integer $queueId
     * @param boolean $force If set, will process even if exec_time has been set!
     * @return integer|null Bit set with self::CLI_STATUS_POLLABLE_PROCESSED if a registered pollable reported success;
     *                      nothing (null) is returned when no matching queue entry was found
     */
    public function readUrl($queueId, $force = false)
    {
        $queryBuilder = GeneralUtility::makeInstance(ConnectionPool::class)->getQueryBuilderForTable($this->tableName);
        $ret = 0;
        $this->logger->debug('crawler-readurl start ' . microtime(true));
        // Get entry:
        $queryBuilder
            ->select('*')
            ->from('tx_crawler_queue')
            ->where(
                $queryBuilder->expr()->eq('qid', $queryBuilder->createNamedParameter($queueId, \PDO::PARAM_INT))
            );
        if (! $force) {
            // Without force only entries that are not executed yet but assigned to a process are picked up
            $queryBuilder
                ->andWhere('exec_time = 0')
                ->andWhere('process_scheduled > 0');
        }
        $queueRec = $queryBuilder->execute()->fetch();

        if (! is_array($queueRec)) {
            // Kept as a bare return for backwards compatibility with existing callers
            return;
        }

        SignalSlotUtility::emitSignal(
            self::class,
            SignalSlotUtility::SIGNAL_QUEUEITEM_PREPROCESS,
            [$queueId, &$queueRec]
        );

        // Set exec_time to lock record:
        $field_array = ['exec_time' => $this->getCurrentTime()];

        if (isset($this->processID)) {
            // if multiprocessing is used we need to store the id of the process which has handled this entry
            $field_array['process_id_completed'] = $this->processID;
        }

        GeneralUtility::makeInstance(ConnectionPool::class)->getConnectionForTable('tx_crawler_queue')
            ->update(
                'tx_crawler_queue',
                $field_array,
                ['qid' => (int) $queueId]
            );

        $result = $this->queueExecutor->executeQueueItem($queueRec, $this);

        // Decode the content only if there is any; $resultData stays null on errors so that it is
        // never indexed like an array while actually being a string.
        $resultData = null;
        $content = is_array($result) ? ($result['content'] ?? null) : null;
        if ($content !== null) {
            /** @var JsonCompatibilityConverter $jsonCompatibilityConverter */
            $jsonCompatibilityConverter = GeneralUtility::makeInstance(JsonCompatibilityConverter::class);
            $resultData = $jsonCompatibilityConverter->convert($content);
        }

        // atm there's no need to point to specific pollable extensions
        $pollables = $GLOBALS['TYPO3_CONF_VARS']['EXTCONF']['crawler']['pollSuccess'] ?? null;
        $procInstructions = is_array($resultData) ? ($resultData['parameters']['procInstructions'] ?? null) : null;
        if (is_array($pollables) && is_array($procInstructions)) {
            foreach ($pollables as $pollable) {
                // only check the success value if the instruction is running
                // it is important to name the pollSuccess key same as the procInstructions key
                if (in_array($pollable, $procInstructions, true) && ! empty($resultData['success'][$pollable])) {
                    $ret |= self::CLI_STATUS_POLLABLE_PROCESSED;
                }
            }
        }

        // Set result in log which also denotes the end of the processing of this entry.
        $field_array = ['result_data' => json_encode($result)];

        SignalSlotUtility::emitSignal(
            self::class,
            SignalSlotUtility::SIGNAL_QUEUEITEM_POSTPROCESS,
            [$queueId, &$field_array]
        );

        GeneralUtility::makeInstance(ConnectionPool::class)->getConnectionForTable('tx_crawler_queue')
            ->update(
                'tx_crawler_queue',
                $field_array,
                ['qid' => (int) $queueId]
            );

        $this->logger->debug('crawler-readurl stop ' . microtime(true));

        return $ret;
    }
1227
1228
    /**
     * Executes a queue entry that has not been stored yet: the entry is inserted
     * (already locked through its exec_time), executed and updated with the result.
     *
     * @param array $field_array Queue field array,
     *
     * @return array|bool|mixed|string Whatever the queue executor returned for the entry
     */
    public function readUrlFromArray($field_array)
    {
        $queueConnection = GeneralUtility::makeInstance(ConnectionPool::class)->getConnectionForTable($this->tableName);

        // The exec_time marks the record as taken right from the moment it is inserted
        $field_array['exec_time'] = $this->getCurrentTime();
        $queueConnection->insert($this->tableName, $field_array);

        $queueId = $queueConnection->lastInsertId($this->tableName, 'qid');
        $field_array['qid'] = $queueId;

        $result = $this->queueExecutor->executeQueueItem($field_array, $this);

        // Storing the result also denotes the end of the processing of this entry;
        // listeners of the signal may alter the fields before they are written.
        $resultFields = ['result_data' => json_encode($result)];
        SignalSlotUtility::emitSignal(
            self::class,
            SignalSlotUtility::SIGNAL_QUEUEITEM_POSTPROCESS,
            [$queueId, &$resultFields]
        );

        $queueConnection->update($this->tableName, $resultFields, ['qid' => $queueId]);

        return $result;
    }
1265
1266
    /*****************************
1267
     *
1268
     * Compiling URLs to crawl - tools
1269
     *
1270
     *****************************/
1271
1272
    /**
     * Collects the page tree below the given root page and compiles the output of
     * drawURLs_addRowsForPage() for every page in it, including pages reached through mount points.
     * The arguments are stored on the controller before the tree is traversed.
     *
     * @param integer $id Root page id to start from.
     * @param integer $depth Depth of tree, 0=only id-page, 1= on sublevel, 99 = infinite
     * @param integer $scheduledTime Unix Time when the URL is timed to be visited when put in queue
     * @param integer $reqMinute Number of requests per minute (creates the interleave between requests)
     * @param boolean $submitCrawlUrls If set, submits the URLs to queue in database (real crawling)
     * @param boolean $downloadCrawlUrls If set (and submitcrawlUrls is false) will fill $downloadUrls with entries)
     * @param array $incomingProcInstructions Array of processing instructions
     * @param array $configurationSelection Array of configuration keys
     * @return string Concatenated return values of drawURLs_addRowsForPage() for all traversed pages
     */
    public function getPageTreeAndUrls(
        $id,
        $depth,
        $scheduledTime,
        $reqMinute,
        $submitCrawlUrls,
        $downloadCrawlUrls,
        array $incomingProcInstructions,
        array $configurationSelection
    ) {
        $this->scheduledTime = $scheduledTime;
        $this->reqMinute = $reqMinute;
        $this->submitCrawlUrls = $submitCrawlUrls;
        $this->downloadCrawlUrls = $downloadCrawlUrls;
        $this->incomingProcInstructions = $incomingProcInstructions;
        $this->incomingConfigurationSelection = $configurationSelection;

        // Start with clean trackers for this run
        $this->duplicateTrack = [];
        $this->downloadUrls = [];

        // Drawing tree:
        /* @var PageTreeView $tree */
        $tree = GeneralUtility::makeInstance(PageTreeView::class);
        $perms_clause = $this->getBackendUser()->getPagePermsClause(Permission::PAGE_SHOW);
        $tree->init('AND ' . $perms_clause);

        $pageInfo = BackendUtility::readPageAccess($id, $perms_clause);
        if (is_array($pageInfo)) {
            // Set root row:
            $tree->tree[] = [
                'row' => $pageInfo,
                'HTML' => $this->iconFactory->getIconForRecord('pages', $pageInfo, Icon::SIZE_SMALL),
            ];
        }

        // Get branch beneath:
        if ($depth) {
            $tree->getTree($id, $depth, '');
        }

        // Traverse page tree:
        $code = '';

        foreach ($tree->tree as $data) {
            $this->MP = false;

            // recognize mount points
            // The cast is needed because database drivers may return the doktype as a string,
            // which never strictly equals the integer constant.
            if ((int) $data['row']['doktype'] === PageRepository::DOKTYPE_MOUNTPOINT) {
                $queryBuilder = GeneralUtility::makeInstance(ConnectionPool::class)->getQueryBuilderForTable('pages');
                $queryBuilder->getRestrictions()->removeAll()->add(GeneralUtility::makeInstance(DeletedRestriction::class));
                $mountpage = $queryBuilder
                    ->select('*')
                    ->from('pages')
                    ->where(
                        $queryBuilder->expr()->eq('uid', $queryBuilder->createNamedParameter($data['row']['uid'], \PDO::PARAM_INT))
                    )
                    ->execute()
                    ->fetchAll();
                $queryBuilder->resetRestrictions();

                // fetch mounted pages; MP has the form "<mount_pid>-<uid of the mount point page>"
                $this->MP = $mountpage[0]['mount_pid'] . '-' . $data['row']['uid'];

                $mountTree = GeneralUtility::makeInstance(PageTreeView::class);
                $mountTree->init('AND ' . $perms_clause);
                $mountTree->getTree($mountpage[0]['mount_pid'], $depth);

                foreach ($mountTree->tree as $mountData) {
                    $code .= $this->drawURLs_addRowsForPage(
                        $mountData['row'],
                        $mountData['HTML'] . BackendUtility::getRecordTitle('pages', $mountData['row'], true)
                    );
                }

                // replace page when mount_pid_ol is enabled
                if ($mountpage[0]['mount_pid_ol']) {
                    $data['row']['uid'] = $mountpage[0]['mount_pid'];
                } else {
                    // if the mount_pid_ol is not set the MP must not be used for the mountpoint page
                    $this->MP = false;
                }
            }

            $code .= $this->drawURLs_addRowsForPage(
                $data['row'],
                $data['HTML'] . BackendUtility::getRecordTitle('pages', $data['row'], true)
            );
        }

        return $code;
    }
1374
1375
    /**
     * Expands an exclude string into the list of page uids it covers.
     *
     * The string is a comma separated list of entries of the form "pid" or "pid+depth":
     * - "pid"    adds only that page (depth 0)
     * - "pid+"   adds the page and its subtree, fetched with depth 99
     * - "pid+N"  adds the page and its subtree, fetched with depth N
     *
     * Results are memoized per exclude string, and fetched subtrees per pid/depth,
     * in function-static caches.
     *
     * @param string $excludeString Exclude string
     * @return array Unique page uids as integers
     */
    public function expandExcludeString($excludeString)
    {
        // internal static caches
        static $expandedExcludeStringCache = [];
        static $treeCache = [];

        $cacheKey = (string) $excludeString;

        // isset() rather than empty(): an empty pid list is a valid cached result, too
        if (isset($expandedExcludeStringCache[$cacheKey])) {
            return $expandedExcludeStringCache[$cacheKey];
        }

        $pidList = [];

        if (! empty($excludeString)) {
            /** @var PageTreeView $tree */
            $tree = GeneralUtility::makeInstance(PageTreeView::class);
            $tree->init('AND ' . $this->getBackendUser()->getPagePermsClause(Permission::PAGE_SHOW));

            foreach (GeneralUtility::trimExplode(',', $excludeString) as $excludePart) {
                // array_pad() guarantees two elements, so an entry without "+" does not
                // read an undefined offset while destructuring
                [$pid, $depth] = array_pad(GeneralUtility::trimExplode('+', $excludePart), 2, '');

                // default is "page only" = "depth=0"; a "+" without a usable number means depth 99
                if (empty($depth)) {
                    $depth = strpos($excludePart, '+') !== false ? 99 : 0;
                }

                // work with integers from here on: getTree() expects an integer uid and
                // the depth is compared numerically
                $pid = (int) $pid;
                $depth = (int) $depth;

                $pidList[] = $pid;

                if ($depth > 0) {
                    if (empty($treeCache[$pid][$depth])) {
                        $tree->reset();
                        $tree->getTree($pid, $depth);
                        $treeCache[$pid][$depth] = $tree->tree;
                    }

                    foreach ($treeCache[$pid][$depth] as $data) {
                        $pidList[] = (int) $data['row']['uid'];
                    }
                }
            }
        }

        $expandedExcludeStringCache[$cacheKey] = array_unique($pidList);

        return $expandedExcludeStringCache[$cacheKey];
    }
1426
1427
    /**
     * Create the rows for display of the page tree
     * For each page a number of rows are shown displaying GET variable configuration
     *
     * @param array $pageRow Page record; its "uid" is checked against each configuration's exclude list
     * @param string $pageTitle Markup for the title cell; it is inserted into the output unescaped
     * @return string One or more HTML table rows for the given page
     */
    public function drawURLs_addRowsForPage(array $pageRow, string $pageTitle): string
    {
        $skipMessage = '';

        // Get list of configurations
        // NOTE(review): $skipMessage is presumably filled by reference in getUrlsForPageRow() - confirm
        $configurations = $this->getUrlsForPageRow($pageRow, $skipMessage);

        if (! empty($this->incomingConfigurationSelection)) {
            // remove configuration that does not match the current selection
            foreach ($configurations as $confKey => $confArray) {
                if (! in_array($confKey, $this->incomingConfigurationSelection, true)) {
                    unset($configurations[$confKey]);
                }
            }
        }

        // Nothing to show for this page: emit a single "No entries" row
        if (empty($configurations)) {
            $message = ! empty($skipMessage) ? ' (' . $skipMessage . ')' : '';

            return '
                <tr>
                    <td>' . $pageTitle . '</td>
                    <td colspan="6"><em>No entries</em>' . $message . '</td>
                </tr>';
        }

        $content = '';
        $rowSpan = count($configurations);
        $isFirstRow = true;

        // expandExcludeString() returns integers and the lookup below is strict,
        // so the uid has to be an integer as well (it may arrive as a string)
        $pageUid = (int) $pageRow['uid'];

        // Traverse parameter combinations:
        foreach ($configurations as $confKey => $confArray) {
            // Title column: only the first row carries it, spanning all configuration rows
            $titleClm = $isFirstRow ? '<td rowspan="' . $rowSpan . '">' . $pageTitle . '</td>' : '';
            $isFirstRow = false;

            // Array keys that look numeric are integers; htmlspecialchars() needs a string under strict_types
            $confKeyEscaped = htmlspecialchars((string) $confKey);

            $subCfg = $confArray['subCfg'] ?? [];

            if (in_array($pageUid, $this->expandExcludeString($subCfg['exclude'] ?? ''), true)) {
                $content .= '<tr>
                            ' . $titleClm . '
                            <td>' . $confKeyEscaped . '</td>
                            <td colspan="5"><em>No entries</em> (Page is excluded in this configuration)</td>
                        </tr>';
                continue;
            }

            // URL list:
            $urlList = $this->urlListFromUrlArray(
                $confArray,
                $pageRow,
                $this->scheduledTime,
                $this->reqMinute,
                $this->submitCrawlUrls,
                $this->downloadCrawlUrls,
                $this->duplicateTrack,
                $this->downloadUrls,
                $this->incomingProcInstructions // if empty the urls won't be filtered by processing instructions
            );

            // Expanded parameters:
            $paramExpanded = '';
            $calcAccu = [];
            $calcRes = 1;
            foreach ($confArray['paramExpanded'] ?? [] as $gVar => $gVal) {
                $valueCount = count($gVal);
                $paramExpanded .= '
                            <tr>
                                <td>' . htmlspecialchars('&' . $gVar . '=') . '<br/>' .
                    '(' . $valueCount . ')' .
                    '</td>
                                <td nowrap="nowrap">' . nl2br(htmlspecialchars(implode(chr(10), $gVal))) . '</td>
                            </tr>
                        ';
                $calcRes *= $valueCount;
                $calcAccu[] = $valueCount;
            }
            $paramExpanded = '<table>' . $paramExpanded . '</table>';
            $paramExpanded .= 'Comb: ' . implode('*', $calcAccu) . '=' . $calcRes;

            // Options
            $optionValues = '';
            if (! empty($subCfg['userGroups'])) {
                $optionValues .= 'User Groups: ' . $subCfg['userGroups'] . '<br/>';
            }
            if (! empty($subCfg['procInstrFilter'])) {
                $optionValues .= 'ProcInstr: ' . $subCfg['procInstrFilter'] . '<br/>';
            }

            // Compile row:
            $content .= '
                        <tr>
                            ' . $titleClm . '
                            <td>' . $confKeyEscaped . '</td>
                            <td>' . nl2br(htmlspecialchars(rawurldecode(trim(str_replace('&', chr(10) . '&', GeneralUtility::implodeArrayForUrl('', $confArray['paramParsed'])))))) . '</td>
                            <td>' . $paramExpanded . '</td>
                            <td nowrap="nowrap">' . $urlList . '</td>
                            <td nowrap="nowrap">' . $optionValues . '</td>
                            <td nowrap="nowrap">' . DebugUtility::viewArray($subCfg['procInstrParams.'] ?? null) . '</td>
                        </tr>';
        }

        return $content;
    }
1537
1538
    /*****************************
1539
     *
1540
     * CLI functions
1541
     *
1542
     *****************************/
1543
1544
    /**
     * Running the functionality of the CLI (crawling URLs from queue)
     *
     * @param int $countInARun Handed to QueueRepository::fetchRecordsToBeCrawled() to select the batch
     * @param int $sleepTime Value passed to usleep() after every processed queue entry
     * @param int $sleepAfterFinish Value passed to sleep() once the batch loop has ended
     * @return int Bitmask combining the readUrl() results with the CLI_STATUS_* flags
     */
    public function CLI_run(int $countInARun, int $sleepTime, int $sleepAfterFinish): int
    {
        $status = 0;
        $processedCount = 0;

        // First, run hooks:
        $this->CLI_runHooks();

        // Clean up the queue
        $this->queueRepository->cleanupQueue();

        // Select entries:
        $queueRows = $this->queueRepository->fetchRecordsToBeCrawled($countInARun);

        if (empty($queueRows)) {
            $this->CLI_debug('Nothing within queue which needs to be processed (' . $this->CLI_buildProcessId() . ')');

            return $status;
        }

        $queueIds = [];
        foreach ($queueRows as $queueRow) {
            $queueIds[] = $queueRow['qid'];
        }

        $processId = $this->CLI_buildProcessId();

        // save the number of assigned queue entries to determine how many have been processed later
        $assignedCount = $this->queueRepository->updateProcessIdAndSchedulerForQueueIds($queueIds, $processId);
        $this->processRepository->updateProcessAssignItemsCount($assignedCount, $processId);

        // Not every selected entry could be assigned to this process: give up on this run
        if ($assignedCount !== count($queueIds)) {
            $this->CLI_debug('Nothing processed due to multi-process collision (' . $this->CLI_buildProcessId() . ')');

            return $status | self::CLI_STATUS_ABORTED;
        }

        foreach ($queueRows as $queueRow) {
            $status |= $this->readUrl($queueRow['qid']);
            $processedCount++;

            // Just to relax the system
            usleep($sleepTime);

            // if the cli has been disabled between the start and the current read url we return
            // right away; the process is NOT marked as ended
            if ($this->getDisabled()) {
                return $status | self::CLI_STATUS_ABORTED;
            }

            if (! $this->processRepository->isProcessActive($this->CLI_buildProcessId())) {
                $this->CLI_debug('conflict / timeout (' . $this->CLI_buildProcessId() . ')');
                $status |= self::CLI_STATUS_ABORTED;
                // possible timeout
                break;
            }
        }

        sleep($sleepAfterFinish);

        $this->CLI_debug('Rows: ' . $processedCount . ' (' . $this->CLI_buildProcessId() . ')');

        if ($processedCount > 0) {
            $status |= self::CLI_STATUS_PROCESSED;
        }

        return $status;
    }
1612
1613
    /**
     * Activate hooks
     *
     * Instantiates every class registered in
     * $GLOBALS['TYPO3_CONF_VARS']['EXTCONF']['crawler']['cli_hooks'] and calls its
     * crawler_init() method with this controller.
     */
    public function CLI_runHooks(): void
    {
        $registeredHooks = $GLOBALS['TYPO3_CONF_VARS']['EXTCONF']['crawler']['cli_hooks'] ?? [];

        foreach ($registeredHooks as $hookClassName) {
            $hook = GeneralUtility::makeInstance($hookClassName);
            if (! is_object($hook)) {
                continue;
            }

            $hook->crawler_init($this);
        }
    }
1625
1626
    /**
     * Try to acquire a new process with the given id
     * also performs some auto-cleanup for orphan processes
     *
     * Active, non-deleted process records whose ttl is lower than the current time count as
     * orphans and are released; all others count against the configured "processLimit".
     *
     * @param string $id identification string for the process
     * @return bool true if a process record was inserted, false if the limit is reached
     *              or no system process id is available
     * @todo preemption might not be the most elegant way to clean up
     */
    public function CLI_checkAndAcquireNewProcess($id)
    {
        $queryBuilder = GeneralUtility::makeInstance(ConnectionPool::class)->getQueryBuilderForTable($this->tableName);

        $systemProcessId = getmypid();
        if ($systemProcessId < 1) {
            return false;
        }

        $activeProcesses = $queryBuilder
            ->select('process_id', 'ttl')
            ->from('tx_crawler_process')
            ->where('active = 1 AND deleted = 0')
            ->execute();

        $currentTime = $this->getCurrentTime();
        $processLimit = (int) $this->extensionSettings['processLimit'];

        $runningProcesses = 0;
        $orphanProcessIds = [];
        while ($process = $activeProcesses->fetch()) {
            if ($process['ttl'] < $currentTime) {
                $orphanProcessIds[] = $process['process_id'];
                continue;
            }

            $runningProcesses++;
        }

        // if there are less than allowed active processes then add a new one
        $acquired = $runningProcesses < $processLimit;

        if ($acquired) {
            $this->CLI_debug('add process ' . $this->CLI_buildProcessId() . ' (' . ($runningProcesses + 1) . '/' . $processLimit . ')');

            GeneralUtility::makeInstance(ConnectionPool::class)
                ->getConnectionForTable('tx_crawler_process')
                ->insert(
                    'tx_crawler_process',
                    [
                        'process_id' => $id,
                        'active' => 1,
                        'ttl' => $currentTime + (int) $this->extensionSettings['processMaxRunTime'],
                        'system_process_id' => $systemProcessId,
                    ]
                );
        } else {
            $this->CLI_debug('Processlimit reached (' . ($runningProcesses) . '/' . $processLimit . ')');
        }

        // The cleanup runs regardless of whether a process slot was acquired
        $this->processRepository->deleteProcessesMarkedAsDeleted();
        $this->CLI_releaseProcesses($orphanProcessIds);

        return $acquired;
    }
1687
1688
    /**
     * Release a process and the required resources
     *
     * @param mixed $releaseIds string with a single process-id or array with multiple process-ids
     * @return bool false if there was nothing to release, true otherwise
     */
    public function CLI_releaseProcesses($releaseIds)
    {
        $queryBuilder = GeneralUtility::makeInstance(ConnectionPool::class)->getQueryBuilderForTable($this->tableName);

        $releaseIds = is_array($releaseIds) ? $releaseIds : [$releaseIds];

        if (count($releaseIds) === 0) {
            // nothing to release
            return false;
        }

        // some kind of 2nd chance algo - this way you need at least 2 processes to have a real cleanup
        // this ensures that a single process can't mess up the entire process table

        // detach queue entries from processes which are not active (anymore)
        $queryBuilder
            ->update($this->tableName, 'q')
            ->set('q.process_scheduled', 0)
            ->set('q.process_id', '')
            ->where(
                'q.process_id IN(SELECT p.process_id FROM tx_crawler_process as p WHERE p.active = 0)'
            )
            ->execute();

        // FIXME: Not entirely sure that this is equivalent to the previous version
        // The builder is reused for the next statement, so the "set" parts from above are dropped first
        $queryBuilder->resetQueryPart('set');

        // reset the system process id of inactive processes that still have queue entries with exec_time = 0
        $queryBuilder
            ->update('tx_crawler_process')
            ->set('system_process_id', 0)
            ->where(
                $queryBuilder->expr()->eq('active', 0),
                'process_id IN(SELECT q.process_id FROM tx_crawler_queue as q WHERE q.exec_time = 0)'
            )
            ->execute();

        $this->processRepository->markRequestedProcessesAsNotActive($releaseIds);
        $this->queueRepository->unsetProcessScheduledAndProcessIdForQueueEntries($releaseIds);

        return true;
    }
1737
1738
    /**
     * Create a unique Id for the current process
     *
     * The id is generated once from the current microtime and then kept in $this->processID,
     * so every later call returns the same value.
     *
     * @return string  the ID
     */
    public function CLI_buildProcessId()
    {
        if (! $this->processID) {
            // microtime(true) yields a float while shortMD5() hashes a string: cast explicitly
            // instead of relying on implicit conversion
            $this->processID = GeneralUtility::shortMD5((string) microtime(true));
        }

        return $this->processID;
    }
1750
1751
    /**
     * Prints a message to the stdout (only if debug-mode is enabled)
     *
     * Debug-mode is read from the "processDebug" extension setting.
     *
     * @param string $msg the message
     */
    public function CLI_debug($msg): void
    {
        $debugEnabled = (bool) (int) $this->extensionSettings['processDebug'];
        if (! $debugEnabled) {
            return;
        }

        echo $msg . "\n";
        flush();
    }
1763
1764
    /**
     * Cleans up entries that stayed for too long in the queue. These are:
     * - processed entries older than the "cleanUpProcessedAge" setting (in days)
     * - scheduled entries older than the "cleanUpScheduledAge" setting (in days)
     *
     * @deprecated
     */
    public function cleanUpOldQueueEntries(): void
    {
        $secondsPerDay = 86400; // 24*60*60 Seconds in 24 hours

        // Settings may arrive as strings and may be fractional (e.g. "1.5" days). Normalise them
        // to whole seconds so that only integers end up in the SQL condition below.
        $processedAgeInSeconds = (int) round((float) ($this->extensionSettings['cleanUpProcessedAge'] ?? 0) * $secondsPerDay);
        $scheduledAgeInSeconds = (int) round((float) ($this->extensionSettings['cleanUpScheduledAge'] ?? 0) * $secondsPerDay);

        $now = time();
        $condition = '(exec_time<>0 AND exec_time<' . ($now - $processedAgeInSeconds) . ') OR scheduled<=' . ($now - $scheduledAgeInSeconds);

        // flushQueue() is deprecated as well; it is only called from this deprecated method here
        /** @scrutinizer ignore-deprecated */
        $this->flushQueue($condition);
    }
1780
1781
    /**
     * Removes queue entries
     *
     * Before the entries are deleted, SIGNAL_QUEUE_ENTRY_FLUSH is emitted once per affected
     * set_id with the matching rows (qid, set_id) as "subSet" payload.
     *
     * @param string $where SQL related filter for the entries which should be removed;
     *                      an empty value removes all entries
     *
     * @deprecated
     */
    protected function flushQueue($where = ''): void
    {
        $realWhere = (string) $where !== '' ? $where : '1=1';

        // Every statement gets its own query builder so that no query parts leak from one
        // statement into the next
        $groups = $this->getQueryBuilder($this->tableName)
            ->selectLiteral('DISTINCT set_id')
            ->from($this->tableName)
            ->where($realWhere)
            ->execute()
            ->fetchAll();

        foreach ($groups as $group) {
            $subSetQuery = $this->getQueryBuilder($this->tableName);
            $subSet = $subSetQuery
                ->select('qid', 'set_id')
                ->from($this->tableName)
                ->where(
                    $realWhere,
                    $subSetQuery->expr()->eq(
                        'set_id',
                        $subSetQuery->createNamedParameter($group['set_id'], \PDO::PARAM_INT)
                    )
                )
                ->execute()
                ->fetchAll();

            $payLoad = ['subSet' => $subSet];
            SignalSlotUtility::emitSignal(
                self::class,
                SignalSlotUtility::SIGNAL_QUEUE_ENTRY_FLUSH,
                $payLoad
            );
        }

        $this->getQueryBuilder($this->tableName)
            ->delete($this->tableName)
            ->where($realWhere)
            ->execute();
    }
1826
1827
    /**
     * This method determines duplicates for a queue entry with the same parameters and this timestamp.
     * If the timestamp is not in the future, it looks for queue entries scheduled up to now (and, with
     * the "enableTimeslot" setting, within 100 seconds around now).
     * If the timestamp is in the future, the queued entry must have exactly the same timestamp.
     * In both cases only rows matching the 'NOT exec_time' and 'NOT process_id' conditions and having
     * the same page_id and parameters_hash are returned.
     *
     * @param int $tstamp
     * @param array $fieldArray Must provide "page_id" and "parameters_hash"
     *
     * @return array List of qid values of the duplicates found
     */
    protected function getDuplicateRowsIfExist($tstamp, $fieldArray)
    {
        $now = $this->getCurrentTime();

        $query = GeneralUtility::makeInstance(ConnectionPool::class)->getQueryBuilderForTable($this->tableName);
        $query
            ->select('qid')
            ->from('tx_crawler_queue');

        if ($tstamp <= $now) {
            // this entry is scheduled with "now"
            if ($this->extensionSettings['enableTimeslot']) {
                $windowStart = $now - 100;
                $windowEnd = $now + 100;
                $query
                    ->where('scheduled BETWEEN ' . $windowStart . ' AND ' . $windowEnd . '')
                    ->orWhere($query->expr()->lte('scheduled', $now));
            } else {
                $query->where($query->expr()->lte('scheduled', $now));
            }
        } elseif ($tstamp > $now) {
            // entry with a timestamp in the future need to have the same schedule time
            $query->where($query->expr()->eq('scheduled', $tstamp));
        }

        $pageIdParameter = $query->createNamedParameter($fieldArray['page_id'], \PDO::PARAM_INT);
        $hashParameter = $query->createNamedParameter($fieldArray['parameters_hash'], \PDO::PARAM_STR);

        $query
            ->andWhere('NOT exec_time')
            ->andWhere('NOT process_id')
            ->andWhere($query->expr()->eq('page_id', $pageIdParameter))
            ->andWhere($query->expr()->eq('parameters_hash', $hashParameter));

        $result = $query->execute();

        $duplicateQueueIds = [];
        while ($duplicate = $result->fetch()) {
            $duplicateQueueIds[] = $duplicate['qid'];
        }

        return $duplicateQueueIds;
    }
1887
1888
    /**
     * Returns a md5 hash generated from a serialized configuration array.
     *
     * The "paramExpanded" and "URLs" entries are removed first and therefore do not
     * influence the hash.
     *
     * @return string
     */
    protected function getConfigurationHash(array $configuration)
    {
        unset($configuration['paramExpanded'], $configuration['URLs']);

        $serializedConfiguration = serialize($configuration);

        return md5($serializedConfiguration);
    }
1899
1900
    /**
     * Build a URL from a Page and the Query String. If the page has a Site configuration, it can be built by using
     * the Site instance.
     *
     * This is a thin wrapper that forwards all arguments to UrlService::getUrlFromPageAndQueryParameters().
     *
     * @param int $httpsOrHttp see tx_crawler_configuration.force_ssl
     * @throws \TYPO3\CMS\Core\Exception\SiteNotFoundException
     * @throws \TYPO3\CMS\Core\Routing\InvalidRouteArgumentsException
     *
     * @deprecated Using CrawlerController::getUrlFromPageAndQueryParameters() is deprecated since 9.1.1 and will be removed in v11.x, please use UrlService->getUrlFromPageAndQueryParameters() instead.
     */
    protected function getUrlFromPageAndQueryParameters(int $pageId, string $queryString, ?string $alternativeBaseUrl, int $httpsOrHttp): UriInterface
    {
        return (new UrlService())->getUrlFromPageAndQueryParameters(
            $pageId,
            $queryString,
            $alternativeBaseUrl,
            $httpsOrHttp
        );
    }
1915
1916
    /**
     * Makes sure that $reg[1] is not larger than $reg[2] by swapping the two entries if needed.
     * All other entries of the array are returned untouched.
     */
    protected function swapIfFirstIsLargerThanSecond(array $reg): array
    {
        if ($reg[1] <= $reg[2]) {
            // Already in order
            return $reg;
        }

        [$reg[1], $reg[2]] = [$reg[2], $reg[1]];

        return $reg;
    }
1927
1928
    /**
     * Returns the backend user, taking it from $GLOBALS['BE_USER'] on first use and
     * keeping it in $this->backendUser afterwards.
     *
     * @return BackendUserAuthentication
     */
    private function getBackendUser()
    {
        // Make sure the _cli_ user is loaded
        Bootstrap::initializeBackendAuthentication();

        if ($this->backendUser !== null) {
            return $this->backendUser;
        }

        $this->backendUser = $GLOBALS['BE_USER'];

        return $this->backendUser;
    }
1940
1941
    /**
     * Get querybuilder for given table
     *
     * A new builder is requested from the ConnectionPool on every call.
     *
     * @return \TYPO3\CMS\Core\Database\Query\QueryBuilder
     */
    private function getQueryBuilder(string $table)
    {
        $connectionPool = GeneralUtility::makeInstance(ConnectionPool::class);

        return $connectionPool->getQueryBuilderForTable($table);
    }
1950
}
1951