Passed
Push — testing/behaviour-104 ( 344e70...67b3c9 )
by Tomas Norre
36:59 queued 19:28
created

CrawlerController::getBackendUser()   A

Complexity

Conditions 2
Paths 2

Size

Total Lines 8
Code Lines 4

Duplication

Lines 0
Ratio 0 %

Importance

Changes 1
Bugs 0 Features 0
Metric Value
cc 2
eloc 4
c 1
b 0
f 0
nc 2
nop 0
dl 0
loc 8
rs 10
1
<?php
2
3
declare(strict_types=1);
4
5
namespace AOE\Crawler\Controller;
6
7
/***************************************************************
8
 *  Copyright notice
9
 *
10
 *  (c) 2020 AOE GmbH <[email protected]>
11
 *
12
 *  All rights reserved
13
 *
14
 *  This script is part of the TYPO3 project. The TYPO3 project is
15
 *  free software; you can redistribute it and/or modify
16
 *  it under the terms of the GNU General Public License as published by
17
 *  the Free Software Foundation; either version 3 of the License, or
18
 *  (at your option) any later version.
19
 *
20
 *  The GNU General Public License can be found at
21
 *  http://www.gnu.org/copyleft/gpl.html.
22
 *
23
 *  This script is distributed in the hope that it will be useful,
24
 *  but WITHOUT ANY WARRANTY; without even the implied warranty of
25
 *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
26
 *  GNU General Public License for more details.
27
 *
28
 *  This copyright notice MUST APPEAR in all copies of the script!
29
 ***************************************************************/
30
31
use AOE\Crawler\Configuration\ExtensionConfigurationProvider;
32
use AOE\Crawler\Converter\JsonCompatibilityConverter;
33
use AOE\Crawler\CrawlStrategy\CrawlStrategyFactory;
34
use AOE\Crawler\Domain\Repository\ConfigurationRepository;
35
use AOE\Crawler\Domain\Repository\ProcessRepository;
36
use AOE\Crawler\Domain\Repository\QueueRepository;
37
use AOE\Crawler\QueueExecutor;
38
use AOE\Crawler\Service\UrlService;
39
use AOE\Crawler\Utility\SignalSlotUtility;
40
use Psr\Http\Message\UriInterface;
41
use Psr\Log\LoggerAwareInterface;
42
use Psr\Log\LoggerAwareTrait;
43
use TYPO3\CMS\Backend\Tree\View\PageTreeView;
44
use TYPO3\CMS\Backend\Utility\BackendUtility;
45
use TYPO3\CMS\Core\Authentication\BackendUserAuthentication;
46
use TYPO3\CMS\Core\Compatibility\PublicMethodDeprecationTrait;
47
use TYPO3\CMS\Core\Core\Bootstrap;
48
use TYPO3\CMS\Core\Core\Environment;
49
use TYPO3\CMS\Core\Database\Connection;
50
use TYPO3\CMS\Core\Database\ConnectionPool;
51
use TYPO3\CMS\Core\Database\Query\Restriction\DeletedRestriction;
52
use TYPO3\CMS\Core\Imaging\Icon;
53
use TYPO3\CMS\Core\Imaging\IconFactory;
54
use TYPO3\CMS\Core\Site\Entity\Site;
55
use TYPO3\CMS\Core\Type\Bitmask\Permission;
56
use TYPO3\CMS\Core\TypoScript\Parser\TypoScriptParser;
57
use TYPO3\CMS\Core\Utility\DebugUtility;
58
use TYPO3\CMS\Core\Utility\GeneralUtility;
59
use TYPO3\CMS\Core\Utility\MathUtility;
60
use TYPO3\CMS\Extbase\Object\ObjectManager;
61
use TYPO3\CMS\Frontend\Page\PageRepository;
62
63
/**
64
 * Class CrawlerController
65
 *
66
 * @package AOE\Crawler\Controller
67
 */
68
class CrawlerController implements LoggerAwareInterface
69
{
70
    use LoggerAwareTrait;
71
    use PublicMethodDeprecationTrait;
72
73
    public const CLI_STATUS_NOTHING_PROCCESSED = 0;
74
75
    public const CLI_STATUS_REMAIN = 1; //queue not empty
76
77
    public const CLI_STATUS_PROCESSED = 2; //(some) queue items where processed
78
79
    public const CLI_STATUS_ABORTED = 4; //instance didn't finish
80
81
    public const CLI_STATUS_POLLABLE_PROCESSED = 8;
82
83
    /**
84
     * @var integer
85
     */
86
    public $setID = 0;
87
88
    /**
89
     * @var string
90
     */
91
    public $processID = '';
92
93
    /**
94
     * @var array
95
     */
96
    public $duplicateTrack = [];
97
98
    /**
99
     * @var array
100
     */
101
    public $downloadUrls = [];
102
103
    /**
104
     * @var array
105
     */
106
    public $incomingProcInstructions = [];
107
108
    /**
109
     * @var array
110
     */
111
    public $incomingConfigurationSelection = [];
112
113
    /**
114
     * @var bool
115
     */
116
    public $registerQueueEntriesInternallyOnly = false;
117
118
    /**
119
     * @var array
120
     */
121
    public $queueEntries = [];
122
123
    /**
124
     * @var array
125
     */
126
    public $urlList = [];
127
128
    /**
129
     * @var array
130
     */
131
    public $extensionSettings = [];
132
133
    /**
134
     * Mount Point
135
     *
136
     * @var bool
137
     * Todo: Check what this is used for and adjust the type hint or code, as bool doesn't match the current code.
138
     */
139
    public $MP = false;
140
141
    /**
142
     * @var string
143
     */
144
    protected $processFilename;
145
146
    /**
147
     * Holds the internal access mode can be 'gui','cli' or 'cli_im'
148
     *
149
     * @var string
150
     */
151
    protected $accessMode;
152
153
    /**
154
     * @var QueueRepository
155
     */
156
    protected $queueRepository;
157
158
    /**
159
     * @var ProcessRepository
160
     */
161
    protected $processRepository;
162
163
    /**
164
     * @var ConfigurationRepository
165
     */
166
    protected $configurationRepository;
167
168
    /**
169
     * @var string
170
     */
171
    protected $tableName = 'tx_crawler_queue';
172
173
    /**
174
     * @var QueueExecutor
175
     */
176
    protected $queueExecutor;
177
178
    /**
179
     * @var int
180
     */
181
    protected $maximumUrlsToCompile = 10000;
182
183
    /**
184
     * @var IconFactory
185
     */
186
    protected $iconFactory;
187
188
    /**
189
     * @var string[]
190
     */
191
    private $deprecatedPublicMethods = [
0 ignored issues
show
introduced by
The private property $deprecatedPublicMethods is not used, and could be removed.
Loading history...
192
        'getLogEntriesForSetId' => 'Using crawlerController::getLogEntriesForSetId() is deprecated since 9.0.1 and will be removed in v11.x',
193
        'flushQueue' => 'Using CrawlerController::flushQueue() is deprecated since 9.0.1 and will be removed in v11.x, please use QueueRepository->flushQueue() instead.',
194
        'cleanUpOldQueueEntries' => 'Using CrawlerController::cleanUpOldQueueEntries() is deprecated since 9.0.1 and will be removed in v11.x, please use QueueRepository->cleanUpOldQueueEntries() instead.',
195
    ];
196
197
    /**
198
     * @var BackendUserAuthentication|null
199
     */
200
    private $backendUser;
201
202
    /**
203
     * @var integer
204
     */
205
    private $scheduledTime = 0;
206
207
    /**
208
     * @var integer
209
     */
210
    private $reqMinute = 0;
211
212
    /**
213
     * @var bool
214
     */
215
    private $submitCrawlUrls = false;
216
217
    /**
218
     * @var bool
219
     */
220
    private $downloadCrawlUrls = false;
221
222
    /************************************
223
     *
224
     * Getting URLs based on Page TSconfig
225
     *
226
     ************************************/
227
228
    /**
     * Creates the collaborators (repositories, queue executor, icon factory),
     * sets the default lock file path and loads the extension configuration.
     *
     * Numeric settings are normalised afterwards: "countInARun" falls back to 100,
     * "processLimit" is forced into 1..99 (default 1) and "maxCompileUrls" into
     * 1..1000000000 (default 10000).
     */
    public function __construct()
    {
        $objectManager = GeneralUtility::makeInstance(ObjectManager::class);
        $crawlStrategyFactory = GeneralUtility::makeInstance(CrawlStrategyFactory::class);
        $this->queueRepository = $objectManager->get(QueueRepository::class);
        $this->processRepository = $objectManager->get(ProcessRepository::class);
        $this->configurationRepository = $objectManager->get(ConfigurationRepository::class);
        $this->queueExecutor = GeneralUtility::makeInstance(QueueExecutor::class, $crawlStrategyFactory);
        $this->iconFactory = GeneralUtility::makeInstance(IconFactory::class);

        $this->processFilename = Environment::getVarPath() . '/lock/tx_crawler.proc';

        /** @var ExtensionConfigurationProvider $configurationProvider */
        $configurationProvider = GeneralUtility::makeInstance(ExtensionConfigurationProvider::class);
        $settings = $configurationProvider->getExtensionConfiguration();
        $this->extensionSettings = is_array($settings) ? $settings : [];

        // Set defaults. Missing keys are read as 0 so that an incomplete
        // configuration does not raise "undefined index" errors; 0 still
        // triggers the fallback values below.
        if (MathUtility::convertToPositiveInteger($this->extensionSettings['countInARun'] ?? 0) === 0) {
            $this->extensionSettings['countInARun'] = 100;
        }

        $this->extensionSettings['processLimit'] = MathUtility::forceIntegerInRange(
            $this->extensionSettings['processLimit'] ?? 0,
            1,
            99,
            1
        );
        $this->maximumUrlsToCompile = MathUtility::forceIntegerInRange(
            $this->extensionSettings['maxCompileUrls'] ?? 0,
            1,
            1000000000,
            10000
        );
    }
253
254
    /**
     * Returns the current access mode, which can be 'gui', 'cli' or 'cli_im'.
     *
     * The property has no initial value, so null is returned until
     * setAccessMode() has been called.
     *
     * @return string|null
     */
    public function getAccessMode()
    {
        return $this->accessMode;
    }
263
264
    /**
     * Stores the access mode this controller runs in.
     *
     * @param string $accessMode One of 'gui', 'cli' or 'cli_im'
     */
    public function setAccessMode($accessMode): void
    {
        $this->accessMode = $accessMode;
    }
271
272
    /**
     * Switches the "disabled" state, which is represented by the existence of
     * the process lock file.
     *
     * @param bool $disabled true (default) creates the lock file, false removes it
     */
    public function setDisabled($disabled = true): void
    {
        if ($disabled) {
            // An empty file is sufficient, only its existence is checked
            GeneralUtility::writeFile($this->processFilename, '');
            return;
        }

        if (is_file($this->processFilename)) {
            unlink($this->processFilename);
        }
    }
287
288
    /**
     * Tells whether processing is disabled, i.e. whether the process lock file exists.
     *
     * @return bool true if disabled
     */
    public function getDisabled()
    {
        return is_file($this->processFilename);
    }
297
298
    /**
     * Overrides the location of the process lock file.
     *
     * @param string $filenameWithPath File name including its path
     */
    public function setProcessFilename($filenameWithPath): void
    {
        $this->processFilename = $filenameWithPath;
    }
305
306
    /**
     * Returns the location of the process lock file.
     *
     * @return string File name including its path
     */
    public function getProcessFilename()
    {
        return $this->processFilename;
    }
313
314
    /**
     * Replaces the extension settings
     * (unserialized pendant of $TYPO3_CONF_VARS['EXT']['extConf']['crawler']).
     *
     * The array is stored as given; the defaults applied in the constructor
     * are not re-applied here.
     */
    public function setExtensionSettings(array $extensionSettings): void
    {
        $this->extensionSettings = $extensionSettings;
    }
321
322
    /**
     * Check if the given page should be crawled.
     *
     * The checks run in this order and the first match wins:
     * 1. hidden pages (unless the "crawlHiddenPages" setting is enabled)
     * 2. the hard-coded doktypes 3, 4, 199, 254 and 255
     * 3. doktype lists registered in $GLOBALS['TYPO3_CONF_VARS']['EXTCONF']['crawler']['excludeDoktype']
     * 4. veto hooks registered in $GLOBALS['TYPO3_CONF_VARS']['EXTCONF']['crawler']['pageVeto']
     *
     * @param array $pageRow Page record, read for its "hidden" and "doktype" columns
     * @return false|string false if the page should be crawled (not excluded), the skip message if it should be skipped
     */
    public function checkIfPageShouldBeSkipped(array $pageRow)
    {
        // if page is hidden
        if (empty($this->extensionSettings['crawlHiddenPages']) && ! empty($pageRow['hidden'])) {
            return 'Because page is hidden';
        }

        if (GeneralUtility::inList('3,4,199,254,255', $pageRow['doktype'])) {
            return 'Because doktype is not allowed';
        }

        foreach ($GLOBALS['TYPO3_CONF_VARS']['EXTCONF']['crawler']['excludeDoktype'] ?? [] as $key => $doktypeList) {
            if (GeneralUtility::inList($doktypeList, $pageRow['doktype'])) {
                return 'Doktype was excluded by "' . $key . '"';
            }
        }

        // veto hook
        foreach ($GLOBALS['TYPO3_CONF_VARS']['EXTCONF']['crawler']['pageVeto'] ?? [] as $key => $func) {
            $params = [
                'pageRow' => $pageRow,
            ];
            // expects "false" if page is ok and "true" or a skipMessage if this page should _not_ be crawled
            $veto = GeneralUtility::callUserFunction($func, $params, $this);
            if ($veto === false) {
                continue;
            }

            // no need to execute other hooks if a previous one returned a veto
            if (is_string($veto)) {
                return $veto;
            }
            // Hooks registered with "[]" have integer keys; the cast avoids a
            // TypeError from htmlspecialchars() under strict_types.
            return 'Veto from hook "' . htmlspecialchars((string) $key) . '"';
        }

        return false;
    }
380
381
    /**
     * Wrapper method for getUrlsForPageId()
     * It returns an array of configurations and no urls!
     *
     * @param array $pageRow Page record with at least dok-type and uid columns.
     * @param string $skipMessage Passed by reference: set to the reason when the page is skipped, otherwise to ''
     * @return array Configurations for the page, empty if the page is skipped
     * @see getUrlsForPageId()
     */
    public function getUrlsForPageRow(array $pageRow, &$skipMessage = '')
    {
        $message = $this->checkIfPageShouldBeSkipped($pageRow);
        if ($message !== false) {
            $skipMessage = $message;
            return [];
        }

        // getUrlsForPageId() is declared with an int parameter and this file uses
        // strict_types, so a uid delivered as string must be cast explicitly.
        $res = $this->getUrlsForPageId((int) $pageRow['uid']);
        $skipMessage = '';

        return $res;
    }
403
404
    /**
     * Creates a list of URLs from input array (and submits them to queue if asked for)
     * See Web > Info module script + "indexed_search"'s crawler hook-client using this!
     *
     * @param array $vv Information about URLs from pageRow to crawl.
     * @param array $pageRow Page row
     * @param int $scheduledTime Unix time to schedule indexing to, typically time()
     * @param int $reqMinute Number of requests per minute (creates the interleave between requests); values <= 0 disable the interleave
     * @param bool $submitCrawlUrls If set, submits the URLs to queue
     * @param bool $downloadCrawlUrls If set (and submitcrawlUrls is false) will fill $downloadUrls with entries)
     * @param array $duplicateTrack Array which is passed by reference and contains an id per url to ensure we will not crawl duplicates
     * @param array $downloadUrls Array which will be filled with URLS for download if flag is set.
     * @param array $incomingProcInstructions Array of processing instructions
     * @return string List of URLs (meant for display in backend module)
     */
    public function urlListFromUrlArray(
        array $vv,
        array $pageRow,
        $scheduledTime,
        $reqMinute,
        $submitCrawlUrls,
        $downloadCrawlUrls,
        array &$duplicateTrack,
        array &$downloadUrls,
        array $incomingProcInstructions
    ) {
        if (! is_array($vv['URLs'] ?? null)) {
            return 'ERROR - no URL generated';
        }
        $urlLog = [];
        $pageId = (int) $pageRow['uid'];
        $configurationHash = $this->getConfigurationHash($vv);
        $skipInnerCheck = $this->queueRepository->noUnprocessedQueueEntriesForPageWithConfigurationHashExist($pageId, $configurationHash);

        $urlService = new UrlService();

        // These values do not change per URL, so they are resolved once.
        // Missing sub configuration keys are treated as empty strings.
        $procInstrFilter = $vv['subCfg']['procInstrFilter'] ?? '';
        $userGroups = $vv['subCfg']['userGroups'] ?? '';

        // The filter does not depend on the URL: if it rejects the configuration,
        // every URL would be skipped and the resulting list would be empty.
        if (! $this->drawURLs_PIfilter($procInstrFilter, $incomingProcInstructions)) {
            return '';
        }

        // Seconds between two requests; no interleave (and no division by zero)
        // when no positive request rate is given.
        $secondsPerRequest = $reqMinute > 0 ? 60 / $reqMinute : 0;

        foreach ($vv['URLs'] as $urlQuery) {
            $url = (string) $urlService->getUrlFromPageAndQueryParameters(
                $pageId,
                $urlQuery,
                $vv['subCfg']['baseUrl'] ?? null,
                $vv['subCfg']['force_ssl'] ?? 0
            );

            // Create key by which to determine unique-ness:
            $uKey = $url . '|' . $userGroups . '|' . $procInstrFilter;

            if (isset($duplicateTrack[$uKey])) {
                // if the url key is registered just display it and do not resubmit it
                $urlLog[] = '<em><span class="text-muted">' . htmlspecialchars($url) . '</span></em>';
            } else {
                // Scheduled time, rounded down to the full minute:
                $schTime = $scheduledTime + round(count($duplicateTrack) * $secondsPerRequest);
                $schTime = intval($schTime / 60) * 60;
                $formattedDate = BackendUtility::datetime($schTime);
                $this->urlList[] = '[' . $formattedDate . '] ' . $url;
                $urlList = '[' . $formattedDate . '] ' . htmlspecialchars($url);

                // Submit for crawling!
                if ($submitCrawlUrls) {
                    // NOTE(review): the queue entry is added with $scheduledTime while the
                    // displayed time is the interleaved $schTime - confirm this is intended.
                    $added = $this->addUrl(
                        $pageId,
                        $url,
                        $vv['subCfg'],
                        $scheduledTime,
                        $configurationHash,
                        $skipInnerCheck
                    );
                    if ($added === false) {
                        $urlList .= ' (URL already existed)';
                    }
                } elseif ($downloadCrawlUrls) {
                    $downloadUrls[$url] = $url;
                }
                $urlLog[] = $urlList;
            }
            $duplicateTrack[$uKey] = true;
        }

        return implode('<br>', $urlLog);
    }
488
489
    /**
     * Checks a processing instruction list against the incoming instructions.
     *
     * @param string $piString Comma-separated list of processing instructions to test
     * @param array $incomingProcInstructions Processing instructions; an empty array accepts everything
     * @return boolean true if no incoming instructions are given or at least one of them is contained in $piString
     */
    public function drawURLs_PIfilter($piString, array $incomingProcInstructions)
    {
        if ($incomingProcInstructions === []) {
            return true;
        }

        foreach ($incomingProcInstructions as $instruction) {
            $isListed = GeneralUtility::inList($piString, $instruction);
            if ($isListed) {
                return true;
            }
        }

        return false;
    }
509
510
    /**
     * Returns the page TSconfig for a page, or for the mount point page if
     * $this->MP is set, after passing it through the registered hooks.
     *
     * Hooks registered in
     * $GLOBALS['TYPO3_CONF_VARS']['EXTCONF']['crawler']['getPageTSconfigForId']
     * receive the configuration by reference (key "pageTSConfig") and may alter it.
     *
     * @param int $id Page uid
     */
    public function getPageTSconfigForId($id): array
    {
        if (! $this->MP) {
            $pageTSconfig = BackendUtility::getPagesTSconfig($id);
        } else {
            // $this->MP is documented as bool but is split like a "<a>-<b>" string here.
            // The string cast keeps explode() from throwing a TypeError under strict_types
            // and a value without "-" falls back to 0 instead of reading a missing index.
            // TODO: Clarify the intended type of $this->MP.
            $mountPointParts = explode('-', (string) $this->MP);
            $mountPointId = (int) ($mountPointParts[1] ?? 0);
            $pageTSconfig = BackendUtility::getPagesTSconfig($mountPointId);
        }

        // Call a hook to alter configuration
        $hooks = $GLOBALS['TYPO3_CONF_VARS']['EXTCONF']['crawler']['getPageTSconfigForId'] ?? null;
        if (is_array($hooks)) {
            $params = [
                'pageId' => $id,
                'pageTSConfig' => &$pageTSconfig,
            ];
            foreach ($hooks as $userFunc) {
                GeneralUtility::callUserFunction($userFunc, $params, $this);
            }
        }
        return $pageTSconfig;
    }
532
533
    /**
     * This method returns an array of configurations.
     * Adds no urls!
     *
     * Configurations are collected from two sources, keyed by their name:
     * 1. the "tx_crawler.crawlerCfg.paramSets." section of the page TSconfig
     * 2. tx_crawler_configuration records found up the rootline; these never
     *    overwrite an entry with the same key that is already present
     * Finally the hooks in
     * $GLOBALS['TYPO3_CONF_VARS']['EXTCONF']['crawler']['processUrls'] may alter
     * the result, which they receive by reference (key "res").
     *
     * @param int $pageId Uid of the page to collect configurations for
     * @return array Configurations with the keys "subCfg", "paramParsed", "paramExpanded", "URLs" and "origin"
     */
    public function getUrlsForPageId(int $pageId): array
    {
        // Get page TSconfig for page ID
        $pageTSconfig = $this->getPageTSconfigForId($pageId);

        $res = [];

        // Fetch Crawler Configuration from pageTSconfig
        $crawlerCfg = $pageTSconfig['tx_crawler.']['crawlerCfg.']['paramSets.'] ?? [];
        foreach ($crawlerCfg as $key => $values) {
            if (! is_array($values)) {
                continue;
            }
            $key = str_replace('.', '', (string) $key);
            // Sub configuration for a single configuration string:
            $subCfg = (array) $crawlerCfg[$key . '.'];
            $subCfg['key'] = $key;

            if (strcmp($subCfg['procInstrFilter'] ?? '', '')) {
                $subCfg['procInstrFilter'] = implode(',', GeneralUtility::trimExplode(',', $subCfg['procInstrFilter']));
            }
            // "pidsOnly" is optional, a missing value means "not page-specific"
            $pidsOnly = (string) ($subCfg['pidsOnly'] ?? '');
            $pidOnlyList = implode(',', GeneralUtility::trimExplode(',', $pidsOnly, true));

            // process configuration if it is not page-specific or if the specific page is the current page:
            // TODO: Check if $pidOnlyList can be kept as Array instead of imploded
            if ($pidsOnly !== '' && ! GeneralUtility::inList($pidOnlyList, strval($pageId))) {
                continue;
            }

            // Explode, process etc.:
            $res[$key] = [];
            $res[$key]['subCfg'] = $subCfg;
            // The parameter string may be absent when only the sub configuration ("key.") is defined
            $res[$key]['paramParsed'] = GeneralUtility::explodeUrl2Array($crawlerCfg[$key] ?? '');
            $res[$key]['paramExpanded'] = $this->expandParameters($res[$key]['paramParsed'], $pageId);
            $res[$key]['origin'] = 'pagets';

            // recognize MP value
            if (! $this->MP) {
                $res[$key]['URLs'] = $this->compileUrls($res[$key]['paramExpanded'], ['?id=' . $pageId]);
            } else {
                $res[$key]['URLs'] = $this->compileUrls($res[$key]['paramExpanded'], ['?id=' . $pageId . '&MP=' . $this->MP]);
            }
        }

        // Get configuration from tx_crawler_configuration records up the rootline
        $crawlerConfigurations = $this->configurationRepository->getCrawlerConfigurationRecordsFromRootLine($pageId);
        foreach ($crawlerConfigurations as $configurationRecord) {
            // check access to the configuration record
            $hasAccess = empty($configurationRecord['begroups'])
                || $this->getBackendUser()->isAdmin()
                || $this->hasGroupAccess($this->getBackendUser()->user['usergroup_cached_list'], $configurationRecord['begroups']);
            if (! $hasAccess) {
                continue;
            }

            // The cast keeps a NULL column value from raising a TypeError under strict_types
            $pidsOnly = (string) ($configurationRecord['pidsonly'] ?? '');
            $pidOnlyList = implode(',', GeneralUtility::trimExplode(',', $pidsOnly, true));

            // process configuration if it is not page-specific or if the specific page is the current page:
            // TODO: Check if $pidOnlyList can be kept as Array instead of imploded
            if ($pidsOnly !== '' && ! GeneralUtility::inList($pidOnlyList, strval($pageId))) {
                continue;
            }

            $key = $configurationRecord['name'];

            // don't overwrite previously defined paramSets
            if (isset($res[$key])) {
                continue;
            }

            /* @var $TSparserObject \TYPO3\CMS\Core\TypoScript\Parser\TypoScriptParser */
            $TSparserObject = GeneralUtility::makeInstance(TypoScriptParser::class);
            $TSparserObject->parse($configurationRecord['processing_instruction_parameters_ts']);

            $subCfg = [
                'procInstrFilter' => $configurationRecord['processing_instruction_filter'],
                'procInstrParams.' => $TSparserObject->setup,
                'baseUrl' => $configurationRecord['base_url'],
                'force_ssl' => (int) $configurationRecord['force_ssl'],
                'userGroups' => $configurationRecord['fegroups'],
                'exclude' => $configurationRecord['exclude'],
                'key' => $key,
            ];

            // skip the record if the current page is part of its exclude definition
            if (in_array($pageId, $this->expandExcludeString($subCfg['exclude']), true)) {
                continue;
            }

            $res[$key] = [];
            $res[$key]['subCfg'] = $subCfg;
            $res[$key]['paramParsed'] = GeneralUtility::explodeUrl2Array($configurationRecord['configuration']);
            $res[$key]['paramExpanded'] = $this->expandParameters($res[$key]['paramParsed'], $pageId);
            $res[$key]['URLs'] = $this->compileUrls($res[$key]['paramExpanded'], ['?id=' . $pageId]);
            $res[$key]['origin'] = 'tx_crawler_configuration_' . $configurationRecord['uid'];
        }

        foreach ($GLOBALS['TYPO3_CONF_VARS']['EXTCONF']['crawler']['processUrls'] ?? [] as $func) {
            $params = [
                'res' => &$res,
            ];
            GeneralUtility::callUserFunction($func, $params, $this);
        }
        return $res;
    }
631
632
    /**
     * Collects the names of all configurations available for a page branch.
     *
     * The names come from the "paramSets." of the page TSconfig of $rootid and
     * from tx_crawler_configuration records whose pid is either on the rootline
     * of $rootid or part of the page tree fetched below it.
     * TODO: Write Functional Tests
     *
     * @param int $rootid Uid of the page the branch starts at
     * @param int $depth Depth passed on to PageTreeView::getTree()
     * @return array List of configuration names
     */
    public function getConfigurationsForBranch(int $rootid, int $depth): array
    {
        $names = [];
        $tsConfig = $this->getPageTSconfigForId($rootid);
        foreach ($tsConfig['tx_crawler.']['crawlerCfg.']['paramSets.'] ?? [] as $setKey => $setValue) {
            if (is_array($setValue)) {
                // drop a trailing dot from the key to get the plain name
                $names[] = substr($setKey, -1) === '.' ? substr($setKey, 0, -1) : $setKey;
            }
        }

        // Page uids of the rootline ...
        $pageIds = [];
        foreach (BackendUtility::BEgetRootLine($rootid) as $rootLinePage) {
            $pageIds[] = $rootLinePage['uid'];
        }

        // ... plus the ones of the page tree, limited to pages the backend user may see
        /* @var PageTreeView $pageTree */
        $pageTree = GeneralUtility::makeInstance(PageTreeView::class);
        $permissionClause = $this->getBackendUser()->getPagePermsClause(Permission::PAGE_SHOW);
        $pageTree->init(empty($permissionClause) ? '' : ('AND ' . $permissionClause));
        $pageTree->getTree($rootid, $depth, '');
        foreach ($pageTree->tree as $treeNode) {
            $pageIds[] = $treeNode['row']['uid'];
        }

        $queryBuilder = $this->getQueryBuilder('tx_crawler_configuration');
        $pidConstraint = $queryBuilder->expr()->in(
            'pid',
            $queryBuilder->createNamedParameter($pageIds, Connection::PARAM_INT_ARRAY)
        );
        $result = $queryBuilder
            ->select('name')
            ->from('tx_crawler_configuration')
            ->where($pidConstraint)
            ->execute();

        while ($record = $result->fetch()) {
            $names[] = $record['name'];
        }

        return $names;
    }
675
676
    /**
     * Check if a user has access to an item
     * (e.g. get the group list of the current logged in user from $GLOBALS['TSFE']->gr_list)
     *
     * @param string $groupList Comma-separated list of (fe_)group UIDs from a user
     * @param string $accessList Comma-separated list of (fe_)group UIDs of the item to access
     * @return bool TRUE if the access list is empty or at least one of the user's group UIDs is in it
     * @see \TYPO3\CMS\Frontend\Page\PageRepository::getMultipleGroupsWhereClause()
     */
    public function hasGroupAccess($groupList, $accessList)
    {
        // an item without access restriction is available to everybody
        if (empty($accessList)) {
            return true;
        }

        $userGroupUids = GeneralUtility::intExplode(',', $groupList);
        foreach ($userGroupUids as $userGroupUid) {
            if (GeneralUtility::inList($accessList, $userGroupUid)) {
                return true;
            }
        }

        return false;
    }
697
698
    /**
     * Expands the parameter configuration into individual values. This follows a certain syntax of the value of each parameter.
     * Syntax of values:
     * - Basically: If the (trimmed) value is wrapped in [...] it will be expanded according to the following syntax, otherwise the value is taken literally
     * - Configuration is split by "|" and the parts are processed individually and finally added together
     * - For each configuration part:
     *         - "[int]-[int]" = Integer range, will be expanded to all values in between, values included, starting from low to high (max. 1000). Example "1-34" or "-40--30"
     *         - "_TABLE:[TCA table name];[_PID:[optional page id, default is current page]];[_ENABLELANG:1]" = Look up of table records from PID, filtering out deleted records. Example "_TABLE:tt_content; _PID:123"
     *           _ENABLELANG:1 picks only original records without their language overlays
     *           Further sub-parameters read by this method: _RECURSIVE, _PIDFIELD, _WHERE, _ADDTABLE and _FIELD (defaults to "uid")
     *         - Default: Literal value
     *
     * After every part the hook registered under
     * $GLOBALS['TYPO3_CONF_VARS']['SC_OPTIONS']['crawler/class.tx_crawler_lib.php']['expandParameters'] is called.
     *
     * @param array $paramArray Array with key (GET var name) and values (value of GET var which is configuration for expansion)
     * @param integer $pid Current page ID
     * @return array Same keys as the input, each value replaced by an array of expanded values
     *
     * TODO: Write Functional Tests
     */
    public function expandParameters($paramArray, $pid)
    {
        // Traverse parameter names:
        foreach ($paramArray as $p => $v) {
            // Cast first: this file uses strict_types, so trim() rejects non-string scalars.
            $v = trim((string) $v);

            // If value is encapsulated in square brackets it means there are some ranges of values to find, otherwise the value is literal
            if (strpos($v, '[') === 0 && substr($v, -1) === ']') {
                // So, find the value inside brackets and reset the paramArray value as an array.
                $v = substr($v, 1, -1);
                $paramArray[$p] = [];

                // Explode parts and traverse them:
                $parts = explode('|', $v);
                foreach ($parts as $pV) {
                    // Look for integer range: (fx. 1-34 or -40--30 // reads minus 40 to minus 30)
                    if (preg_match('/^(-?[0-9]+)\s*-\s*(-?[0-9]+)$/', trim($pV), $reg)) {
                        $reg = $this->swapIfFirstIsLargerThanSecond($reg);

                        // Traverse range, add values:
                        $runAwayBrake = 1000; // Limit to size of range!
                        for ($a = $reg[1]; $a <= $reg[2]; $a++) {
                            $paramArray[$p][] = $a;
                            $runAwayBrake--;
                            if ($runAwayBrake <= 0) {
                                break;
                            }
                        }
                    } elseif (strpos(trim($pV), '_TABLE:') === 0) {
                        // Parse parameters ("key:value" pairs separated by ";"):
                        $subparts = GeneralUtility::trimExplode(';', $pV);
                        $subpartParams = [];
                        foreach ($subparts as $spV) {
                            $keyAndValue = GeneralUtility::trimExplode(':', $spV);
                            // A sub part without ":" carries no value; store null instead of reading a missing offset.
                            $subpartParams[$keyAndValue[0]] = $keyAndValue[1] ?? null;
                        }

                        $tableName = $subpartParams['_TABLE'] ?? '';

                        // Table exists:
                        if (isset($GLOBALS['TCA'][$tableName])) {
                            $lookUpPid = (int) ($subpartParams['_PID'] ?? $pid);
                            $recursiveDepth = (int) ($subpartParams['_RECURSIVE'] ?? 0);
                            $pidField = trim($subpartParams['_PIDFIELD'] ?? 'pid');
                            $where = $subpartParams['_WHERE'] ?? '';
                            $addTable = $subpartParams['_ADDTABLE'] ?? '';

                            // _FIELD is optional; an absent or empty value falls back to "uid".
                            $fieldName = ! empty($subpartParams['_FIELD']) ? $subpartParams['_FIELD'] : 'uid';
                            if ($fieldName === 'uid' || ! empty($GLOBALS['TCA'][$tableName]['columns'][$fieldName])) {
                                $queryBuilder = $this->getQueryBuilder($tableName);

                                if ($recursiveDepth > 0) {
                                    /** @var \TYPO3\CMS\Core\Database\QueryGenerator $queryGenerator */
                                    $queryGenerator = GeneralUtility::makeInstance(\TYPO3\CMS\Core\Database\QueryGenerator::class);
                                    $pidList = $queryGenerator->getTreeList($lookUpPid, $recursiveDepth, 0, 1);
                                    $pidArray = GeneralUtility::intExplode(',', $pidList);
                                } else {
                                    $pidArray = [(string) $lookUpPid];
                                }

                                // Only the "deleted" restriction is kept for the lookup.
                                $queryBuilder->getRestrictions()
                                    ->removeAll()
                                    ->add(GeneralUtility::makeInstance(DeletedRestriction::class));

                                $queryBuilder
                                    ->select($fieldName)
                                    ->from($tableName)
                                    ->where(
                                        $queryBuilder->expr()->in($pidField, $queryBuilder->createNamedParameter($pidArray, Connection::PARAM_INT_ARRAY)),
                                        $where
                                    );

                                if (! empty($addTable)) {
                                    // TODO: Check if this works as intended!
                                    $queryBuilder->add('from', $addTable);
                                }
                                $transOrigPointerField = $GLOBALS['TCA'][$tableName]['ctrl']['transOrigPointerField'] ?? '';

                                if (! empty($subpartParams['_ENABLELANG']) && $transOrigPointerField) {
                                    $queryBuilder->andWhere(
                                        $queryBuilder->expr()->lte(
                                            $transOrigPointerField,
                                            0
                                        )
                                    );
                                }

                                $statement = $queryBuilder->execute();

                                // Index by field value so that every value is collected only once.
                                $rows = [];
                                while ($row = $statement->fetch()) {
                                    $rows[$row[$fieldName]] = $row;
                                }

                                $paramArray[$p] = array_merge($paramArray[$p], array_keys($rows));
                            }
                        }
                    } else { // Just add value:
                        $paramArray[$p][] = $pV;
                    }

                    // Hook for processing own expandParameters place holder
                    $expandHooks = $GLOBALS['TYPO3_CONF_VARS']['SC_OPTIONS']['crawler/class.tx_crawler_lib.php']['expandParameters'] ?? null;
                    if (is_array($expandHooks)) {
                        $_params = [
                            'pObj' => &$this,
                            'paramArray' => &$paramArray,
                            'currentKey' => $p,
                            'currentValue' => $pV,
                            'pid' => $pid,
                        ];
                        foreach ($expandHooks as $_funcRef) {
                            GeneralUtility::callUserFunction($_funcRef, $_params, $this);
                        }
                    }
                }

                // Make unique set of values and sort the parameter array by its keys (the GET var names):
                $paramArray[$p] = array_unique($paramArray[$p]);
                ksort($paramArray);
            } else {
                // Set the literal value as only value in array:
                $paramArray[$p] = [$v];
            }
        }

        return $paramArray;
    }
843
844
    /**
     * Compiling URLs from parameter array (output of expandParameters())
     * The number of URLs will be the multiplication of the number of parameter values for each key,
     * limited by $this->maximumUrlsToCompile: compiling for the current parameter stops as soon as
     * the number of compiled URLs exceeds that limit.
     *
     * The method works recursively: it takes the first parameter off $paramArray, appends each of its
     * values to each URL and calls itself with the remaining parameters.
     *
     * @param array $paramArray Output of expandParameters(): Array with keys (GET var names) and for each an array of values
     * @param array $urls URLs accumulated in this array (for recursion)
     * @return array
     */
    public function compileUrls($paramArray, array $urls)
    {
        if (empty($paramArray)) {
            return $urls;
        }

        // Take the first parameter off the stack. unset() is used instead of array_shift()
        // because array_shift() renumbers integer-like keys and would change the names of
        // the remaining parameters.
        reset($paramArray);
        $varName = key($paramArray);
        $valueSet = $paramArray[$varName];
        unset($paramArray[$varName]);

        // Integer-like names are stored as int array keys; rawurlencode() needs a string (strict_types).
        $encodedName = rawurlencode((string) $varName);

        // Traverse value set:
        $newUrls = [];
        foreach ($urls as $url) {
            foreach ($valueSet as $val) {
                $value = (string) $val;
                // Empty values leave the URL untouched (no "&name=" is appended)
                $newUrls[] = $url . ($value !== '' ? '&' . $encodedName . '=' . rawurlencode($value) : '');

                if (count($newUrls) > $this->maximumUrlsToCompile) {
                    // Leave both loops; breaking only the inner loop would still add one URL per remaining base URL.
                    break 2;
                }
            }
        }

        return $this->compileUrls($paramArray, $newUrls);
    }
875
876
    /************************************
877
     *
878
     * Crawler log
879
     *
880
     ************************************/
881
882
    /**
     * Return array of records from crawler queue for input page ID
     *
     * The records are ordered by "scheduled", newest first.
     *
     * @param integer $id Page ID for which to look up log entries.
     * @param string $filter Filter: "pending" => all that is not yet run (exec_time = 0), "finished" => all complete ones (exec_time > 0), anything else => all entries
     * @param boolean $doFlush If TRUE, the queue repository is asked to flush the queue before the entries are selected
     * @param boolean $doFullFlush If TRUE (together with $doFlush), flushQueue() is called with "all" instead of $filter
     * @param integer $itemsPerPage Limit the amount of entries per page default is 10; values <= 0 disable the limit
     * @return array
     */
    public function getLogEntriesForPageId($id, $filter = '', $doFlush = false, $doFullFlush = false, $itemsPerPage = 10)
    {
        $queryBuilder = GeneralUtility::makeInstance(ConnectionPool::class)->getQueryBuilderForTable($this->tableName);
        $queryBuilder
            ->select('*')
            ->from($this->tableName)
            ->where(
                $queryBuilder->expr()->eq('page_id', $queryBuilder->createNamedParameter($id, \PDO::PARAM_INT))
            )
            ->orderBy('scheduled', 'DESC');

        switch ($filter) {
            case 'pending':
                $queryBuilder->andWhere($queryBuilder->expr()->eq('exec_time', 0));
                break;
            case 'finished':
                $queryBuilder->andWhere($queryBuilder->expr()->gt('exec_time', 0));
                break;
        }

        if ($doFlush) {
            // NOTE(review): the page id is not handed to flushQueue(), only the filter - confirm that
            // a flush that is not restricted to this page is intended.
            $this->queueRepository->flushQueue($doFullFlush ? 'all' : $filter);
        }

        if ($itemsPerPage > 0) {
            $queryBuilder->setMaxResults((int) $itemsPerPage);
        }

        return $queryBuilder->execute()->fetchAll();
    }
933
934
    /**
     * Return array of records from crawler queue for input set ID
     *
     * The records are ordered by "scheduled", newest first. When $doFlush is set nothing is
     * selected: the (deprecated) flushQueue() is called and an empty array is returned.
     *
     * @param int $set_id Set ID for which to look up log entries.
     * @param string $filter Filter: "pending" => all that is not yet run (exec_time = 0), "finished" => all complete ones (exec_time > 0), anything else => all entries
     * @param bool $doFlush If TRUE, then entries selected at DELETED(!) instead of selected!
     * @param bool $doFullFlush If TRUE (together with $doFlush), flushQueue() is called with an empty condition
     * @param int $itemsPerPage Limit the amount of entries per page default is 10; values <= 0 disable the limit
     * @return array
     *
     * @deprecated
     */
    public function getLogEntriesForSetId(int $set_id, string $filter = '', bool $doFlush = false, bool $doFullFlush = false, int $itemsPerPage = 10)
    {
        $connectionPool = GeneralUtility::makeInstance(ConnectionPool::class);

        $queryBuilder = $connectionPool->getQueryBuilderForTable($this->tableName);
        $queryBuilder
            ->select('*')
            ->from($this->tableName)
            ->where(
                $queryBuilder->expr()->eq('set_id', $queryBuilder->createNamedParameter($set_id, \PDO::PARAM_INT))
            )
            ->orderBy('scheduled', 'DESC');

        // The same constraints are collected a second time in an AND-composite,
        // which serves as the condition for flushQueue().
        $expressionBuilder = $connectionPool
            ->getConnectionForTable($this->tableName)
            ->getExpressionBuilder();
        $flushConstraints = $expressionBuilder->andX();

        if ($filter === 'pending') {
            $queryBuilder->andWhere($queryBuilder->expr()->eq('exec_time', 0));
            $flushConstraints->add($expressionBuilder->eq('exec_time', 0));
        } elseif ($filter === 'finished') {
            $queryBuilder->andWhere($queryBuilder->expr()->gt('exec_time', 0));
            $flushConstraints->add($expressionBuilder->gt('exec_time', 0));
        }

        if ($doFlush) {
            $addWhere = $flushConstraints->add($expressionBuilder->eq('set_id', $set_id));
            $this->flushQueue($doFullFlush ? '' : $addWhere);

            return [];
        }

        if ($itemsPerPage > 0) {
            $queryBuilder->setMaxResults($itemsPerPage);
        }

        return $queryBuilder->execute()->fetchAll();
    }
986
987
    /**
     * Adds a call back entry to the crawler queue (typically called from hooks, see indexed search class "class.crawler.php")
     *
     * The call back reference is stored inside the JSON encoded parameters under the key "_CALLBACKOBJ".
     *
     * @param integer $setId Set ID
     * @param array $params Parameters to pass to call back function; anything that is not an array is replaced by an empty array
     * @param string $callBack Call back object reference, eg. 'EXT:indexed_search/class.crawler.php:&tx_indexedsearch_crawler'
     * @param integer $page_id Page ID to attach it to
     * @param integer $schedule Time at which to activate; 0 (or any value casting to 0) means "now"
     */
    public function addQueueEntry_callBack($setId, $params, $callBack, $page_id = 0, $schedule = 0): void
    {
        $callBackParameters = is_array($params) ? $params : [];
        $callBackParameters['_CALLBACKOBJ'] = $callBack;

        // Fall back to the current time when no schedule time is given
        $scheduledAt = (int) $schedule;
        if ($scheduledAt === 0) {
            $scheduledAt = $this->getCurrentTime();
        }

        $queueRecord = [
            'page_id' => (int) $page_id,
            'parameters' => json_encode($callBackParameters),
            'scheduled' => $scheduledAt,
            'exec_time' => 0,
            'set_id' => (int) $setId,
            'result_data' => '',
        ];

        $queueConnection = GeneralUtility::makeInstance(ConnectionPool::class)->getConnectionForTable('tx_crawler_queue');
        $queueConnection->insert('tx_crawler_queue', $queueRecord);
    }
1016
1017
    /************************************
1018
     *
1019
     * URL setting
1020
     *
1021
     ************************************/
1022
1023
    /**
     * Setting a URL for crawling:
     *
     * Builds the queue record for the URL and either registers it internally only
     * ($this->registerQueueEntriesInternallyOnly) or inserts it into tx_crawler_queue, unless the
     * duplication check finds an equal entry. A signal is emitted for both the "added" and the
     * "duplicate" case.
     *
     * @param integer $id Page ID
     * @param string $url Complete URL
     * @param array $subCfg Sub configuration array (from TS config); the keys "userGroups", "procInstrFilter", "procInstrParams." and "key" are read and may be absent
     * @param integer $tstamp Scheduled-time
     * @param string $configurationHash (optional) configuration hash
     * @param bool $skipInnerDuplicationCheck (optional) skip inner duplication check
     * @return bool TRUE if the URL was inserted into the database queue, otherwise FALSE (also when it was only registered internally)
     */
    public function addUrl(
        $id,
        $url,
        array $subCfg,
        $tstamp,
        $configurationHash = '',
        $skipInnerDuplicationCheck = false
    ) {
        $urlAdded = false;
        $rows = [];

        // Creating parameters:
        $parameters = [
            'url' => $url,
        ];

        // fe user group simulation (the key is optional in the sub configuration):
        $uGs = implode(',', array_unique(GeneralUtility::intExplode(',', (string) ($subCfg['userGroups'] ?? ''), true)));
        if ($uGs) {
            $parameters['feUserGroupList'] = $uGs;
        }

        // Setting processing instructions
        $parameters['procInstructions'] = GeneralUtility::trimExplode(',', $subCfg['procInstrFilter'] ?? '');
        if (is_array($subCfg['procInstrParams.'] ?? null)) {
            $parameters['procInstrParams'] = $subCfg['procInstrParams.'];
        }

        // Compile value array:
        $parameters_serialized = json_encode($parameters);
        $fieldArray = [
            'page_id' => (int) $id,
            'parameters' => $parameters_serialized,
            'parameters_hash' => GeneralUtility::shortMD5($parameters_serialized),
            'configuration_hash' => $configurationHash,
            'scheduled' => $tstamp,
            'exec_time' => 0,
            'set_id' => (int) $this->setID,
            'result_data' => '',
            'configuration' => $subCfg['key'] ?? null,
        ];

        if ($this->registerQueueEntriesInternallyOnly) {
            //the entries will only be registered and not stored to the database
            $this->queueEntries[] = $fieldArray;
        } else {
            if (! $skipInnerDuplicationCheck) {
                // check if there is already an equal entry
                $rows = $this->getDuplicateRowsIfExist($tstamp, $fieldArray);
            }

            if (empty($rows)) {
                $connectionForCrawlerQueue = GeneralUtility::makeInstance(ConnectionPool::class)->getConnectionForTable('tx_crawler_queue');
                $connectionForCrawlerQueue->insert(
                    'tx_crawler_queue',
                    $fieldArray
                );
                $uid = $connectionForCrawlerQueue->lastInsertId('tx_crawler_queue', 'qid');
                $urlAdded = true;

                $signalPayload = ['uid' => $uid, 'fieldArray' => $fieldArray];
                SignalSlotUtility::emitSignal(
                    self::class,
                    SignalSlotUtility::SIGNAL_URL_ADDED_TO_QUEUE,
                    $signalPayload
                );
            } else {
                $signalPayload = ['rows' => $rows, 'fieldArray' => $fieldArray];
                SignalSlotUtility::emitSignal(
                    self::class,
                    SignalSlotUtility::SIGNAL_DUPLICATE_URL_IN_QUEUE,
                    $signalPayload
                );
            }
        }

        return $urlAdded;
    }
1113
1114
    /**
     * Returns the current system time as reported by time()
     *
     * @return int Unix timestamp
     */
    public function getCurrentTime()
    {
        $now = time();

        return $now;
    }
1123
1124
    /************************************
1125
     *
1126
     * URL reading
1127
     *
1128
     ************************************/
1129
1130
    /**
     * Read URL for single queue entry
     *
     * Loads the queue record, marks it as being executed (exec_time), lets the queue executor
     * process it and stores the JSON encoded result in the record. Signals are emitted before
     * and after the processing.
     *
     * @param integer $queueId
     * @param boolean $force If set, will process even if exec_time has been set!
     * @return integer Bit mask; self::CLI_STATUS_POLLABLE_PROCESSED is set when a registered "pollSuccess"
     *                 instruction reported success. 0 is also returned when no matching queue record was found.
     */
    public function readUrl($queueId, $force = false)
    {
        $queryBuilder = GeneralUtility::makeInstance(ConnectionPool::class)->getQueryBuilderForTable($this->tableName);
        $ret = 0;
        $this->logger->debug('crawler-readurl start ' . microtime(true));
        // Get entry:
        $queryBuilder
            ->select('*')
            ->from('tx_crawler_queue')
            ->where(
                $queryBuilder->expr()->eq('qid', $queryBuilder->createNamedParameter($queueId, \PDO::PARAM_INT))
            );
        if (! $force) {
            // Without force only entries that are scheduled for a process and not yet executed are picked up
            $queryBuilder
                ->andWhere('exec_time = 0')
                ->andWhere('process_scheduled > 0');
        }
        $queueRec = $queryBuilder->execute()->fetch();

        if (! is_array($queueRec)) {
            // No matching record: return the documented integer instead of null
            return $ret;
        }

        SignalSlotUtility::emitSignal(
            self::class,
            SignalSlotUtility::SIGNAL_QUEUEITEM_PREPROCESS,
            [$queueId, &$queueRec]
        );

        // Set exec_time to lock record:
        $field_array = ['exec_time' => $this->getCurrentTime()];

        if (isset($this->processID)) {
            //if multiprocessing is used we need to store the id of the process which has handled this entry
            $field_array['process_id_completed'] = $this->processID;
        }

        GeneralUtility::makeInstance(ConnectionPool::class)->getConnectionForTable('tx_crawler_queue')
            ->update(
                'tx_crawler_queue',
                $field_array,
                ['qid' => (int) $queueId]
            );

        $result = $this->queueExecutor->executeQueueItem($queueRec, $this);
        // NOTE(review): a non-array result is treated like missing content here - confirm against QueueExecutor.
        if (! is_array($result) || ! isset($result['content'])) {
            $resultData = 'An errors happened';
        } else {
            /** @var JsonCompatibilityConverter $jsonCompatibilityConverter */
            $jsonCompatibilityConverter = GeneralUtility::makeInstance(JsonCompatibilityConverter::class);
            $resultData = $jsonCompatibilityConverter->convert($result['content']);
        }

        //atm there's no need to point to specific pollable extensions
        $pollSuccess = $GLOBALS['TYPO3_CONF_VARS']['EXTCONF']['crawler']['pollSuccess'] ?? null;
        // $resultData is a plain string in the error case, so array offsets are only read from real arrays
        $procInstructions = is_array($resultData) ? ($resultData['parameters']['procInstructions'] ?? null) : null;
        if (is_array($pollSuccess) && is_array($procInstructions)) {
            foreach ($pollSuccess as $pollable) {
                // only check the success value if the instruction is running
                // it is important to name the pollSuccess key same as the procInstructions key
                if (in_array($pollable, $procInstructions, true) && ! empty($resultData['success'][$pollable])) {
                    $ret |= self::CLI_STATUS_POLLABLE_PROCESSED;
                }
            }
        }

        // Set result in log which also denotes the end of the processing of this entry.
        $field_array = ['result_data' => json_encode($result)];

        SignalSlotUtility::emitSignal(
            self::class,
            SignalSlotUtility::SIGNAL_QUEUEITEM_POSTPROCESS,
            [$queueId, &$field_array]
        );

        GeneralUtility::makeInstance(ConnectionPool::class)->getConnectionForTable('tx_crawler_queue')
            ->update(
                'tx_crawler_queue',
                $field_array,
                ['qid' => (int) $queueId]
            );

        $this->logger->debug('crawler-readurl stop ' . microtime(true));
        return $ret;
    }
1227
1228
    /**
     * Read URL for not-yet-inserted log-entry
     *
     * The entry is inserted into the queue table with exec_time set, executed by the queue
     * executor and finally updated with the JSON encoded result.
     *
     * @param array $field_array Queue field array,
     *
     * @return array|bool|mixed|string The result as returned by the queue executor
     */
    public function readUrlFromArray($field_array)
    {
        $queueConnection = GeneralUtility::makeInstance(ConnectionPool::class)->getConnectionForTable($this->tableName);

        // Set exec_time to lock record:
        $field_array['exec_time'] = $this->getCurrentTime();
        $queueConnection->insert($this->tableName, $field_array);

        // The executor receives the record including its freshly created queue id
        $queueId = $queueConnection->lastInsertId($this->tableName, 'qid');
        $field_array['qid'] = $queueId;

        $result = $this->queueExecutor->executeQueueItem($field_array, $this);

        // Set result in log which also denotes the end of the processing of this entry.
        $resultFields = ['result_data' => json_encode($result)];

        // The fields are handed over by reference in the signal arguments
        SignalSlotUtility::emitSignal(
            self::class,
            SignalSlotUtility::SIGNAL_QUEUEITEM_POSTPROCESS,
            [$queueId, &$resultFields]
        );

        $queueConnection->update($this->tableName, $resultFields, ['qid' => $queueId]);

        return $result;
    }
1265
1266
    /*****************************
1267
     *
1268
     * Compiling URLs to crawl - tools
1269
     *
1270
     *****************************/
1271
1272
    /**
     * Walks the page tree below (and including) the given page and collects the rows returned by
     * drawURLs_addRowsForPage() for every page. For mount point pages the mounted tree is processed as well.
     *
     * The incoming settings are stored in properties of this object first, and the duplicate
     * tracking and download URL lists are reset.
     *
     * @param integer $id Root page id to start from.
     * @param integer $depth Depth of tree, 0=only id-page, 1= on sublevel, 99 = infinite
     * @param integer $scheduledTime Unix Time when the URL is timed to be visited when put in queue
     * @param integer $reqMinute Number of requests per minute (creates the interleave between requests)
     * @param boolean $submitCrawlUrls If set, submits the URLs to queue in database (real crawling)
     * @param boolean $downloadCrawlUrls If set (and submitcrawlUrls is false) will fill $downloadUrls with entries)
     * @param array $incomingProcInstructions Array of processing instructions
     * @param array $configurationSelection Array of configuration keys
     * @return string Concatenated output of drawURLs_addRowsForPage() for all processed pages
     */
    public function getPageTreeAndUrls(
        $id,
        $depth,
        $scheduledTime,
        $reqMinute,
        $submitCrawlUrls,
        $downloadCrawlUrls,
        array $incomingProcInstructions,
        array $configurationSelection
    ) {
        $this->scheduledTime = $scheduledTime;
        $this->reqMinute = $reqMinute;
        $this->submitCrawlUrls = $submitCrawlUrls;
        $this->downloadCrawlUrls = $downloadCrawlUrls;
        $this->incomingProcInstructions = $incomingProcInstructions;
        $this->incomingConfigurationSelection = $configurationSelection;

        $this->duplicateTrack = [];
        $this->downloadUrls = [];

        // Drawing tree:
        /* @var PageTreeView $tree */
        $tree = GeneralUtility::makeInstance(PageTreeView::class);
        $perms_clause = $this->getBackendUser()->getPagePermsClause(Permission::PAGE_SHOW);
        $tree->init('AND ' . $perms_clause);

        $pageInfo = BackendUtility::readPageAccess($id, $perms_clause);
        if (is_array($pageInfo)) {
            // Set root row:
            $tree->tree[] = [
                'row' => $pageInfo,
                'HTML' => $this->iconFactory->getIconForRecord('pages', $pageInfo, Icon::SIZE_SMALL),
            ];
        }

        // Get branch beneath:
        if ($depth) {
            $tree->getTree($id, $depth, '');
        }

        // Traverse page tree:
        $code = '';

        foreach ($tree->tree as $data) {
            $this->MP = false;

            // recognize mount points
            // (cast first: depending on the database driver the doktype may be delivered as a string,
            // which would never be identical to the integer constant)
            if ((int) $data['row']['doktype'] === PageRepository::DOKTYPE_MOUNTPOINT) {
                $queryBuilder = GeneralUtility::makeInstance(ConnectionPool::class)->getQueryBuilderForTable('pages');
                $queryBuilder->getRestrictions()->removeAll()->add(GeneralUtility::makeInstance(DeletedRestriction::class));
                $mountpage = $queryBuilder
                    ->select('*')
                    ->from('pages')
                    ->where(
                        $queryBuilder->expr()->eq('uid', $queryBuilder->createNamedParameter($data['row']['uid'], \PDO::PARAM_INT))
                    )
                    ->execute()
                    ->fetchAll();
                $queryBuilder->resetRestrictions();

                // fetch mounted pages
                $this->MP = $mountpage[0]['mount_pid'] . '-' . $data['row']['uid'];

                $mountTree = GeneralUtility::makeInstance(PageTreeView::class);
                $mountTree->init('AND ' . $perms_clause);
                $mountTree->getTree($mountpage[0]['mount_pid'], $depth);

                foreach ($mountTree->tree as $mountData) {
                    $code .= $this->drawURLs_addRowsForPage(
                        $mountData['row'],
                        $mountData['HTML'] . BackendUtility::getRecordTitle('pages', $mountData['row'], true)
                    );
                }

                // replace page when mount_pid_ol is enabled
                if ($mountpage[0]['mount_pid_ol']) {
                    $data['row']['uid'] = $mountpage[0]['mount_pid'];
                } else {
                    // if the mount_pid_ol is not set the MP must not be used for the mountpoint page
                    $this->MP = false;
                }
            }

            $code .= $this->drawURLs_addRowsForPage(
                $data['row'],
                $data['HTML'] . BackendUtility::getRecordTitle('pages', $data['row'], true)
            );
        }

        return $code;
    }
1374
1375
    /**
     * Expands exclude string
     *
     * The exclude string is a comma separated list of parts in the form "[pid]" or "[pid]+[depth]".
     * A part without "+" excludes only the page itself; with "+" the sub pages down to the given
     * depth are added as well (an empty or zero depth after "+" means depth 99).
     *
     * Results are kept in static caches: one per exclude string, one per pid/depth combination.
     * NOTE(review): the caches do not take the backend user into account although the page tree is
     * initialised with the user's page permission clause - confirm this is acceptable.
     *
     * @param string $excludeString Exclude string
     * @return array Unique list of page uids (integers)
     */
    public function expandExcludeString($excludeString)
    {
        // internal static caches;
        static $expandedExcludeStringCache;
        static $treeCache;

        if (empty($expandedExcludeStringCache[$excludeString])) {
            $pidList = [];

            if (! empty($excludeString)) {
                /** @var PageTreeView $tree */
                $tree = GeneralUtility::makeInstance(PageTreeView::class);
                $tree->init('AND ' . $this->getBackendUser()->getPagePermsClause(Permission::PAGE_SHOW));

                $excludeParts = GeneralUtility::trimExplode(',', $excludeString);

                foreach ($excludeParts as $excludePart) {
                    $pidAndDepth = GeneralUtility::trimExplode('+', $excludePart);
                    // getTree() expects integers, and the depth is optional (no "+" => no second element)
                    $pid = (int) $pidAndDepth[0];
                    $rawDepth = $pidAndDepth[1] ?? '';

                    if (empty($rawDepth)) {
                        // default is "page only" = "depth=0"; a bare "+" means "all levels"
                        $depth = strpos($excludePart, '+') !== false ? 99 : 0;
                    } else {
                        $depth = (int) $rawDepth;
                    }

                    $pidList[] = $pid;

                    if ($depth > 0) {
                        if (empty($treeCache[$pid][$depth])) {
                            $tree->reset();
                            $tree->getTree($pid, $depth);
                            $treeCache[$pid][$depth] = $tree->tree;
                        }

                        foreach ($treeCache[$pid][$depth] as $data) {
                            $pidList[] = (int) $data['row']['uid'];
                        }
                    }
                }
            }

            $expandedExcludeStringCache[$excludeString] = array_unique($pidList);
        }

        return $expandedExcludeStringCache[$excludeString];
    }
1426
1427
    /**
     * Create the rows for display of the page tree
     * For each page a number of rows are shown displaying GET variable configuration
     *
     * @param array $pageRow Page record; its "uid" is checked against the exclude list of every configuration
     * @param string $pageTitle HTML for the title cell (inserted as-is, without further escaping)
     * @return string HTML table rows for this page
     */
    public function drawURLs_addRowsForPage(array $pageRow, string $pageTitle): string
    {
        // NOTE(review): presumably filled by reference in getUrlsForPageRow() - confirm against its signature
        $skipMessage = '';

        // Get list of configurations
        $configurations = $this->getUrlsForPageRow($pageRow, $skipMessage);

        if (! empty($this->incomingConfigurationSelection)) {
            // remove configuration that does not match the current selection
            foreach (array_keys($configurations) as $confKey) {
                if (! in_array($confKey, $this->incomingConfigurationSelection, true)) {
                    unset($configurations[$confKey]);
                }
            }
        }

        // Traverse parameter combinations:
        $c = 0;
        $content = '';
        if (! empty($configurations)) {
            foreach ($configurations as $confKey => $confArray) {
                $subCfg = $confArray['subCfg'] ?? [];
                // Array keys of numeric configuration names are integers; htmlspecialchars() needs a string
                $confKeyHtml = htmlspecialchars((string) $confKey);

                // Title column:
                if (! $c) {
                    $titleClm = '<td rowspan="' . count($configurations) . '">' . $pageTitle . '</td>';
                } else {
                    $titleClm = '';
                }

                // expandExcludeString() returns integers, so the uid must be an integer for the strict comparison
                $excludedPageIds = $this->expandExcludeString($subCfg['exclude'] ?? '');
                if (! in_array((int) $pageRow['uid'], $excludedPageIds, true)) {

                    // URL list:
                    $urlList = $this->urlListFromUrlArray(
                        $confArray,
                        $pageRow,
                        $this->scheduledTime,
                        $this->reqMinute,
                        $this->submitCrawlUrls,
                        $this->downloadCrawlUrls,
                        $this->duplicateTrack,
                        $this->downloadUrls,
                        $this->incomingProcInstructions // if empty the urls won't be filtered by processing instructions
                    );

                    // Expanded parameters:
                    $paramExpanded = '';
                    $calcAccu = [];
                    $calcRes = 1;
                    foreach ($confArray['paramExpanded'] ?? [] as $gVar => $gVal) {
                        $valueCount = count($gVal);
                        $paramExpanded .= '
                            <tr>
                                <td>' . htmlspecialchars('&' . $gVar . '=') . '<br/>' .
                            '(' . $valueCount . ')' .
                            '</td>
                                <td nowrap="nowrap">' . nl2br(htmlspecialchars(implode(chr(10), $gVal))) . '</td>
                            </tr>
                        ';
                        $calcRes *= $valueCount;
                        $calcAccu[] = $valueCount;
                    }
                    $paramExpanded = '<table>' . $paramExpanded . '</table>';
                    $paramExpanded .= 'Comb: ' . implode('*', $calcAccu) . '=' . $calcRes;

                    // Options (values are escaped because they end up in HTML)
                    $optionValues = '';
                    if (! empty($subCfg['userGroups'])) {
                        $optionValues .= 'User Groups: ' . htmlspecialchars((string) $subCfg['userGroups']) . '<br/>';
                    }
                    if (! empty($subCfg['procInstrFilter'])) {
                        $optionValues .= 'ProcInstr: ' . htmlspecialchars((string) $subCfg['procInstrFilter']) . '<br/>';
                    }

                    // Parsed parameters, one "&name=value" pair per line
                    $parsedParameters = nl2br(htmlspecialchars(rawurldecode(trim(str_replace(
                        '&',
                        chr(10) . '&',
                        GeneralUtility::implodeArrayForUrl('', $confArray['paramParsed'] ?? [])
                    )))));

                    // Compile row:
                    $content .= '
                        <tr>
                            ' . $titleClm . '
                            <td>' . $confKeyHtml . '</td>
                            <td>' . $parsedParameters . '</td>
                            <td>' . $paramExpanded . '</td>
                            <td nowrap="nowrap">' . $urlList . '</td>
                            <td nowrap="nowrap">' . $optionValues . '</td>
                            <td nowrap="nowrap">' . DebugUtility::viewArray($subCfg['procInstrParams.'] ?? null) . '</td>
                        </tr>';
                } else {
                    $content .= '<tr>
                            ' . $titleClm . '
                            <td>' . $confKeyHtml . '</td>
                            <td colspan="5"><em>No entries</em> (Page is excluded in this configuration)</td>
                        </tr>';
                }

                $c++;
            }
        } else {
            $message = ! empty($skipMessage) ? ' (' . $skipMessage . ')' : '';

            // Compile row:
            $content .= '
                <tr>
                    <td>' . $pageTitle . '</td>
                    <td colspan="6"><em>No entries</em>' . $message . '</td>
                </tr>';
        }

        return $content;
    }
1537
1538
    /*****************************
1539
     *
1540
     * CLI functions
1541
     *
1542
     *****************************/
1543
1544
    /**
     * Crawls one batch of queue entries (CLI entry point).
     *
     * Runs the CLI hooks, cleans up the queue, fetches up to $countInARun
     * entries, assigns them to the current process id and reads them one by one.
     *
     * @param int $countInARun Maximum number of queue entries fetched for this run
     * @param int $sleepTime Pause after each entry, passed to usleep()
     * @param int $sleepAfterFinish Pause after the batch, passed to sleep()
     * @return int Bit mask built from the readUrl() results and the CLI_STATUS_* flags
     */
    public function CLI_run(int $countInARun, int $sleepTime, int $sleepAfterFinish): int
    {
        $status = 0;

        // Hooks first, then queue housekeeping
        $this->CLI_runHooks();
        $this->queueRepository->cleanupQueue();

        $queueRows = $this->queueRepository->fetchRecordsToBeCrawled($countInARun);
        if (empty($queueRows)) {
            $this->CLI_debug('Nothing within queue which needs to be processed (' . $this->CLI_buildProcessId() . ')');
            return $status;
        }

        $queueIds = array_column($queueRows, 'qid');
        $processId = $this->CLI_buildProcessId();

        // Remember how many entries were assigned, to determine later how many have been processed
        $assignedCount = $this->queueRepository->updateProcessIdAndSchedulerForQueueIds($queueIds, $processId);
        $this->processRepository->updateProcessAssignItemsCount($assignedCount, $processId);

        if ($assignedCount !== count($queueIds)) {
            $this->CLI_debug('Nothing processed due to multi-process collision (' . $this->CLI_buildProcessId() . ')');
            return $status | self::CLI_STATUS_ABORTED;
        }

        $processedCount = 0;
        foreach ($queueRows as $queueRow) {
            $status |= $this->readUrl($queueRow['qid']);
            $processedCount++;

            // Just to relax the system
            usleep($sleepTime);

            // The CLI may have been disabled since the run started: leave right away
            // and do NOT mark the process as ended.
            if ($this->getDisabled()) {
                return $status | self::CLI_STATUS_ABORTED;
            }

            // Possible timeout: the process is no longer registered as active
            if (! $this->processRepository->isProcessActive($this->CLI_buildProcessId())) {
                $this->CLI_debug('conflict / timeout (' . $this->CLI_buildProcessId() . ')');
                $status |= self::CLI_STATUS_ABORTED;
                break;
            }
        }

        sleep($sleepAfterFinish);

        $this->CLI_debug('Rows: ' . $processedCount . ' (' . $this->CLI_buildProcessId() . ')');

        if ($processedCount > 0) {
            $status |= self::CLI_STATUS_PROCESSED;
        }

        return $status;
    }
1612
1613
    /**
     * Activate hooks
     *
     * Instantiates every class registered under
     * $GLOBALS['TYPO3_CONF_VARS']['EXTCONF']['crawler']['cli_hooks'] and calls
     * its crawler_init() method with this controller.
     */
    public function CLI_runHooks(): void
    {
        $hookClassNames = $GLOBALS['TYPO3_CONF_VARS']['EXTCONF']['crawler']['cli_hooks'] ?? [];

        foreach ($hookClassNames as $hookClassName) {
            $hook = GeneralUtility::makeInstance($hookClassName);
            if (! is_object($hook)) {
                continue;
            }
            $hook->crawler_init($this);
        }
    }
1625
1626
    /**
     * Try to acquire a new process with the given id
     * also performs some auto-cleanup for orphan processes
     *
     * Active process records whose ttl lies before the current time count as
     * orphans and are released at the end; all other active records count
     * against the configured "processLimit".
     *
     * @param string $id identification string for the process
     * @return bool true when a process record was inserted, false when the limit is reached or no system pid is available
     * @todo preemption might not be the most elegant way to clean up
     */
    public function CLI_checkAndAcquireNewProcess($id)
    {
        $queryBuilder = GeneralUtility::makeInstance(ConnectionPool::class)->getQueryBuilderForTable($this->tableName);

        $systemProcessId = getmypid();
        if ($systemProcessId < 1) {
            return false;
        }

        $activeProcesses = $queryBuilder
            ->select('process_id', 'ttl')
            ->from('tx_crawler_process')
            ->where(
                'active = 1 AND deleted = 0'
            )
            ->execute();

        $currentTime = $this->getCurrentTime();

        // Split the active records into still-running and expired (orphaned) ones
        $runningCount = 0;
        $expiredProcessIds = [];
        while ($process = $activeProcesses->fetch()) {
            if ($process['ttl'] < $currentTime) {
                $expiredProcessIds[] = $process['process_id'];
                continue;
            }
            $runningCount++;
        }

        $processLimit = (int) $this->extensionSettings['processLimit'];
        $acquired = $runningCount < $processLimit;

        if ($acquired) {
            // Fewer processes than allowed are active: register a new one
            $this->CLI_debug('add process ' . $this->CLI_buildProcessId() . ' (' . ($runningCount + 1) . '/' . $processLimit . ')');

            GeneralUtility::makeInstance(ConnectionPool::class)->getConnectionForTable('tx_crawler_process')->insert(
                'tx_crawler_process',
                [
                    'process_id' => $id,
                    'active' => 1,
                    'ttl' => $currentTime + (int) $this->extensionSettings['processMaxRunTime'],
                    'system_process_id' => $systemProcessId,
                ]
            );
        } else {
            $this->CLI_debug('Processlimit reached (' . ($runningCount) . '/' . $processLimit . ')');
        }

        $this->processRepository->deleteProcessesMarkedAsDeleted();
        $this->CLI_releaseProcesses($expiredProcessIds);

        return $acquired;
    }
1687
1688
    /**
     * Release a process and the required resources
     *
     * Before the requested ids are handled, two clean-up statements run:
     * queue rows that point to an inactive process get process_scheduled and
     * process_id reset, and inactive processes that are still referenced by
     * queue rows with exec_time = 0 get their system_process_id set to 0.
     *
     * @param mixed $releaseIds string with a single process-id or array with multiple process-ids
     * @return bool false when there is nothing to release, true otherwise
     */
    public function CLI_releaseProcesses($releaseIds)
    {
        $queryBuilder = GeneralUtility::makeInstance(ConnectionPool::class)->getQueryBuilderForTable($this->tableName);

        $processIds = is_array($releaseIds) ? $releaseIds : [$releaseIds];
        if (empty($processIds)) {
            // nothing to release
            return false;
        }

        // Some kind of 2nd chance algorithm: at least 2 processes are needed for a real cleanup,
        // which ensures that a single process can't mess up the entire process table.

        // Detach queue entries from processes that are not active any more
        $queryBuilder
            ->update($this->tableName, 'q')
            ->where(
                'q.process_id IN(SELECT p.process_id FROM tx_crawler_process as p WHERE p.active = 0)'
            )
            ->set('q.process_scheduled', 0)
            ->set('q.process_id', '')
            ->execute();

        // FIXME: Not entirely sure that this is equivalent to the previous version
        // The builder is reused below, so the SET part of the first statement has to go
        $queryBuilder->resetQueryPart('set');

        $queryBuilder
            ->update('tx_crawler_process')
            ->where(
                $queryBuilder->expr()->eq('active', 0),
                'process_id IN(SELECT q.process_id FROM tx_crawler_queue as q WHERE q.exec_time = 0)'
            )
            ->set('system_process_id', 0)
            ->execute();

        $this->processRepository->markRequestedProcessesAsNotActive($processIds);
        $this->queueRepository->unsetProcessScheduledAndProcessIdForQueueEntries($processIds);

        return true;
    }
1737
1738
    /**
     * Create a unique Id for the current process
     *
     * The id is derived from the current microtime on the first call and kept
     * in $this->processID, so later calls return the same value.
     *
     * @return string  the ID
     */
    public function CLI_buildProcessId()
    {
        if (! $this->processID) {
            // microtime(true) is a float; cast explicitly, the hash helper works on strings
            $this->processID = GeneralUtility::shortMD5((string) microtime(true));
        }
        return $this->processID;
    }
1750
1751
    /**
     * Echoes a message followed by a newline and flushes the output,
     * but only when the "processDebug" extension setting is enabled.
     *
     * @param string $msg the message
     */
    public function CLI_debug($msg): void
    {
        $debugEnabled = (int) $this->extensionSettings['processDebug'] !== 0;
        if (! $debugEnabled) {
            return;
        }

        echo $msg . "\n";
        flush();
    }
1763
1764
    /**
     * Cleans up entries that stayed for too long in the queue. These are:
     * - processed entries older than the "cleanUpProcessedAge" setting (in days)
     * - scheduled entries older than the "cleanUpScheduledAge" setting (in days)
     *
     * @deprecated
     */
    public function cleanUpOldQueueEntries(): void
    {
        // 24*60*60 Seconds in 24 hours
        $secondsPerDay = 86400;

        // Convert the settings explicitly: they end up inside a raw SQL condition, so only
        // integer literals may be concatenated (fractions of a day are still supported).
        $processedAgeInSeconds = (int) ((float) ($this->extensionSettings['cleanUpProcessedAge'] ?? 0) * $secondsPerDay);
        $scheduledAgeInSeconds = (int) ((float) ($this->extensionSettings['cleanUpScheduledAge'] ?? 0) * $secondsPerDay);

        $now = time();
        $condition = '(exec_time<>0 AND exec_time<' . ($now - $processedAgeInSeconds) . ') OR scheduled<=' . ($now - $scheduledAgeInSeconds);
        $this->flushQueue($condition);
    }
1780
1781
    /**
     * Removes queue entries
     *
     * For every distinct set_id among the matching entries, the affected rows
     * (qid, set_id) are emitted through the SIGNAL_QUEUE_ENTRY_FLUSH signal
     * before all matching entries are deleted.
     *
     * @param string $where SQL related filter for the entries which should be removed;
     *                      used as raw SQL, an empty value matches every entry
     *
     * @deprecated
     */
    protected function flushQueue($where = ''): void
    {
        $realWhere = strlen((string) $where) > 0 ? $where : '1=1';

        // Each statement gets its own builder so no query part leaks from one into the next
        $groups = $this->getQueryBuilder($this->tableName)
            ->selectLiteral('DISTINCT set_id')
            ->from($this->tableName)
            ->where($realWhere)
            ->execute()
            ->fetchAll();

        foreach ($groups as $group) {
            $subSetQuery = $this->getQueryBuilder($this->tableName);
            $subSet = $subSetQuery
                ->select('qid', 'set_id')
                ->from($this->tableName)
                ->where(
                    $realWhere,
                    $subSetQuery->expr()->eq(
                        'set_id',
                        $subSetQuery->createNamedParameter((int) $group['set_id'], \PDO::PARAM_INT)
                    )
                )
                ->execute()
                ->fetchAll();

            $payLoad = ['subSet' => $subSet];
            SignalSlotUtility::emitSignal(
                self::class,
                SignalSlotUtility::SIGNAL_QUEUE_ENTRY_FLUSH,
                $payLoad
            );
        }

        $this->getQueryBuilder($this->tableName)
            ->delete($this->tableName)
            ->where($realWhere)
            ->execute();
    }
1826
1827
    /**
     * This method determines duplicates for a queue entry with the same parameters and this timestamp.
     * If the timestamp is in the past, it will check if there is any unprocessed queue entry in the past.
     * If the timestamp is in the future it will check, if the queued entry has exactly the same timestamp
     *
     * @param int $tstamp Scheduled time of the entry to check
     * @param array $fieldArray Must contain the keys "page_id" and "parameters_hash"
     *
     * @return array List of qid values of the matching queue entries
     */
    protected function getDuplicateRowsIfExist($tstamp, $fieldArray)
    {
        // The parameter is untyped but ends up in the SQL condition: force an integer
        $tstamp = (int) $tstamp;
        $currentTime = $this->getCurrentTime();

        $queryBuilder = GeneralUtility::makeInstance(ConnectionPool::class)->getQueryBuilderForTable($this->tableName);
        $queryBuilder
            ->select('qid')
            ->from('tx_crawler_queue');

        if ($tstamp > $currentTime) {
            // entry with a timestamp in the future need to have the same schedule time
            $queryBuilder
                ->where(
                    $queryBuilder->expr()->eq('scheduled', $tstamp)
                );
        } elseif ($this->extensionSettings['enableTimeslot']) {
            // entry is scheduled with "now": accept a window of +/- 100 seconds or anything due already
            $timeBegin = $currentTime - 100;
            $timeEnd = $currentTime + 100;
            $queryBuilder
                ->where(
                    'scheduled BETWEEN ' . $timeBegin . ' AND ' . $timeEnd . ''
                )
                ->orWhere(
                    $queryBuilder->expr()->lte('scheduled', $currentTime)
                );
        } else {
            // entry is scheduled with "now": anything due already counts
            $queryBuilder
                ->where(
                    $queryBuilder->expr()->lte('scheduled', $currentTime)
                );
        }

        $queryBuilder
            ->andWhere('NOT exec_time')
            ->andWhere('NOT process_id')
            ->andWhere($queryBuilder->expr()->eq('page_id', $queryBuilder->createNamedParameter($fieldArray['page_id'], \PDO::PARAM_INT)))
            ->andWhere($queryBuilder->expr()->eq('parameters_hash', $queryBuilder->createNamedParameter($fieldArray['parameters_hash'], \PDO::PARAM_STR)));

        $statement = $queryBuilder->execute();

        $rows = [];
        while ($row = $statement->fetch()) {
            $rows[] = $row['qid'];
        }

        return $rows;
    }
1887
1888
    /**
     * Hashes a configuration array: the keys "paramExpanded" and "URLs" are
     * left out, the rest is serialized and run through md5().
     *
     * @return string
     */
    protected function getConfigurationHash(array $configuration)
    {
        $ignoredKeys = ['paramExpanded' => true, 'URLs' => true];
        $relevantConfiguration = array_diff_key($configuration, $ignoredKeys);

        return md5(serialize($relevantConfiguration));
    }
1899
1900
    /**
     * Builds the URL for a page and a query string by delegating to a new
     * UrlService instance; kept only as a deprecated forwarder.
     *
     * @param int $httpsOrHttp see tx_crawler_configuration.force_ssl
     * @throws \TYPO3\CMS\Core\Exception\SiteNotFoundException
     * @throws \TYPO3\CMS\Core\Routing\InvalidRouteArgumentsException
     *
     * @deprecated Using CrawlerController::getUrlFromPageAndQueryParameters() is deprecated since 9.1.1 and will be removed in v11.x, please use UrlService->getUrlFromPageAndQueryParameters() instead.
     */
    protected function getUrlFromPageAndQueryParameters(int $pageId, string $queryString, ?string $alternativeBaseUrl, int $httpsOrHttp): UriInterface
    {
        return (new UrlService())->getUrlFromPageAndQueryParameters(
            $pageId,
            $queryString,
            $alternativeBaseUrl,
            $httpsOrHttp
        );
    }
1915
1916
    /**
     * Makes sure that element 1 of the given array is not larger than
     * element 2 by exchanging the two values when necessary.
     *
     * @param array $reg Array with the values to order at index 1 and 2
     * @return array The array with index 1 and 2 in ascending order
     */
    protected function swapIfFirstIsLargerThanSecond(array $reg): array
    {
        if ($reg[1] <= $reg[2]) {
            // Already in order
            return $reg;
        }

        [$reg[1], $reg[2]] = [$reg[2], $reg[1]];

        return $reg;
    }
1927
1928
    /**
     * Returns the backend user, taking it from $GLOBALS['BE_USER'] the first
     * time and keeping it in $this->backendUser afterwards.
     *
     * @return BackendUserAuthentication
     */
    private function getBackendUser()
    {
        // Make sure the _cli_ user is loaded
        Bootstrap::initializeBackendAuthentication();

        $this->backendUser = $this->backendUser ?? $GLOBALS['BE_USER'];

        return $this->backendUser;
    }
1940
1941
    /**
     * Fetches a query builder for the given table from the connection pool.
     *
     * @return \TYPO3\CMS\Core\Database\Query\QueryBuilder
     */
    private function getQueryBuilder(string $table)
    {
        $connectionPool = GeneralUtility::makeInstance(ConnectionPool::class);

        return $connectionPool->getQueryBuilderForTable($table);
    }
1950
}
1951