Completed
Push — ci/infection ( c77b24...2591ac )
by Tomas Norre
14:39
created

CrawlerController::getMaximumUrlsToCompile()   A

Complexity

Conditions 1
Paths 1

Size

Total Lines 3
Code Lines 1

Duplication

Lines 0
Ratio 0 %

Code Coverage

Tests 0
CRAP Score 2

Importance

Changes 0
Metric Value
cc 1
eloc 1
nc 1
nop 0
dl 0
loc 3
ccs 0
cts 0
cp 0
crap 2
rs 10
c 0
b 0
f 0
1
<?php
2
3
declare(strict_types=1);
4
5
namespace AOE\Crawler\Controller;
6
7
/***************************************************************
8
 *  Copyright notice
9
 *
10
 *  (c) 2020 AOE GmbH <[email protected]>
11
 *
12
 *  All rights reserved
13
 *
14
 *  This script is part of the TYPO3 project. The TYPO3 project is
15
 *  free software; you can redistribute it and/or modify
16
 *  it under the terms of the GNU General Public License as published by
17
 *  the Free Software Foundation; either version 3 of the License, or
18
 *  (at your option) any later version.
19
 *
20
 *  The GNU General Public License can be found at
21
 *  http://www.gnu.org/copyleft/gpl.html.
22
 *
23
 *  This script is distributed in the hope that it will be useful,
24
 *  but WITHOUT ANY WARRANTY; without even the implied warranty of
25
 *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
26
 *  GNU General Public License for more details.
27
 *
28
 *  This copyright notice MUST APPEAR in all copies of the script!
29
 ***************************************************************/
30
31
use AOE\Crawler\Configuration\ExtensionConfigurationProvider;
32
use AOE\Crawler\Converter\JsonCompatibilityConverter;
33
use AOE\Crawler\Domain\Repository\ConfigurationRepository;
34
use AOE\Crawler\Domain\Repository\ProcessRepository;
35
use AOE\Crawler\Domain\Repository\QueueRepository;
36
use AOE\Crawler\QueueExecutor;
37
use AOE\Crawler\Utility\SignalSlotUtility;
38
use Psr\Http\Message\UriInterface;
39
use Psr\Log\LoggerAwareInterface;
40
use Psr\Log\LoggerAwareTrait;
41
use TYPO3\CMS\Backend\Tree\View\PageTreeView;
42
use TYPO3\CMS\Backend\Utility\BackendUtility;
43
use TYPO3\CMS\Core\Authentication\BackendUserAuthentication;
44
use TYPO3\CMS\Core\Compatibility\PublicMethodDeprecationTrait;
45
use TYPO3\CMS\Core\Core\Bootstrap;
46
use TYPO3\CMS\Core\Core\Environment;
47
use TYPO3\CMS\Core\Database\Connection;
48
use TYPO3\CMS\Core\Database\ConnectionPool;
49
use TYPO3\CMS\Core\Database\Query\Restriction\DeletedRestriction;
50
use TYPO3\CMS\Core\Http\Uri;
51
use TYPO3\CMS\Core\Imaging\Icon;
52
use TYPO3\CMS\Core\Imaging\IconFactory;
53
use TYPO3\CMS\Core\Routing\SiteMatcher;
54
use TYPO3\CMS\Core\Site\Entity\Site;
55
use TYPO3\CMS\Core\Type\Bitmask\Permission;
56
use TYPO3\CMS\Core\TypoScript\Parser\TypoScriptParser;
57
use TYPO3\CMS\Core\Utility\DebugUtility;
58
use TYPO3\CMS\Core\Utility\GeneralUtility;
59
use TYPO3\CMS\Core\Utility\MathUtility;
60
use TYPO3\CMS\Extbase\Object\ObjectManager;
61
use TYPO3\CMS\Frontend\Page\CacheHashCalculator;
62
use TYPO3\CMS\Frontend\Page\PageRepository;
63
64
/**
65
 * Class CrawlerController
66
 *
67
 * @package AOE\Crawler\Controller
68
 */
69
class CrawlerController implements LoggerAwareInterface
70
{
71
    use LoggerAwareTrait;
72
    use PublicMethodDeprecationTrait;
73
74
    public const CLI_STATUS_NOTHING_PROCCESSED = 0;
75
76
    public const CLI_STATUS_REMAIN = 1; //queue not empty
77
78
    public const CLI_STATUS_PROCESSED = 2; //(some) queue items where processed
79
80
    public const CLI_STATUS_ABORTED = 4; //instance didn't finish
81
82
    public const CLI_STATUS_POLLABLE_PROCESSED = 8;
83
84
    /**
85
     * @var integer
86
     */
87
    public $setID = 0;
88
89
    /**
90
     * @var string
91
     */
92
    public $processID = '';
93
94
    /**
95
     * @var array
96
     */
97
    public $duplicateTrack = [];
98
99
    /**
100
     * @var array
101
     */
102
    public $downloadUrls = [];
103
104
    /**
105
     * @var array
106
     */
107
    public $incomingProcInstructions = [];
108
109
    /**
110
     * @var array
111
     */
112
    public $incomingConfigurationSelection = [];
113
114
    /**
115
     * @var bool
116
     */
117
    public $registerQueueEntriesInternallyOnly = false;
118
119
    /**
120
     * @var array
121
     */
122
    public $queueEntries = [];
123
124
    /**
125
     * @var array
126
     */
127
    public $urlList = [];
128
129
    /**
130
     * @var array
131
     */
132
    public $extensionSettings = [];
133
134
    /**
135
     * Mount Point
136
     *
137
     * @var bool
138
     * Todo: Check what this is used for and adjust the type hint or code, as bool doesn't match the current code.
139
     */
140
    public $MP = false;
141
142
    /**
143
     * @var string
144
     */
145
    protected $processFilename;
146
147
    /**
148
     * Holds the internal access mode can be 'gui','cli' or 'cli_im'
149
     *
150
     * @var string
151
     */
152
    protected $accessMode;
153
154
    /**
155
     * @var QueueRepository
156
     */
157
    protected $queueRepository;
158
159
    /**
160
     * @var ProcessRepository
161
     */
162
    protected $processRepository;
163
164
    /**
165
     * @var ConfigurationRepository
166
     */
167
    protected $configurationRepository;
168
169
    /**
170
     * @var string
171
     */
172
    protected $tableName = 'tx_crawler_queue';
173
174
    /**
175
     * @var QueueExecutor
176
     */
177
    protected $queueExecutor;
178
179
    /**
180
     * @var int
181
     */
182
    protected $maximumUrlsToCompile = 10000;
183
184
    /**
185
     * @var IconFactory
186
     */
187
    protected $iconFactory;
188
189
    /**
190
     * @var string[]
191
     */
192
    private $deprecatedPublicMethods = [
0 ignored issues
show
introduced by
The private property $deprecatedPublicMethods is not used, and could be removed.
Loading history...
193
        'getLogEntriesForSetId' => 'Using crawlerController::getLogEntriesForSetId() is deprecated since 9.0.1 and will be removed in v11.x',
194
        'flushQueue' => 'Using CrawlerController::flushQueue() is deprecated since 9.0.1 and will be removed in v11.x, please use QueueRepository->flushQueue() instead.',
195
        'cleanUpOldQueueEntries' => 'Using CrawlerController::cleanUpOldQueueEntries() is deprecated since 9.0.1 and will be removed in v11.x, please use QueueRepository->cleanUpOldQueueEntries() instead.',
196
    ];
197
198
    /**
199
     * @var BackendUserAuthentication|null
200
     */
201
    private $backendUser;
202
203
    /**
204
     * @var integer
205
     */
206
    private $scheduledTime = 0;
207
208
    /**
209
     * @var integer
210
     */
211
    private $reqMinute = 0;
212
213
    /**
214
     * @var bool
215
     */
216
    private $submitCrawlUrls = false;
217
218
    /**
219
     * @var bool
220
     */
221
    private $downloadCrawlUrls = false;
222
223
    /************************************
224
     *
225
     * Getting URLs based on Page TSconfig
226
     *
227
     ************************************/
228
229 37
    /**
     * Resolves the repositories and helpers used by the controller and loads
     * the extension settings.
     *
     * "countInARun" is set to 100 when it does not convert to a positive integer;
     * "processLimit" and "maxCompileUrls" are passed through
     * MathUtility::forceIntegerInRange() with the bounds given below.
     */
    public function __construct()
    {
        $objectManager = GeneralUtility::makeInstance(ObjectManager::class);
        $this->queueRepository = $objectManager->get(QueueRepository::class);
        $this->processRepository = $objectManager->get(ProcessRepository::class);
        $this->configurationRepository = $objectManager->get(ConfigurationRepository::class);
        $this->queueExecutor = $objectManager->get(QueueExecutor::class);
        $this->iconFactory = GeneralUtility::makeInstance(IconFactory::class);

        $this->processFilename = Environment::getVarPath() . '/lock/tx_crawler.proc';

        /** @var ExtensionConfigurationProvider $configurationProvider */
        $configurationProvider = GeneralUtility::makeInstance(ExtensionConfigurationProvider::class);
        $settings = $configurationProvider->getExtensionConfiguration();
        $this->extensionSettings = is_array($settings) ? $settings : [];

        // Optional keys are read with "??" so an incomplete configuration does not
        // raise "undefined index" notices before the defaults below are applied.
        if (MathUtility::convertToPositiveInteger($this->extensionSettings['countInARun'] ?? 0) === 0) {
            $this->extensionSettings['countInARun'] = 100;
        }

        $this->extensionSettings['processLimit'] = MathUtility::forceIntegerInRange(
            $this->extensionSettings['processLimit'] ?? 0,
            1,
            99,
            1
        );
        $this->maximumUrlsToCompile = MathUtility::forceIntegerInRange(
            $this->extensionSettings['maxCompileUrls'] ?? 0,
            1,
            1000000000,
            10000
        );
    }
253
254
    /**
     * Returns the currently configured upper limit of URLs to compile.
     */
    public function getMaximumUrlsToCompile(): int
    {
        return $this->maximumUrlsToCompile;
    }
258
259 1
    /**
     * Overrides the upper limit of URLs to compile.
     *
     * The value is stored as given; no range check is applied here.
     */
    public function setMaximumUrlsToCompile(int $maximumUrlsToCompile): void
    {
        $this->maximumUrlsToCompile = $maximumUrlsToCompile;
    }
263
264
    /**
     * Returns the internal access mode ('gui', 'cli' or 'cli_im').
     *
     * @return string
     */
    public function getAccessMode()
    {
        return $this->accessMode;
    }
273
274
    /**
     * Sets the internal access mode.
     *
     * @param string $accessMode One of 'gui', 'cli' or 'cli_im'; the value is not validated here
     */
    public function setAccessMode($accessMode): void
    {
        $this->accessMode = $accessMode;
    }
281 1
282 1
    /**
     * Switches the "disabled" marker on or off.
     *
     * The marker is an empty file at the process filename: it is written when
     * disabling and removed (if present) when enabling.
     *
     * @param bool $disabled (optional, defaults to true)
     */
    public function setDisabled($disabled = true): void
    {
        if ($disabled) {
            GeneralUtility::writeFile($this->processFilename, '');

            return;
        }

        if (is_file($this->processFilename)) {
            unlink($this->processFilename);
        }
    }
295
296
    /**
     * Reports whether the "disabled" marker file currently exists.
     *
     * @return bool true if disabled
     */
    public function getDisabled()
    {
        return is_file($this->processFilename);
    }
305
306
    /**
     * Sets the path of the file used as the "disabled" marker.
     *
     * @param string $filenameWithPath
     */
    public function setProcessFilename($filenameWithPath): void
    {
        $this->processFilename = $filenameWithPath;
    }
313
314
    /**
     * Returns the path of the file used as the "disabled" marker.
     *
     * @return string
     */
    public function getProcessFilename()
    {
        return $this->processFilename;
    }
321
322
    /**
     * Replaces the extension settings held by this controller.
     *
     * The array is stored as given; the defaults applied in the constructor
     * are not re-applied here.
     */
    public function setExtensionSettings(array $extensionSettings): void
    {
        $this->extensionSettings = $extensionSettings;
    }
329
330
    /**
     * Decides whether the given page must be left out of crawling.
     *
     * The checks run in this order and stop at the first hit:
     *  1. hidden pages, unless the "crawlHiddenPages" setting is enabled
     *  2. the fixed doktype list 3,4,199,254,255
     *  3. doktypes registered in EXTCONF crawler "excludeDoktype"
     *  4. the EXTCONF crawler "pageVeto" hooks
     *
     * @param array $pageRow Page record; the "hidden" and "doktype" columns are read
     * @return false|string false if the page should be crawled, otherwise the reason why it is skipped
     */
    public function checkIfPageShouldBeSkipped(array $pageRow)
    {
        // empty() avoids notices when the setting or the column is missing
        if (empty($this->extensionSettings['crawlHiddenPages']) && ! empty($pageRow['hidden'])) {
            return 'Because page is hidden';
        }

        $doktype = $pageRow['doktype'];
        if (GeneralUtility::inList('3,4,199,254,255', $doktype)) {
            return 'Because doktype is not allowed';
        }

        foreach ($GLOBALS['TYPO3_CONF_VARS']['EXTCONF']['crawler']['excludeDoktype'] ?? [] as $key => $doktypeList) {
            if (GeneralUtility::inList($doktypeList, $doktype)) {
                return 'Doktype was excluded by "' . $key . '"';
            }
        }

        // veto hook
        foreach ($GLOBALS['TYPO3_CONF_VARS']['EXTCONF']['crawler']['pageVeto'] ?? [] as $key => $func) {
            $params = [
                'pageRow' => $pageRow,
            ];
            // expects "false" if page is ok and "true" or a skipMessage if this page should _not_ be crawled
            $veto = GeneralUtility::callUserFunction($func, $params, $this);
            if ($veto === false) {
                continue;
            }

            // The first veto wins, later hooks are not executed.
            // The key is cast because hooks registered without an explicit name have
            // integer keys, which htmlspecialchars() rejects under strict_types.
            return is_string($veto)
                ? $veto
                : 'Veto from hook "' . htmlspecialchars((string) $key) . '"';
        }

        return false;
    }
388 2
389
    /**
     * Wrapper method for getUrlsForPageId() that first applies the skip checks.
     * It returns an array of configurations and no urls!
     *
     * @param array $pageRow Page record with at least dok-type and uid columns.
     * @param string $skipMessage Set by reference: the skip reason, or '' when the page is not skipped
     * @return array Configurations for the page, or an empty array when the page is skipped
     * @see getUrlsForPageId()
     */
    public function getUrlsForPageRow(array $pageRow, &$skipMessage = '')
    {
        $message = $this->checkIfPageShouldBeSkipped($pageRow);
        if ($message !== false) {
            $skipMessage = $message;

            return [];
        }

        // getUrlsForPageId() is declared with an int parameter and this file uses
        // strict_types, so a uid delivered as numeric string must be cast first.
        $res = $this->getUrlsForPageId((int) $pageRow['uid']);
        $skipMessage = '';

        return $res;
    }
411
412
    /**
     * Creates a list of URLs from input array (and submits them to queue if asked for)
     * See Web > Info module script + "indexed_search"'s crawler hook-client using this!
     *
     * @param array $vv Information about URLs from pageRow to crawl.
     * @param array $pageRow Page row
     * @param int $scheduledTime Unix time to schedule indexing to, typically time()
     * @param int $reqMinute Number of requests per minute (creates the interleave between requests); 0 disables the interleave
     * @param bool $submitCrawlUrls If set, submits the URLs to queue
     * @param bool $downloadCrawlUrls If set (and $submitCrawlUrls is false) will fill $downloadUrls with entries
     * @param array $duplicateTrack Array which is passed by reference and contains an id per url to ensure duplicates are not crawled
     * @param array $downloadUrls Array which will be filled with URLs for download if flag is set.
     * @param array $incomingProcInstructions Array of processing instructions
     * @return string List of URLs (meant for display in backend module)
     */
    public function urlListFromUrlArray(
        array $vv,
        array $pageRow,
        $scheduledTime,
        $reqMinute,
        $submitCrawlUrls,
        $downloadCrawlUrls,
        array &$duplicateTrack,
        array &$downloadUrls,
        array $incomingProcInstructions
    ) {
        if (! is_array($vv['URLs'] ?? null)) {
            return 'ERROR - no URL generated';
        }
        $urlLog = [];
        $pageId = (int) $pageRow['uid'];
        $configurationHash = $this->getConfigurationHash($vv);
        $skipInnerCheck = $this->queueRepository->noUnprocessedQueueEntriesForPageWithConfigurationHashExist($pageId, $configurationHash);

        // The filter does not depend on the individual URL, so it is evaluated once.
        // A rejected configuration yields an empty list, as before.
        $procInstrFilter = $vv['subCfg']['procInstrFilter'] ?? '';
        if (! $this->drawURLs_PIfilter($procInstrFilter, $incomingProcInstructions)) {
            return '';
        }

        // Seconds between two scheduled requests; guards against a division by zero.
        $secondsPerRequest = empty($reqMinute) ? 0 : 60 / $reqMinute;

        foreach ($vv['URLs'] as $urlQuery) {
            $url = (string) $this->getUrlFromPageAndQueryParameters(
                $pageId,
                $urlQuery,
                $vv['subCfg']['baseUrl'] ?? null,
                $vv['subCfg']['force_ssl'] ?? 0
            );

            // Create key by which to determine unique-ness:
            $uKey = $url . '|' . ($vv['subCfg']['userGroups'] ?? '') . '|' . $procInstrFilter;

            if (isset($duplicateTrack[$uKey])) {
                // if the url key is registered just display it and do not resubmit it
                $urlLog[] = '<em><span class="text-muted">' . htmlspecialchars($url) . '</span></em>';
            } else {
                // Scheduled time, rounded down to the full minute:
                $schTime = $scheduledTime + round(count($duplicateTrack) * $secondsPerRequest);
                $schTime = intval($schTime / 60) * 60;
                $formattedDate = BackendUtility::datetime($schTime);
                $this->urlList[] = '[' . $formattedDate . '] ' . $url;
                $urlList = '[' . $formattedDate . '] ' . htmlspecialchars($url);

                // Submit for crawling!
                if ($submitCrawlUrls) {
                    // NOTE(review): addUrl() receives $scheduledTime, not the interleaved
                    // $schTime shown in the list - confirm this is intended.
                    $added = $this->addUrl(
                        $pageId,
                        $url,
                        $vv['subCfg'],
                        $scheduledTime,
                        $configurationHash,
                        $skipInnerCheck
                    );
                    if ($added === false) {
                        $urlList .= ' (URL already existed)';
                    }
                } elseif ($downloadCrawlUrls) {
                    $downloadUrls[$url] = $url;
                }
                $urlLog[] = $urlList;
            }
            $duplicateTrack[$uKey] = true;
        }

        return implode('<br>', $urlLog);
    }
494 5
495 1
    /**
     * Tells whether a processing instruction list passes the incoming filter.
     *
     * An empty filter accepts everything; otherwise at least one of the incoming
     * instructions must be contained in the comma-separated $piString.
     *
     * @param string $piString PI to test
     * @param array $incomingProcInstructions Processing instructions
     * @return bool
     */
    public function drawURLs_PIfilter($piString, array $incomingProcInstructions)
    {
        // Starts out as "accepted" only when there is nothing to filter by
        $accepted = $incomingProcInstructions === [];

        foreach ($incomingProcInstructions as $instruction) {
            if (GeneralUtility::inList($piString, $instruction)) {
                $accepted = true;
                break;
            }
        }

        return $accepted;
    }
515
516
    /**
     * Returns the page TSconfig for the given page, or for the mount point page
     * when $this->MP is set, after passing it through the registered
     * EXTCONF crawler "getPageTSconfigForId" hooks.
     *
     * @param int $id Page ID
     */
    public function getPageTSconfigForId($id): array
    {
        if (! $this->MP) {
            $pageTSconfig = BackendUtility::getPagesTSconfig($id);
        } else {
            // The page ID is taken from the part after the "-" in $this->MP.
            // The cast keeps explode() working under strict_types, and a value
            // without "-" falls back to 0 instead of raising an undefined offset.
            // TODO: $MP is documented as bool, confirm the real value format.
            $mountPointParts = explode('-', (string) $this->MP);
            $pageTSconfig = BackendUtility::getPagesTSconfig((int) ($mountPointParts[1] ?? 0));
        }

        // Call a hook to alter configuration
        $hooks = $GLOBALS['TYPO3_CONF_VARS']['EXTCONF']['crawler']['getPageTSconfigForId'] ?? [];
        if (is_array($hooks)) {
            $params = [
                'pageId' => $id,
                'pageTSConfig' => &$pageTSconfig,
            ];
            foreach ($hooks as $userFunc) {
                GeneralUtility::callUserFunction($userFunc, $params, $this);
            }
        }

        return $pageTSconfig;
    }
538
539
    /**
     * This method returns an array of configurations, keyed by configuration name.
     * Adds no urls!
     *
     * Configurations are collected from two sources, in this order:
     *  1. page TSconfig (tx_crawler.crawlerCfg.paramSets)
     *  2. tx_crawler_configuration records found up the rootline; a record never
     *     overwrites a page TSconfig entry with the same name
     * Afterwards the EXTCONF crawler "processUrls" hooks may alter the result.
     *
     * @param int $pageId Page to collect the configurations for
     * @return array Per key: 'subCfg', 'paramParsed', 'paramExpanded', 'URLs' and 'origin'
     */
    public function getUrlsForPageId(int $pageId): array
    {
        // Get page TSconfig for page ID
        $pageTSconfig = $this->getPageTSconfigForId($pageId);

        $res = [];

        // Fetch Crawler Configuration from pageTSconfig
        $crawlerCfg = $pageTSconfig['tx_crawler.']['crawlerCfg.']['paramSets.'] ?? [];
        foreach ($crawlerCfg as $key => $values) {
            if (! is_array($values)) {
                continue;
            }
            $key = str_replace('.', '', $key);
            // Sub configuration for a single configuration string:
            $subCfg = (array) ($crawlerCfg[$key . '.'] ?? []);
            $subCfg['key'] = $key;

            if (strcmp($subCfg['procInstrFilter'] ?? '', '')) {
                $subCfg['procInstrFilter'] = implode(',', GeneralUtility::trimExplode(',', $subCfg['procInstrFilter']));
            }

            // "pidsOnly" is optional; read it once without raising a notice
            $pidsOnly = (string) ($subCfg['pidsOnly'] ?? '');
            $pidOnlyList = implode(',', GeneralUtility::trimExplode(',', $pidsOnly, true));

            // process configuration if it is not page-specific or if the specific page is the current page:
            // TODO: Check if $pidOnlyList can be kept as Array instead of imploded
            if ($pidsOnly !== '' && ! GeneralUtility::inList($pidOnlyList, strval($pageId))) {
                continue;
            }

            // Explode, process etc.:
            $res[$key] = [];
            $res[$key]['subCfg'] = $subCfg;
            // The parameter string lives under the key without trailing dot and may be absent
            $res[$key]['paramParsed'] = GeneralUtility::explodeUrl2Array($crawlerCfg[$key] ?? '');
            $res[$key]['paramExpanded'] = $this->expandParameters($res[$key]['paramParsed'], $pageId);
            $res[$key]['origin'] = 'pagets';

            // recognize MP value
            $baseQuery = '?id=' . $pageId;
            if ($this->MP) {
                $baseQuery .= '&MP=' . $this->MP;
            }
            $res[$key]['URLs'] = $this->compileUrls($res[$key]['paramExpanded'], [$baseQuery]);
        }

        // Get configuration from tx_crawler_configuration records up the rootline
        $crawlerConfigurations = $this->configurationRepository->getCrawlerConfigurationRecordsFromRootLine($pageId);
        foreach ($crawlerConfigurations as $configurationRecord) {
            // check access to the configuration record
            $hasAccess = empty($configurationRecord['begroups'])
                || $this->getBackendUser()->isAdmin()
                || $this->hasGroupAccess($this->getBackendUser()->user['usergroup_cached_list'], $configurationRecord['begroups']);
            if (! $hasAccess) {
                continue;
            }

            // Cast like in the page TSconfig branch: strcmp() rejects null under strict_types
            $pidsOnly = (string) $configurationRecord['pidsonly'];
            $pidOnlyList = implode(',', GeneralUtility::trimExplode(',', $pidsOnly, true));

            // process configuration if it is not page-specific or if the specific page is the current page:
            // TODO: Check if $pidOnlyList can be kept as Array instead of imploded
            if ($pidsOnly !== '' && ! GeneralUtility::inList($pidOnlyList, strval($pageId))) {
                continue;
            }

            $key = $configurationRecord['name'];

            // don't overwrite previously defined paramSets
            if (isset($res[$key])) {
                continue;
            }

            /* @var $TSparserObject \TYPO3\CMS\Core\TypoScript\Parser\TypoScriptParser */
            $TSparserObject = GeneralUtility::makeInstance(TypoScriptParser::class);
            $TSparserObject->parse($configurationRecord['processing_instruction_parameters_ts']);

            $subCfg = [
                'procInstrFilter' => $configurationRecord['processing_instruction_filter'],
                'procInstrParams.' => $TSparserObject->setup,
                'baseUrl' => $configurationRecord['base_url'],
                'force_ssl' => (int) $configurationRecord['force_ssl'],
                'userGroups' => $configurationRecord['fegroups'],
                'exclude' => $configurationRecord['exclude'],
                'key' => $key,
            ];

            if (! in_array($pageId, $this->expandExcludeString($subCfg['exclude']), true)) {
                $res[$key] = [];
                $res[$key]['subCfg'] = $subCfg;
                $res[$key]['paramParsed'] = GeneralUtility::explodeUrl2Array($configurationRecord['configuration']);
                $res[$key]['paramExpanded'] = $this->expandParameters($res[$key]['paramParsed'], $pageId);
                $res[$key]['URLs'] = $this->compileUrls($res[$key]['paramExpanded'], ['?id=' . $pageId]);
                $res[$key]['origin'] = 'tx_crawler_configuration_' . $configurationRecord['uid'];
            }
        }

        foreach ($GLOBALS['TYPO3_CONF_VARS']['EXTCONF']['crawler']['processUrls'] ?? [] as $func) {
            $params = [
                'res' => &$res,
            ];
            GeneralUtility::callUserFunction($func, $params, $this);
        }

        return $res;
    }
637 1
638
    /**
     * Collects the names of all crawler configurations that apply to a page branch.
     *
     * The names come from the page TSconfig paramSets of the root page and from
     * tx_crawler_configuration records stored on the rootline pages or on pages
     * of the subtree down to the given depth.
     * TODO: Write Functional Tests
     */
    public function getConfigurationsForBranch(int $rootid, int $depth): array
    {
        $names = [];

        // Names of the paramSets defined in page TSconfig (sub arrays only, trailing dot removed)
        $rootTSconfig = $this->getPageTSconfigForId($rootid);
        $paramSets = $rootTSconfig['tx_crawler.']['crawlerCfg.']['paramSets.'] ?? [];
        foreach ($paramSets as $setKey => $setValue) {
            if (is_array($setValue)) {
                $names[] = substr($setKey, -1) === '.' ? substr($setKey, 0, -1) : $setKey;
            }
        }

        // Page IDs of the rootline ...
        $pageIds = [];
        foreach (BackendUtility::BEgetRootLine($rootid) as $rootLinePage) {
            $pageIds[] = $rootLinePage['uid'];
        }

        // ... and of the subtree the backend user is allowed to see
        /* @var PageTreeView $pageTree */
        $pageTree = GeneralUtility::makeInstance(PageTreeView::class);
        $permissionClause = $this->getBackendUser()->getPagePermsClause(Permission::PAGE_SHOW);
        $pageTree->init(empty($permissionClause) ? '' : ('AND ' . $permissionClause));
        $pageTree->getTree($rootid, $depth, '');
        foreach ($pageTree->tree as $treeNode) {
            $pageIds[] = $treeNode['row']['uid'];
        }

        // Names of the configuration records stored on any of these pages
        $queryBuilder = $this->getQueryBuilder('tx_crawler_configuration');
        $statement = $queryBuilder
            ->select('name')
            ->from('tx_crawler_configuration')
            ->where(
                $queryBuilder->expr()->in('pid', $queryBuilder->createNamedParameter($pageIds, Connection::PARAM_INT_ARRAY))
            )
            ->execute();

        while ($record = $statement->fetch()) {
            $names[] = $record['name'];
        }

        return $names;
    }
681 3
682
    /**
     * Check if a user has access to an item
     * (e.g. get the group list of the current logged in user from $GLOBALS['TSFE']->gr_list)
     *
     * @param string $groupList Comma-separated list of (fe_)group UIDs from a user
     * @param string $accessList Comma-separated list of (fe_)group UIDs of the item to access
     * @return bool TRUE if at least one of the users group UIDs is in the access list or the access list is empty
     * @see \TYPO3\CMS\Frontend\Page\PageRepository::getMultipleGroupsWhereClause()
     */
    public function hasGroupAccess($groupList, $accessList)
    {
        if (! empty($accessList)) {
            $userGroupUids = GeneralUtility::intExplode(',', $groupList);
            foreach ($userGroupUids as $userGroupUid) {
                if (GeneralUtility::inList($accessList, $userGroupUid)) {
                    return true;
                }
            }

            return false;
        }

        // An item without access restriction is available to everybody
        return true;
    }
703
704
    /**
705
     * Will expand the parameters configuration to individual values. This follows a certain syntax of the value of each parameter.
706
     * Syntax of values:
707
     * - Basically: If the value is wrapped in [...] it will be expanded according to the following syntax, otherwise the value is taken literally
708
     * - Configuration is splitted by "|" and the parts are processed individually and finally added together
709
     * - For each configuration part:
710
     *         - "[int]-[int]" = Integer range, will be expanded to all values in between, values included, starting from low to high (max. 1000). Example "1-34" or "-40--30"
711 7
     *         - "_TABLE:[TCA table name];[_PID:[optional page id, default is current page]];[_ENABLELANG:1]" = Look up of table records from PID, filtering out deleted records. Example "_TABLE:tt_content; _PID:123"
712
     *        _ENABLELANG:1 picks only original records without their language overlays
713
     *         - Default: Literal value
714 7
     *
715 7
     * @param array $paramArray Array with key (GET var name) and values (value of GET var which is configuration for expansion)
716
     * @param integer $pid Current page ID
717
     * @return array
718 7
     *
719
     * TODO: Write Functional Tests
720 7
     */
721 7
    public function expandParameters($paramArray, $pid)
    {
        // Traverse parameter names:
        foreach ($paramArray as $p => $v) {
            $v = trim($v);

            // Only values wrapped in [...] are expanded; everything else is kept as a single literal value.
            if (strpos($v, '[') !== 0 || substr($v, -1) !== ']') {
                $paramArray[$p] = [$v];
                continue;
            }

            // Strip the brackets and reset the paramArray value as an array.
            // The cast guards against substr() returning false (e.g. for "[]" on PHP 7), which explode() would reject.
            $v = (string) substr($v, 1, -1);
            $paramArray[$p] = [];

            // Explode parts and traverse them:
            foreach (explode('|', $v) as $pV) {
                // Look for integer range: (fx. 1-34 or -40--30 // reads minus 40 to minus 30)
                if (preg_match('/^(-?[0-9]+)\s*-\s*(-?[0-9]+)$/', trim($pV), $reg)) {
                    $reg = $this->swapIfFirstIsLargerThanSecond($reg);

                    // Traverse range, add values:
                    $runAwayBrake = 1000; // Limit to size of range!
                    for ($a = $reg[1]; $a <= $reg[2]; $a++) {
                        $paramArray[$p][] = $a;
                        $runAwayBrake--;
                        if ($runAwayBrake <= 0) {
                            break;
                        }
                    }
                } elseif (strpos(trim($pV), '_TABLE:') === 0) {
                    // Parse the ";"-separated "KEY:value" sub parts:
                    $subpartParams = [];
                    foreach (GeneralUtility::trimExplode(';', $pV) as $spV) {
                        $keyAndValue = GeneralUtility::trimExplode(':', $spV);
                        // A sub part without ":" has no value; store null so the isset() checks below treat it as absent.
                        $subpartParams[$keyAndValue[0]] = $keyAndValue[1] ?? null;
                    }

                    $tableName = $subpartParams['_TABLE'] ?? '';

                    // Table exists:
                    if (isset($GLOBALS['TCA'][$tableName])) {
                        $lookUpPid = isset($subpartParams['_PID']) ? (int) $subpartParams['_PID'] : (int) $pid;
                        $recursiveDepth = isset($subpartParams['_RECURSIVE']) ? (int) $subpartParams['_RECURSIVE'] : 0;
                        $pidField = isset($subpartParams['_PIDFIELD']) ? trim($subpartParams['_PIDFIELD']) : 'pid';
                        // NOTE(review): _WHERE and _ADDTABLE are added to the query as raw SQL taken from the
                        // parameter configuration - presumably only trusted backend users can edit it; confirm.
                        $where = $subpartParams['_WHERE'] ?? '';
                        $addTable = $subpartParams['_ADDTABLE'] ?? '';

                        // _FIELD is optional; an absent or empty value falls back to "uid".
                        $fieldName = ! empty($subpartParams['_FIELD']) ? $subpartParams['_FIELD'] : 'uid';
                        if ($fieldName === 'uid' || ! empty($GLOBALS['TCA'][$tableName]['columns'][$fieldName])) {
                            $queryBuilder = $this->getQueryBuilder($tableName);

                            if ($recursiveDepth > 0) {
                                /** @var \TYPO3\CMS\Core\Database\QueryGenerator $queryGenerator */
                                $queryGenerator = GeneralUtility::makeInstance(\TYPO3\CMS\Core\Database\QueryGenerator::class);
                                $pidList = $queryGenerator->getTreeList($lookUpPid, $recursiveDepth, 0, 1);
                                $pidArray = GeneralUtility::intExplode(',', $pidList);
                            } else {
                                $pidArray = [(string) $lookUpPid];
                            }

                            // Only the "deleted" restriction is applied.
                            $queryBuilder->getRestrictions()
                                ->removeAll()
                                ->add(GeneralUtility::makeInstance(DeletedRestriction::class));

                            $queryBuilder
                                ->select($fieldName)
                                ->from($tableName)
                                ->where(
                                    $queryBuilder->expr()->in($pidField, $queryBuilder->createNamedParameter($pidArray, Connection::PARAM_INT_ARRAY)),
                                    $where
                                );

                            if (! empty($addTable)) {
                                // TODO: Check if this works as intended!
                                $queryBuilder->add('from', $addTable);
                            }

                            // Not every table defines a translation pointer field.
                            $transOrigPointerField = $GLOBALS['TCA'][$tableName]['ctrl']['transOrigPointerField'] ?? null;

                            // _ENABLELANG restricts the result to records whose translation pointer is <= 0.
                            if (! empty($subpartParams['_ENABLELANG']) && $transOrigPointerField) {
                                $queryBuilder->andWhere(
                                    $queryBuilder->expr()->lte(
                                        $transOrigPointerField,
                                        0
                                    )
                                );
                            }

                            $statement = $queryBuilder->execute();

                            // Index by field value so that duplicate values collapse into one entry.
                            $rows = [];
                            while ($row = $statement->fetch()) {
                                $rows[$row[$fieldName]] = $row;
                            }

                            $paramArray[$p] = array_merge($paramArray[$p], array_keys($rows));
                        }
                    }
                } else {
                    // Just add value:
                    $paramArray[$p][] = $pV;
                }

                // Hook for processing own expandParameters place holder (the registration is optional).
                $expandParametersHooks = $GLOBALS['TYPO3_CONF_VARS']['SC_OPTIONS']['crawler/class.tx_crawler_lib.php']['expandParameters'] ?? null;
                if (is_array($expandParametersHooks)) {
                    $_params = [
                        'pObj' => &$this,
                        'paramArray' => &$paramArray,
                        'currentKey' => $p,
                        'currentValue' => $pV,
                        'pid' => $pid,
                    ];
                    foreach ($expandParametersHooks as $_funcRef) {
                        GeneralUtility::callUserFunction($_funcRef, $_params, $this);
                    }
                }
            }

            // Make unique set of values and sort array by key:
            $paramArray[$p] = array_unique($paramArray[$p]);
            ksort($paramArray);
        }

        return $paramArray;
    }
849
850 3
    /**
851 3
     * Compiling URLs from parameter array (output of expandParameters())
852
     * The number of URLs will be the multiplication of the number of parameter values for each key
853
     *
854 2
     * @param array $paramArray Output of expandParameters(): Array with keys (GET var names) and for each an array of values
855 2
     * @param array $urls URLs accumulated in this array (for recursion)
856 2
     * @return array
857
     */
858
    public function compileUrls($paramArray, array $urls)
    {
        // Recursion ends once every parameter has been merged into the URLs.
        if (empty($paramArray)) {
            return $urls;
        }

        // Take the first parameter (name plus value set) off the stack.
        reset($paramArray);
        $parameterName = key($paramArray);
        $parameterValues = array_shift($paramArray);

        // Combine each URL with each value, never collecting more than the configured maximum.
        $compiledUrls = [];
        foreach ($urls as $baseUrl) {
            foreach ($parameterValues as $value) {
                if (count($compiledUrls) >= $this->maximumUrlsToCompile) {
                    // Limit reached: nothing more can be added, so stop both loops.
                    break 2;
                }

                // Empty values leave the URL untouched instead of adding "&name=".
                $queryPart = '';
                if ((string) $value !== '') {
                    $queryPart = '&' . rawurlencode($parameterName) . '=' . rawurlencode((string) $value);
                }

                $compiledUrls[] = $baseUrl . $queryPart;
            }
        }

        return $this->compileUrls($paramArray, $compiledUrls);
    }
879
880
    /************************************
881
     *
882
     * Crawler log
883
     *
884
     ************************************/
885
886
    /**
887
     * Return array of records from crawler queue for input page ID
888 4
     *
889
     * @param integer $id Page ID for which to look up log entries.
890 4
     * @param string $filter Filter: "all" => all entries, "pending" => all that is not yet run, "finished" => all complete ones
891
     * @param boolean $doFlush If TRUE, queue entries are flushed (DELETED!) via $this->queueRepository->flushQueue() before the result is read
892 4
     * @param boolean $doFullFlush
893 4
     * @param integer $itemsPerPage Limit the amount of entries per page default is 10
894 4
     * @return array
895 4
     */
896
    public function getLogEntriesForPageId($id, $filter = '', $doFlush = false, $doFullFlush = false, $itemsPerPage = 10)
    {
        // Select all queue entries of the page, newest scheduled first.
        $queryBuilder = GeneralUtility::makeInstance(ConnectionPool::class)->getQueryBuilderForTable($this->tableName);
        $queryBuilder
            ->select('*')
            ->from($this->tableName)
            ->where(
                $queryBuilder->expr()->eq('page_id', $queryBuilder->createNamedParameter($id, \PDO::PARAM_INT))
            )
            ->orderBy('scheduled', 'DESC');

        // Narrow the selection by execution state; any other filter value selects all entries.
        switch ($filter) {
            case 'pending':
                $queryBuilder->andWhere($queryBuilder->expr()->eq('exec_time', 0));
                break;
            case 'finished':
                $queryBuilder->andWhere($queryBuilder->expr()->gt('exec_time', 0));
                break;
        }

        if ($doFlush) {
            // NOTE(review): flushQueue() only receives the filter, not the page id - confirm that
            // flushing is really meant to be independent of $id.
            if ($doFullFlush) {
                $this->queueRepository->flushQueue('all');
            } else {
                $this->queueRepository->flushQueue($filter);
            }
        }

        // A non-positive $itemsPerPage disables the limit.
        if ($itemsPerPage > 0) {
            $queryBuilder
                ->setMaxResults((int) $itemsPerPage);
        }

        return $queryBuilder->execute()->fetchAll();
    }
937
938
    /**
939
     * Return array of records from crawler queue for input set ID
940
     *
941 6
     * @param int $set_id Set ID for which to look up log entries.
942
     * @param string $filter Filter: "all" => all entries, "pending" => all that is not yet run, "finished" => all complete ones
943 6
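     * @param bool $doFlush If TRUE, the entries are DELETED(!) via flushQueue() instead of selected, and an empty array is returned
     * @param bool $doFullFlush If TRUE (together with $doFlush), flushQueue() is called with an empty condition instead of the set/filter condition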
944
     * @param int $itemsPerPage Limit the amount of entries per page default is 10
945 6
     * @return array
946 6
     *
947 6
     * @deprecated
948 6
     */
949
    public function getLogEntriesForSetId(int $set_id, string $filter = '', bool $doFlush = false, bool $doFullFlush = false, int $itemsPerPage = 10)
    {
        // Select all queue entries of the set, newest scheduled first.
        $queryBuilder = GeneralUtility::makeInstance(ConnectionPool::class)->getQueryBuilderForTable($this->tableName);
        $queryBuilder
            ->select('*')
            ->from($this->tableName)
            ->where(
                $queryBuilder->expr()->eq('set_id', $queryBuilder->createNamedParameter($set_id, \PDO::PARAM_INT))
            )
            ->orderBy('scheduled', 'DESC');

        // The same constraints are collected a second time in an AND expression,
        // which is handed to flushQueue() when flushing is requested.
        $expressionBuilder = GeneralUtility::makeInstance(ConnectionPool::class)
            ->getConnectionForTable($this->tableName)
            ->getExpressionBuilder();
        $flushConstraints = $expressionBuilder->andX();

        if ($filter === 'pending') {
            $queryBuilder->andWhere($queryBuilder->expr()->eq('exec_time', 0));
            $flushConstraints->add($expressionBuilder->eq('exec_time', 0));
        } elseif ($filter === 'finished') {
            $queryBuilder->andWhere($queryBuilder->expr()->gt('exec_time', 0));
            $flushConstraints->add($expressionBuilder->gt('exec_time', 0));
        }

        if ($doFlush) {
            // Flushing replaces the selection: delete and report no rows.
            $flushWhere = $flushConstraints->add($expressionBuilder->eq('set_id', $set_id));
            $this->flushQueue($doFullFlush ? '' : $flushWhere);

            return [];
        }

        // A non-positive $itemsPerPage disables the limit.
        if ($itemsPerPage > 0) {
            $queryBuilder->setMaxResults($itemsPerPage);
        }

        return $queryBuilder->execute()->fetchAll();
    }
990
991
    /**
992
     * Adding call back entries to log (called from hooks typically, see indexed search class "class.crawler.php"
993
     *
994
     * @param integer $setId Set ID
995
     * @param array $params Parameters to pass to call back function
996
     * @param string $callBack Call back object reference, eg. 'EXT:indexed_search/class.crawler.php:&tx_indexedsearch_crawler'
997
     * @param integer $page_id Page ID to attach it to
998
     * @param integer $schedule Time at which to activate
999
     */
1000
    public function addQueueEntry_callBack($setId, $params, $callBack, $page_id = 0, $schedule = 0): void
    {
        // Anything but an array is discarded; the call back reference is always stored with the parameters.
        $params = is_array($params) ? $params : [];
        $params['_CALLBACKOBJ'] = $callBack;

        // A schedule of 0 means "now".
        $scheduledTime = (int) $schedule;
        if ($scheduledTime === 0) {
            $scheduledTime = $this->getCurrentTime();
        }

        $queueEntry = [
            'page_id' => (int) $page_id,
            'parameters' => json_encode($params),
            'scheduled' => $scheduledTime,
            'exec_time' => 0,
            'set_id' => (int) $setId,
            'result_data' => '',
        ];

        GeneralUtility::makeInstance(ConnectionPool::class)
            ->getConnectionForTable('tx_crawler_queue')
            ->insert('tx_crawler_queue', $queueEntry);
    }
1020
1021
    /************************************
1022
     *
1023
     * URL setting
1024
     *
1025
     ************************************/
1026
1027
    /**
1028
     * Setting a URL for crawling:
1029
     *
1030 4
     * @param integer $id Page ID
1031
     * @param string $url Complete URL
1032
     * @param array $subCfg Sub configuration array (from TS config)
1033
     * @param integer $tstamp Scheduled-time
1034
     * @param string $configurationHash (optional) configuration hash
1035
     * @param bool $skipInnerDuplicationCheck (optional) skip inner duplication check
1036
     * @return bool
1037
     */
1038 4
    public function addUrl(
        $id,
        $url,
        array $subCfg,
        $tstamp,
        $configurationHash = '',
        $skipInnerDuplicationCheck = false
    ) {
        // Creating parameters:
        $parameters = [
            'url' => $url,
        ];

        // fe user group simulation (the sub configuration key is optional):
        $uGs = implode(',', array_unique(GeneralUtility::intExplode(',', (string) ($subCfg['userGroups'] ?? ''), true)));
        if ($uGs) {
            $parameters['feUserGroupList'] = $uGs;
        }

        // Setting processing instructions (both sub configuration keys are optional):
        $parameters['procInstructions'] = GeneralUtility::trimExplode(',', (string) ($subCfg['procInstrFilter'] ?? ''));
        if (is_array($subCfg['procInstrParams.'] ?? null)) {
            $parameters['procInstrParams'] = $subCfg['procInstrParams.'];
        }

        // Compile value array:
        $parameters_serialized = json_encode($parameters);
        $fieldArray = [
            'page_id' => (int) $id,
            'parameters' => $parameters_serialized,
            'parameters_hash' => GeneralUtility::shortMD5($parameters_serialized),
            'configuration_hash' => $configurationHash,
            'scheduled' => $tstamp,
            'exec_time' => 0,
            'set_id' => (int) $this->setID,
            'result_data' => '',
            'configuration' => $subCfg['key'] ?? null,
        ];

        if ($this->registerQueueEntriesInternallyOnly) {
            // The entries will only be registered and not stored to the database;
            // this does not count as "added".
            $this->queueEntries[] = $fieldArray;

            return false;
        }

        $rows = [];
        if (! $skipInnerDuplicationCheck) {
            // Check if there is already an equal entry
            $rows = $this->getDuplicateRowsIfExist($tstamp, $fieldArray);
        }

        if (! empty($rows)) {
            // Duplicate found: nothing is inserted, listeners are informed about the existing rows.
            $signalPayload = ['rows' => $rows, 'fieldArray' => $fieldArray];
            SignalSlotUtility::emitSignal(
                self::class,
                SignalSlotUtility::SIGNAL_DUPLICATE_URL_IN_QUEUE,
                $signalPayload
            );

            return false;
        }

        $connectionForCrawlerQueue = GeneralUtility::makeInstance(ConnectionPool::class)->getConnectionForTable('tx_crawler_queue');
        $connectionForCrawlerQueue->insert(
            'tx_crawler_queue',
            $fieldArray
        );
        $uid = $connectionForCrawlerQueue->lastInsertId('tx_crawler_queue', 'qid');

        $signalPayload = ['uid' => $uid, 'fieldArray' => $fieldArray];
        SignalSlotUtility::emitSignal(
            self::class,
            SignalSlotUtility::SIGNAL_URL_ADDED_TO_QUEUE,
            $signalPayload
        );

        return true;
    }
1117
1118
    /**
1119
     * Returns the current system time
1120
     *
1121
     * @return int
1122
     */
1123
    public function getCurrentTime()
    {
        // Unix timestamp (seconds) as reported by time().
        $now = time();

        return $now;
    }
1127
1128
    /************************************
1129
     *
1130
     * URL reading
1131
     *
1132
     ************************************/
1133
1134
    /**
1135
     * Read URL for single queue entry
1136
     *
1137
     * @param integer $queueId
1138
     * @param boolean $force If set, will process even if exec_time has been set!
1139
     * @return int|null Bitmask of CLI status flags; null when no matching queue record was found
1140
     */
1141
    public function readUrl($queueId, $force = false)
    {
        $queryBuilder = GeneralUtility::makeInstance(ConnectionPool::class)->getQueryBuilderForTable($this->tableName);
        $ret = 0;
        $this->logger->debug('crawler-readurl start ' . microtime(true));

        // Get entry:
        $queryBuilder
            ->select('*')
            ->from('tx_crawler_queue')
            ->where(
                $queryBuilder->expr()->eq('qid', $queryBuilder->createNamedParameter($queueId, \PDO::PARAM_INT))
            );
        if (! $force) {
            // Unless forced, only entries that are not executed yet but are scheduled for a process qualify.
            $queryBuilder
                ->andWhere('exec_time = 0')
                ->andWhere('process_scheduled > 0');
        }
        $queueRec = $queryBuilder->execute()->fetch();

        if (! is_array($queueRec)) {
            // No matching record: nothing is processed and null (not a status bitmask) is returned.
            return;
        }

        SignalSlotUtility::emitSignal(
            self::class,
            SignalSlotUtility::SIGNAL_QUEUEITEM_PREPROCESS,
            [$queueId, &$queueRec]
        );

        // Set exec_time to lock record:
        $field_array = ['exec_time' => $this->getCurrentTime()];

        if (isset($this->processID)) {
            // If multiprocessing is used we need to store the id of the process which has handled this entry
            $field_array['process_id_completed'] = $this->processID;
        }

        GeneralUtility::makeInstance(ConnectionPool::class)->getConnectionForTable('tx_crawler_queue')
            ->update(
                'tx_crawler_queue',
                $field_array,
                ['qid' => (int) $queueId]
            );

        $result = $this->queueExecutor->executeQueueItem($queueRec, $this);
        if ($result['content'] === null) {
            $resultData = 'An errors happened';
        } else {
            /** @var JsonCompatibilityConverter $jsonCompatibilityConverter */
            $jsonCompatibilityConverter = GeneralUtility::makeInstance(JsonCompatibilityConverter::class);
            $resultData = $jsonCompatibilityConverter->convert($result['content']);
        }

        // At the moment there is no need to point to specific pollable extensions.
        // The pollSuccess registration is optional, and $resultData is only inspected when it is
        // an array: in the error case above it is a plain string, which must not be indexed by key.
        $pollSuccess = $GLOBALS['TYPO3_CONF_VARS']['EXTCONF']['crawler']['pollSuccess'] ?? null;
        if (is_array($pollSuccess) && is_array($resultData)) {
            $procInstructions = $resultData['parameters']['procInstructions'] ?? null;
            if (is_array($procInstructions)) {
                foreach ($pollSuccess as $pollable) {
                    // Only check the success value if the instruction is running;
                    // it is important to name the pollSuccess key same as the procInstructions key
                    if (in_array($pollable, $procInstructions, true) && ! empty($resultData['success'][$pollable])) {
                        $ret |= self::CLI_STATUS_POLLABLE_PROCESSED;
                    }
                }
            }
        }

        // Set result in log which also denotes the end of the processing of this entry.
        $field_array = ['result_data' => json_encode($result)];

        SignalSlotUtility::emitSignal(
            self::class,
            SignalSlotUtility::SIGNAL_QUEUEITEM_POSTPROCESS,
            [$queueId, &$field_array]
        );

        GeneralUtility::makeInstance(ConnectionPool::class)->getConnectionForTable('tx_crawler_queue')
            ->update(
                'tx_crawler_queue',
                $field_array,
                ['qid' => (int) $queueId]
            );

        $this->logger->debug('crawler-readurl stop ' . microtime(true));

        return $ret;
    }
1231
1232
    /**
1233
     * Read URL for not-yet-inserted log-entry
1234
     *
1235
     * @param array $field_array Queue field array,
1236
     *
1237
     * @return array|bool|string The value returned by $this->queueExecutor->executeQueueItem() for the entry
1238
     */
1239
    public function readUrlFromArray($field_array)
    {
        $queueTable = $this->tableName;

        // Insert the entry with exec_time already set, which locks the record.
        $field_array['exec_time'] = $this->getCurrentTime();
        $connection = GeneralUtility::makeInstance(ConnectionPool::class)->getConnectionForTable($queueTable);
        $connection->insert($queueTable, $field_array);

        // The generated queue id is added to the entry before it is executed.
        $queueId = $connection->lastInsertId($queueTable, 'qid');
        $field_array['qid'] = $queueId;

        $result = $this->queueExecutor->executeQueueItem($field_array, $this);

        // Storing the result in the log also denotes the end of the processing of this entry.
        $field_array = ['result_data' => json_encode($result)];

        // Listeners receive the fields by reference before they are written.
        SignalSlotUtility::emitSignal(
            self::class,
            SignalSlotUtility::SIGNAL_QUEUEITEM_POSTPROCESS,
            [$queueId, &$field_array]
        );

        $connection->update($queueTable, $field_array, ['qid' => $queueId]);

        return $result;
    }
1269
1270
    /*****************************
1271
     *
1272
     * Compiling URLs to crawl - tools
1273
     *
1274
     *****************************/
1275
1276
    /**
1277
     * @param integer $id Root page id to start from.
1278
     * @param integer $depth Depth of tree, 0=only id-page, 1= on sublevel, 99 = infinite
1279
     * @param integer $scheduledTime Unix Time when the URL is timed to be visited when put in queue
1280
     * @param integer $reqMinute Number of requests per minute (creates the interleave between requests)
1281
     * @param boolean $submitCrawlUrls If set, submits the URLs to queue in database (real crawling)
1282
     * @param boolean $downloadCrawlUrls If set (and submitcrawlUrls is false) will fill $downloadUrls with entries)
1283
     * @param array $incomingProcInstructions Array of processing instructions
1284
     * @param array $configurationSelection Array of configuration keys
1285
     * @return string
1286
     */
1287
    public function getPageTreeAndUrls(
        $id,
        $depth,
        $scheduledTime,
        $reqMinute,
        $submitCrawlUrls,
        $downloadCrawlUrls,
        array $incomingProcInstructions,
        array $configurationSelection
    ) {
        // Store the run settings on the instance and reset the per-run collections.
        $this->scheduledTime = $scheduledTime;
        $this->reqMinute = $reqMinute;
        $this->submitCrawlUrls = $submitCrawlUrls;
        $this->downloadCrawlUrls = $downloadCrawlUrls;
        $this->incomingProcInstructions = $incomingProcInstructions;
        $this->incomingConfigurationSelection = $configurationSelection;

        $this->duplicateTrack = [];
        $this->downloadUrls = [];

        // Drawing tree:
        /* @var PageTreeView $tree */
        $tree = GeneralUtility::makeInstance(PageTreeView::class);
        $perms_clause = $this->getBackendUser()->getPagePermsClause(Permission::PAGE_SHOW);
        $tree->init('AND ' . $perms_clause);

        $pageInfo = BackendUtility::readPageAccess($id, $perms_clause);
        if (is_array($pageInfo)) {
            // Set root row:
            $tree->tree[] = [
                'row' => $pageInfo,
                'HTML' => $this->iconFactory->getIconForRecord('pages', $pageInfo, Icon::SIZE_SMALL),
            ];
        }

        // Get branch beneath:
        if ($depth) {
            $tree->getTree($id, $depth, '');
        }

        // Traverse page tree:
        $code = '';

        foreach ($tree->tree as $data) {
            $this->MP = false;

            // Recognize mount points. The doktype comes from a database row and may be a numeric
            // string depending on the driver, so it is cast before the strict comparison.
            if ((int) $data['row']['doktype'] === PageRepository::DOKTYPE_MOUNTPOINT) {
                $queryBuilder = GeneralUtility::makeInstance(ConnectionPool::class)->getQueryBuilderForTable('pages');
                $queryBuilder->getRestrictions()->removeAll()->add(GeneralUtility::makeInstance(DeletedRestriction::class));
                $mountpage = $queryBuilder
                    ->select('*')
                    ->from('pages')
                    ->where(
                        $queryBuilder->expr()->eq('uid', $queryBuilder->createNamedParameter($data['row']['uid'], \PDO::PARAM_INT))
                    )
                    ->execute()
                    ->fetchAll();
                $queryBuilder->resetRestrictions();

                // fetch mounted pages
                $this->MP = $mountpage[0]['mount_pid'] . '-' . $data['row']['uid'];

                $mountTree = GeneralUtility::makeInstance(PageTreeView::class);
                $mountTree->init('AND ' . $perms_clause);
                $mountTree->getTree($mountpage[0]['mount_pid'], $depth);

                foreach ($mountTree->tree as $mountData) {
                    $code .= $this->drawURLs_addRowsForPage(
                        $mountData['row'],
                        $mountData['HTML'] . BackendUtility::getRecordTitle('pages', $mountData['row'], true)
                    );
                }

                // replace page when mount_pid_ol is enabled
                if ($mountpage[0]['mount_pid_ol']) {
                    $data['row']['uid'] = $mountpage[0]['mount_pid'];
                } else {
                    // if the mount_pid_ol is not set the MP must not be used for the mountpoint page
                    $this->MP = false;
                }
            }

            $code .= $this->drawURLs_addRowsForPage(
                $data['row'],
                $data['HTML'] . BackendUtility::getRecordTitle('pages', $data['row'], true)
            );
        }

        return $code;
    }
1378
1379
    /**
     * Expands an exclude string into the list of page ids it covers.
     *
     * The string is a comma-separated list of entries of the form "pid" or "pid+depth".
     * An entry without "+" covers only the page itself, an entry with "+" but without a
     * (non-empty) depth is expanded with depth 99. For every entry with a depth greater
     * than zero the uids of the page tree below the pid are added as well.
     * Results are memoised per exclude string in function-static caches.
     *
     * @param string $excludeString Exclude string
     * @return array page ids; the pids taken from the string are kept as given (strings),
     *               uids taken from the page tree are kept as delivered by the tree
     */
    public function expandExcludeString($excludeString)
    {
        // internal static caches
        static $expandedExcludeStringCache = [];
        static $treeCache = [];

        // Normalise the key once; isset() (instead of empty()) also caches empty results
        $cacheKey = (string) $excludeString;
        if (isset($expandedExcludeStringCache[$cacheKey])) {
            return $expandedExcludeStringCache[$cacheKey];
        }

        $pidList = [];

        if (! empty($excludeString)) {
            /** @var PageTreeView $tree */
            $tree = GeneralUtility::makeInstance(PageTreeView::class);
            $tree->init('AND ' . $this->getBackendUser()->getPagePermsClause(Permission::PAGE_SHOW));

            $excludeParts = GeneralUtility::trimExplode(',', $excludeString);

            foreach ($excludeParts as $excludePart) {
                // array_pad() guarantees the depth slot exists for entries without "+"
                [$pid, $depth] = array_pad(GeneralUtility::trimExplode('+', $excludePart), 2, '');

                // default is "page only" = "depth=0"
                if (empty($depth)) {
                    $depth = strpos($excludePart, '+') !== false ? 99 : 0;
                }
                $depth = (int) $depth;

                $pidList[] = $pid;

                if ($depth > 0) {
                    if (empty($treeCache[$pid][$depth])) {
                        $tree->reset();
                        // getTree() is documented to take an integer uid
                        $tree->getTree((int) $pid, $depth);
                        $treeCache[$pid][$depth] = $tree->tree;
                    }

                    foreach ($treeCache[$pid][$depth] as $data) {
                        $pidList[] = $data['row']['uid'];
                    }
                }
            }
        }

        $expandedExcludeStringCache[$cacheKey] = array_unique($pidList);

        return $expandedExcludeStringCache[$cacheKey];
    }
1430
1431
    /**
     * Create the rows for display of the page tree
     * For each page a number of rows are shown displaying GET variable configuration
     *
     * @param array $pageRow page record; its "uid" is checked against each configuration's exclude list
     * @param string $pageTitle markup for the title cell; it is inserted into the output without escaping
     * @return string HTML table rows for the page
     */
    public function drawURLs_addRowsForPage(array $pageRow, string $pageTitle): string
    {
        $skipMessage = '';

        // Get list of configurations
        $configurations = $this->getUrlsForPageRow($pageRow, $skipMessage);

        if (! empty($this->incomingConfigurationSelection)) {
            // remove configuration that does not match the current selection
            foreach ($configurations as $confKey => $confArray) {
                // numeric configuration names become integer array keys, so compare as string
                if (! in_array((string) $confKey, $this->incomingConfigurationSelection, true)) {
                    unset($configurations[$confKey]);
                }
            }
        }

        // Traverse parameter combinations:
        $c = 0;
        $content = '';
        if (! empty($configurations)) {
            foreach ($configurations as $confKey => $confArray) {
                $confKeyHtml = htmlspecialchars((string) $confKey);

                // Title column: only the first row carries the (row-spanning) title cell
                $titleClm = $c === 0
                    ? '<td rowspan="' . count($configurations) . '">' . $pageTitle . '</td>'
                    : '';

                // The exclude list may mix string and integer ids, so normalise both sides
                // before the strict comparison; array_filter() drops empty/zero entries.
                $excludedPageIds = array_filter(
                    array_map('intval', $this->expandExcludeString($confArray['subCfg']['exclude'] ?? ''))
                );

                if (! in_array((int) $pageRow['uid'], $excludedPageIds, true)) {

                    // URL list:
                    $urlList = $this->urlListFromUrlArray(
                        $confArray,
                        $pageRow,
                        $this->scheduledTime,
                        $this->reqMinute,
                        $this->submitCrawlUrls,
                        $this->downloadCrawlUrls,
                        $this->duplicateTrack,
                        $this->downloadUrls,
                        $this->incomingProcInstructions // if empty the urls won't be filtered by processing instructions
                    );

                    // Expanded parameters:
                    $paramExpanded = '';
                    $calcAccu = [];
                    $calcRes = 1;
                    foreach ($confArray['paramExpanded'] ?? [] as $gVar => $gVal) {
                        $paramExpanded .= '
                            <tr>
                                <td>' . htmlspecialchars('&' . $gVar . '=') . '<br/>' .
                            '(' . count($gVal) . ')' .
                            '</td>
                                <td nowrap="nowrap">' . nl2br(htmlspecialchars(implode(chr(10), $gVal))) . '</td>
                            </tr>
                        ';
                        $calcRes *= count($gVal);
                        $calcAccu[] = count($gVal);
                    }
                    $paramExpanded = '<table>' . $paramExpanded . '</table>';
                    $paramExpanded .= 'Comb: ' . implode('*', $calcAccu) . '=' . $calcRes;

                    // Options (both keys are optional in the sub configuration)
                    $optionValues = '';
                    $userGroups = $confArray['subCfg']['userGroups'] ?? '';
                    if ($userGroups) {
                        $optionValues .= 'User Groups: ' . htmlspecialchars((string) $userGroups) . '<br/>';
                    }
                    $procInstrFilter = $confArray['subCfg']['procInstrFilter'] ?? '';
                    if ($procInstrFilter) {
                        $optionValues .= 'ProcInstr: ' . htmlspecialchars((string) $procInstrFilter) . '<br/>';
                    }

                    // Compile row:
                    $content .= '
                        <tr>
                            ' . $titleClm . '
                            <td>' . $confKeyHtml . '</td>
                            <td>' . nl2br(htmlspecialchars(rawurldecode(trim(str_replace('&', chr(10) . '&', GeneralUtility::implodeArrayForUrl('', $confArray['paramParsed'])))))) . '</td>
                            <td>' . $paramExpanded . '</td>
                            <td nowrap="nowrap">' . $urlList . '</td>
                            <td nowrap="nowrap">' . $optionValues . '</td>
                            <td nowrap="nowrap">' . DebugUtility::viewArray($confArray['subCfg']['procInstrParams.'] ?? null) . '</td>
                        </tr>';
                } else {
                    $content .= '<tr>
                            ' . $titleClm . '
                            <td>' . $confKeyHtml . '</td>
                            <td colspan="5"><em>No entries</em> (Page is excluded in this configuration)</td>
                        </tr>';
                }

                $c++;
            }
        } else {
            $message = ! empty($skipMessage) ? ' (' . $skipMessage . ')' : '';

            // Compile row:
            $content .= '
                <tr>
                    <td>' . $pageTitle . '</td>
                    <td colspan="6"><em>No entries</em>' . $message . '</td>
                </tr>';
        }

        return $content;
    }
1541
1542
    /*****************************
1543
     *
1544
     * CLI functions
1545
     *
1546
     *****************************/
1547
1548
    /**
     * Running the functionality of the CLI (crawling URLs from queue)
     *
     * Runs the CLI hooks, cleans up the queue, assigns up to $countInARun queue entries
     * to the current process id and reads their URLs one after another.
     *
     * @param int $countInARun maximum number of queue entries to fetch for this run
     * @param int $sleepTime value handed to usleep() after every processed entry
     * @param int $sleepAfterFinish value handed to sleep() once the batch is done
     * @return int bitmask combining the readUrl() results with the CLI_STATUS_* flags
     */
    public function CLI_run(int $countInARun, int $sleepTime, int $sleepAfterFinish): int
    {
        $status = 0;
        $processedEntries = 0;

        // Hooks first, then queue housekeeping, then the selection of work
        $this->CLI_runHooks();
        $this->queueRepository->cleanupQueue();
        $queueEntries = $this->queueRepository->fetchRecordsToBeCrawled($countInARun);

        if (empty($queueEntries)) {
            $this->CLI_debug('Nothing within queue which needs to be processed (' . $this->CLI_buildProcessId() . ')');

            return $status;
        }

        $queueIds = [];
        foreach ($queueEntries as $queueEntry) {
            $queueIds[] = $queueEntry['qid'];
        }

        $processId = $this->CLI_buildProcessId();

        // Store the number of assigned queue entries so it can be determined later how many were processed
        $assignedCount = $this->queueRepository->updateProcessIdAndSchedulerForQueueIds($queueIds, $processId);
        $this->processRepository->updateProcessAssignItemsCount($assignedCount, $processId);

        // Fewer rows claimed than selected: another process got (some of) them
        if ($assignedCount !== count($queueIds)) {
            $this->CLI_debug('Nothing processed due to multi-process collision (' . $this->CLI_buildProcessId() . ')');

            return $status | self::CLI_STATUS_ABORTED;
        }

        foreach ($queueEntries as $queueEntry) {
            $status |= $this->readUrl($queueEntry['qid']);

            ++$processedEntries;
            usleep($sleepTime); // Just to relax the system

            // If the CLI got disabled while this batch was running, leave right away;
            // the process is deliberately NOT marked as ended.
            if ($this->getDisabled()) {
                return $status | self::CLI_STATUS_ABORTED;
            }

            if (! $this->processRepository->isProcessActive($this->CLI_buildProcessId())) {
                $this->CLI_debug('conflict / timeout (' . $this->CLI_buildProcessId() . ')');
                $status |= self::CLI_STATUS_ABORTED;
                break; // possible timeout
            }
        }

        sleep($sleepAfterFinish);

        $this->CLI_debug('Rows: ' . $processedEntries . ' (' . $this->CLI_buildProcessId() . ')');

        if ($processedEntries > 0) {
            $status |= self::CLI_STATUS_PROCESSED;
        }

        return $status;
    }
1616
1617
    /**
     * Activate hooks
     *
     * Instantiates every class registered under
     * $GLOBALS['TYPO3_CONF_VARS']['EXTCONF']['crawler']['cli_hooks'] and calls its
     * crawler_init() method with this controller.
     */
    public function CLI_runHooks(): void
    {
        $registeredHooks = $GLOBALS['TYPO3_CONF_VARS']['EXTCONF']['crawler']['cli_hooks'] ?? [];

        foreach ($registeredHooks as $hookReference) {
            $hook = GeneralUtility::makeInstance($hookReference);
            if (! is_object($hook)) {
                continue;
            }

            $hook->crawler_init($this);
        }
    }
1629
1630
    /**
     * Try to acquire a new process with the given id
     * also performs some auto-cleanup for orphan processes
     *
     * Active, non-deleted processes whose ttl lies before the current time are treated as
     * orphans and released at the end; all others count against the "processLimit" setting.
     * A process record is only inserted while that limit is not reached.
     *
     * @param string $id identification string for the process
     * @return boolean true if a process record was inserted, false if the system process id
     *                 could not be determined or the process limit is reached
     * @todo preemption might not be the most elegant way to clean up
     */
    public function CLI_checkAndAcquireNewProcess($id)
    {
        $queryBuilder = GeneralUtility::makeInstance(ConnectionPool::class)->getQueryBuilderForTable($this->tableName);

        $systemProcessId = getmypid();
        if ($systemProcessId < 1) {
            return false;
        }

        $activeProcesses = $queryBuilder
            ->select('process_id', 'ttl')
            ->from('tx_crawler_process')
            ->where(
                'active = 1 AND deleted = 0'
            )
            ->execute();

        $currentTime = $this->getCurrentTime();

        // Split the active processes into expired ones (orphans) and those still running
        $runningCount = 0;
        $expiredProcessIds = [];
        while ($process = $activeProcesses->fetch()) {
            if ($process['ttl'] < $currentTime) {
                $expiredProcessIds[] = $process['process_id'];
                continue;
            }

            ++$runningCount;
        }

        $processLimit = (int) $this->extensionSettings['processLimit'];
        $acquired = $runningCount < $processLimit;

        if ($acquired) {
            // there are fewer active processes than allowed, so register a new one
            $this->CLI_debug('add process ' . $this->CLI_buildProcessId() . ' (' . ($runningCount + 1) . '/' . $processLimit . ')');

            $processRecord = [
                'process_id' => $id,
                'active' => 1,
                'ttl' => $currentTime + (int) $this->extensionSettings['processMaxRunTime'],
                'system_process_id' => $systemProcessId,
            ];
            GeneralUtility::makeInstance(ConnectionPool::class)
                ->getConnectionForTable('tx_crawler_process')
                ->insert('tx_crawler_process', $processRecord);
        } else {
            $this->CLI_debug('Processlimit reached (' . $runningCount . '/' . $processLimit . ')');
        }

        // Housekeeping happens regardless of whether the process was acquired
        $this->processRepository->deleteProcessesMarkedAsDeleted();
        $this->CLI_releaseProcesses($expiredProcessIds);

        return $acquired;
    }
1691
1692
    /**
     * Release a process and the required resources
     *
     * Steps, in this order:
     *  1. queue entries assigned to a process that is no longer active get their
     *     process_scheduled / process_id reset,
     *  2. inactive processes that are still referenced by queue entries with exec_time = 0
     *     get their system_process_id reset,
     *  3. the given processes are marked as not active and their queue entries are
     *     unassigned through the repositories.
     *
     * Steps 1 and 2 only touch processes that already were inactive before this call, so a
     * process released here is cleaned up by a later call ("2nd chance"); this way a single
     * process can't mess up the entire process table.
     *
     * @param mixed $releaseIds string with a single process-id or array with multiple process-ids
     * @return boolean false if there was nothing to release, true otherwise
     */
    public function CLI_releaseProcesses($releaseIds)
    {
        if (! is_array($releaseIds)) {
            $releaseIds = [$releaseIds];
        }

        if (empty($releaseIds)) {
            return false;   //nothing to release
        }

        // Each statement gets its own QueryBuilder so that no state of the first UPDATE
        // (table, bound values) can leak into the second one.
        $this->getQueryBuilder($this->tableName)
            ->update($this->tableName, 'q')
            ->where(
                'q.process_id IN(SELECT p.process_id FROM tx_crawler_process as p WHERE p.active = 0)'
            )
            ->set('q.process_scheduled', 0)
            ->set('q.process_id', '')
            ->execute();

        // Same connection as before (looked up via the queue table), fresh builder
        $processQueryBuilder = $this->getQueryBuilder($this->tableName);
        $processQueryBuilder
            ->update('tx_crawler_process')
            ->where(
                $processQueryBuilder->expr()->eq('active', 0),
                'process_id IN(SELECT q.process_id FROM tx_crawler_queue as q WHERE q.exec_time = 0)'
            )
            ->set('system_process_id', 0)
            ->execute();

        $this->processRepository->markRequestedProcessesAsNotActive($releaseIds);
        $this->queueRepository->unsetProcessScheduledAndProcessIdForQueueEntries($releaseIds);

        return true;
    }
1741 1
1742
    /**
     * Create a unique Id for the current process
     *
     * The id is derived from microtime() on the first call and stored in $this->processID;
     * later calls return the stored value.
     *
     * @return string  the ID
     */
    public function CLI_buildProcessId()
    {
        if ($this->processID) {
            return $this->processID;
        }

        $this->processID = GeneralUtility::shortMD5(microtime(true));

        return $this->processID;
    }
1754
1755
    /**
     * Prints a message to the stdout (only if debug-mode is enabled)
     *
     * Debug mode is the "processDebug" extension setting; the message is echoed with a
     * trailing newline and the output buffer is flushed.
     *
     * @param string $msg the message
     */
    public function CLI_debug($msg): void
    {
        $debugEnabled = (int) $this->extensionSettings['processDebug'] !== 0;
        if (! $debugEnabled) {
            return;
        }

        echo $msg . "\n";
        flush();
    }
1767 1
1768
    /**
     * Cleans up entries that stayed for too long in the queue. These are:
     * - processed entries older than the "cleanUpProcessedAge" setting (in days)
     * - scheduled entries older than the "cleanUpScheduledAge" setting (in days)
     *
     * @deprecated
     */
    public function cleanUpOldQueueEntries(): void
    {
        $now = time();

        // 86400 = 24*60*60 seconds per day
        $processedBefore = $now - $this->extensionSettings['cleanUpProcessedAge'] * 86400;
        $scheduledBefore = $now - $this->extensionSettings['cleanUpScheduledAge'] * 86400;

        $this->flushQueue(
            '(exec_time<>0 AND exec_time<' . $processedBefore . ') OR scheduled<=' . $scheduledBefore
        );
    }
1784 5
1785
    /**
     * Removes queue entries
     *
     * Before the rows are deleted, a SIGNAL_QUEUE_ENTRY_FLUSH signal is emitted once per
     * distinct set_id with the matching rows (qid, set_id) as "subSet" payload.
     *
     * @param string $where SQL related filter for the entries which should be removed;
     *                      it is used verbatim as WHERE clause, so it must not contain
     *                      untrusted input. An empty value removes all entries.
     *
     * @deprecated
     */
    protected function flushQueue($where = ''): void
    {
        $realWhere = strlen((string) $where) > 0 ? $where : '1=1';

        // Every statement uses a fresh QueryBuilder: the builder accumulates state
        // (e.g. from() appends), so reusing one instance across statements is fragile.
        $groups = $this->getQueryBuilder($this->tableName)
            ->selectLiteral('DISTINCT set_id')
            ->from($this->tableName)
            ->where($realWhere)
            ->execute()
            ->fetchAll();

        foreach ($groups as $group) {
            $subSetQueryBuilder = $this->getQueryBuilder($this->tableName);
            $subSet = $subSetQueryBuilder
                ->select('qid', 'set_id')
                ->from($this->tableName)
                ->where(
                    $realWhere,
                    // bind the value instead of concatenating it into the SQL
                    $subSetQueryBuilder->expr()->eq(
                        'set_id',
                        $subSetQueryBuilder->createNamedParameter((int) $group['set_id'], \PDO::PARAM_INT)
                    )
                )
                ->execute()
                ->fetchAll();

            $payLoad = ['subSet' => $subSet];
            SignalSlotUtility::emitSignal(
                self::class,
                SignalSlotUtility::SIGNAL_QUEUE_ENTRY_FLUSH,
                $payLoad
            );
        }

        $this->getQueryBuilder($this->tableName)
            ->delete($this->tableName)
            ->where($realWhere)
            ->execute();
    }
1830
1831
    /**
     * This method determines duplicates for a queue entry with the same parameters and this timestamp.
     * If the timestamp is in the past, it will check if there is any unprocessed queue entry in the past.
     * If the timestamp is in the future it will check, if the queued entry has exactly the same timestamp
     *
     * Only entries matching "NOT exec_time" and "NOT process_id" with the same page_id
     * and parameters_hash as $fieldArray are considered.
     *
     * @param int $tstamp
     * @param array $fieldArray must provide "page_id" and "parameters_hash"
     *
     * @return array qids of the duplicate queue entries
     */
    protected function getDuplicateRowsIfExist($tstamp, $fieldArray)
    {
        $currentTime = $this->getCurrentTime();

        $queryBuilder = GeneralUtility::makeInstance(ConnectionPool::class)->getQueryBuilderForTable($this->tableName);
        $queryBuilder
            ->select('qid')
            ->from('tx_crawler_queue');

        $isScheduledForNow = $tstamp <= $currentTime;

        if ($isScheduledForNow && $this->extensionSettings['enableTimeslot']) {
            // scheduled "now" with timeslot: +/- 100 seconds around now, or anything already due
            $timeBegin = $currentTime - 100;
            $timeEnd = $currentTime + 100;
            $queryBuilder
                ->where('scheduled BETWEEN ' . $timeBegin . ' AND ' . $timeEnd)
                ->orWhere($queryBuilder->expr()->lte('scheduled', $currentTime));
        } elseif ($isScheduledForNow) {
            // scheduled "now" without timeslot: anything already due
            $queryBuilder->where($queryBuilder->expr()->lte('scheduled', $currentTime));
        } elseif ($tstamp > $currentTime) {
            // entries with a timestamp in the future need to have the same schedule time
            $queryBuilder->where($queryBuilder->expr()->eq('scheduled', $tstamp));
        }

        $pageIdConstraint = $queryBuilder->expr()->eq(
            'page_id',
            $queryBuilder->createNamedParameter($fieldArray['page_id'], \PDO::PARAM_INT)
        );
        $parametersHashConstraint = $queryBuilder->expr()->eq(
            'parameters_hash',
            $queryBuilder->createNamedParameter($fieldArray['parameters_hash'], \PDO::PARAM_STR)
        );

        $queryBuilder
            ->andWhere('NOT exec_time')
            ->andWhere('NOT process_id')
            ->andWhere($pageIdConstraint)
            ->andWhere($parametersHashConstraint);

        $statement = $queryBuilder->execute();

        $duplicateQueueIds = [];
        while ($duplicate = $statement->fetch()) {
            $duplicateQueueIds[] = $duplicate['qid'];
        }

        return $duplicateQueueIds;
    }
1891 6
1892 6
    /**
     * Returns a md5 hash generated from a serialized configuration array.
     *
     * The "paramExpanded" and "URLs" entries are removed before hashing, so they do not
     * influence the result.
     *
     * @return string
     */
    protected function getConfigurationHash(array $configuration)
    {
        unset($configuration['paramExpanded'], $configuration['URLs']);

        $serializedConfiguration = serialize($configuration);

        return md5($serializedConfiguration);
    }
1903
1904 8
    /**
     * Build a URL from a Page and the Query String. If the page has a Site configuration, it can be built by using
     * the Site instance.
     *
     * With a Site, the "id" query parameter is dropped, "L" is handed to the router as
     * "_language", and host, scheme, port and user info are taken over from
     * $alternativeBaseUrl when one is given. Without a Site, a plain index.php URL with an
     * appended cHash is built.
     *
     * @param int $httpsOrHttp see tx_crawler_configuration.force_ssl; -1 forces "http", 1 forces "https",
     *                         any other value leaves the scheme untouched
     * @throws \TYPO3\CMS\Core\Exception\SiteNotFoundException
     * @throws \TYPO3\CMS\Core\Routing\InvalidRouteArgumentsException
     */
    protected function getUrlFromPageAndQueryParameters(int $pageId, string $queryString, ?string $alternativeBaseUrl, int $httpsOrHttp): UriInterface
    {
        $site = GeneralUtility::makeInstance(SiteMatcher::class)->matchByPageId($pageId);

        if (! $site instanceof Site) {
            // Technically this is not possible with site handling, but kept for backwards-compatibility reasons
            // Once EXT:crawler is v10-only compatible, this should be removed completely
            $baseUrl = ($alternativeBaseUrl ?: GeneralUtility::getIndpEnv('TYPO3_SITE_URL'));
            $cacheHashCalculator = GeneralUtility::makeInstance(CacheHashCalculator::class);
            $queryString .= '&cHash=' . $cacheHashCalculator->generateForParameters($queryString);
            $uri = new Uri(rtrim($baseUrl, '/') . '/index.php' . $queryString);
        } else {
            $queryParts = [];
            parse_str(ltrim($queryString, '?&'), $queryParts);
            unset($queryParts['id']);

            // workaround as long as we don't have native language support in crawler configurations
            // NOTE(review): $siteLanguage is never read afterwards. The lookups are kept as they
            // were; getLanguageById() may reject unknown language ids - confirm before removing.
            if (isset($queryParts['L'])) {
                $queryParts['_language'] = $queryParts['L'];
                unset($queryParts['L']);
                $siteLanguage = $site->getLanguageById((int) $queryParts['_language']);
            } else {
                $siteLanguage = $site->getDefaultLanguage();
            }

            $uri = $site->getRouter()->generateUri($pageId, $queryParts);

            if (! empty($alternativeBaseUrl)) {
                $baseUri = new Uri($alternativeBaseUrl);
                $uri = $uri
                    ->withHost($baseUri->getHost())
                    ->withScheme($baseUri->getScheme())
                    ->withPort($baseUri->getPort());

                $userInfo = $baseUri->getUserInfo();
                if ($userInfo) {
                    $uri = $uri->withUserInfo($userInfo);
                }
            }
        }

        // Optionally force the scheme
        $forcedSchemes = [
            -1 => 'http',
            1 => 'https',
        ];
        if (isset($forcedSchemes[$httpsOrHttp])) {
            $uri = $uri->withScheme($forcedSchemes[$httpsOrHttp]);
        }

        return $uri;
    }
1956
1957
    /**
     * Makes sure that $reg[1] is not larger than $reg[2] by exchanging the two
     * values when necessary; all other entries are left untouched.
     *
     * @param array $reg array with the values to compare at index 1 and 2
     * @return array the array with index 1 and 2 in ascending order
     */
    protected function swapIfFirstIsLargerThanSecond(array $reg): array
    {
        if ($reg[1] > $reg[2]) {
            [$reg[1], $reg[2]] = [$reg[2], $reg[1]];
        }

        return $reg;
    }
1968 1
1969 1
    /**
     * Returns the backend user, taking it from $GLOBALS['BE_USER'] on first use
     * and keeping it in $this->backendUser afterwards.
     *
     * @return BackendUserAuthentication
     */
    private function getBackendUser()
    {
        // Make sure the _cli_ user is loaded
        Bootstrap::initializeBackendAuthentication();

        $this->backendUser = $this->backendUser ?? $GLOBALS['BE_USER'];

        return $this->backendUser;
    }
1981 12
1982
    /**
     * Get querybuilder for given table
     *
     * A new builder is requested from the ConnectionPool on every call.
     *
     * @return \TYPO3\CMS\Core\Database\Query\QueryBuilder
     */
    private function getQueryBuilder(string $table)
    {
        $connectionPool = GeneralUtility::makeInstance(ConnectionPool::class);

        return $connectionPool->getQueryBuilderForTable($table);
    }
1991
}
1992