Passed
Push — Cleanup/misc ( 4cc982 )
by Tomas Norre
06:26
created

CrawlerController::setExtensionSettings()   A

Complexity

Conditions 1
Paths 1

Size

Total Lines 3
Code Lines 1

Duplication

Lines 0
Ratio 0 %

Code Coverage

Tests 2
CRAP Score 1

Importance

Changes 0
Metric Value
cc 1
eloc 1
c 0
b 0
f 0
nc 1
nop 1
dl 0
loc 3
ccs 2
cts 2
cp 1
crap 1
rs 10
1
<?php
2
3
declare(strict_types=1);
4
5
namespace AOE\Crawler\Controller;
6
7
/***************************************************************
8
 *  Copyright notice
9
 *
10
 *  (c) 2020 AOE GmbH <[email protected]>
11
 *
12
 *  All rights reserved
13
 *
14
 *  This script is part of the TYPO3 project. The TYPO3 project is
15
 *  free software; you can redistribute it and/or modify
16
 *  it under the terms of the GNU General Public License as published by
17
 *  the Free Software Foundation; either version 3 of the License, or
18
 *  (at your option) any later version.
19
 *
20
 *  The GNU General Public License can be found at
21
 *  http://www.gnu.org/copyleft/gpl.html.
22
 *
23
 *  This script is distributed in the hope that it will be useful,
24
 *  but WITHOUT ANY WARRANTY; without even the implied warranty of
25
 *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
26
 *  GNU General Public License for more details.
27
 *
28
 *  This copyright notice MUST APPEAR in all copies of the script!
29
 ***************************************************************/
30
31
use AOE\Crawler\Configuration\ExtensionConfigurationProvider;
32
use AOE\Crawler\Converter\JsonCompatibilityConverter;
33
use AOE\Crawler\Domain\Repository\ConfigurationRepository;
34
use AOE\Crawler\Domain\Repository\ProcessRepository;
35
use AOE\Crawler\Domain\Repository\QueueRepository;
36
use AOE\Crawler\QueueExecutor;
37
use AOE\Crawler\Utility\HookUtility;
38
use AOE\Crawler\Utility\SignalSlotUtility;
39
use Psr\Http\Message\UriInterface;
40
use Psr\Log\LoggerAwareInterface;
41
use Psr\Log\LoggerAwareTrait;
42
use TYPO3\CMS\Backend\Tree\View\PageTreeView;
43
use TYPO3\CMS\Backend\Utility\BackendUtility;
44
use TYPO3\CMS\Core\Authentication\BackendUserAuthentication;
45
use TYPO3\CMS\Core\Compatibility\PublicMethodDeprecationTrait;
46
use TYPO3\CMS\Core\Core\Bootstrap;
47
use TYPO3\CMS\Core\Core\Environment;
48
use TYPO3\CMS\Core\Database\Connection;
49
use TYPO3\CMS\Core\Database\ConnectionPool;
50
use TYPO3\CMS\Core\Database\Query\Restriction\DeletedRestriction;
51
use TYPO3\CMS\Core\Http\Uri;
52
use TYPO3\CMS\Core\Imaging\Icon;
53
use TYPO3\CMS\Core\Imaging\IconFactory;
54
use TYPO3\CMS\Core\Routing\SiteMatcher;
55
use TYPO3\CMS\Core\Site\Entity\Site;
56
use TYPO3\CMS\Core\Type\Bitmask\Permission;
57
use TYPO3\CMS\Core\TypoScript\Parser\TypoScriptParser;
58
use TYPO3\CMS\Core\Utility\DebugUtility;
59
use TYPO3\CMS\Core\Utility\GeneralUtility;
60
use TYPO3\CMS\Core\Utility\MathUtility;
61
use TYPO3\CMS\Extbase\Object\ObjectManager;
62
use TYPO3\CMS\Frontend\Page\CacheHashCalculator;
63
use TYPO3\CMS\Frontend\Page\PageRepository;
64
65
/**
66
 * Class CrawlerController
67
 *
68
 * @package AOE\Crawler\Controller
69
 */
70
class CrawlerController implements LoggerAwareInterface
71
{
72
    use LoggerAwareTrait;
73
    use PublicMethodDeprecationTrait;
74
75
    public const CLI_STATUS_NOTHING_PROCCESSED = 0;
76
77
    public const CLI_STATUS_REMAIN = 1; //queue not empty
78
79
    public const CLI_STATUS_PROCESSED = 2; //(some) queue items where processed
80
81
    public const CLI_STATUS_ABORTED = 4; //instance didn't finish
82
83
    public const CLI_STATUS_POLLABLE_PROCESSED = 8;
84
85
    /**
86
     * @var integer
87
     */
88
    public $setID = 0;
89
90
    /**
91
     * @var string
92
     */
93
    public $processID = '';
94
95
    /**
96
     * @var array
97
     */
98
    public $duplicateTrack = [];
99
100
    /**
101
     * @var array
102
     */
103
    public $downloadUrls = [];
104
105
    /**
106
     * @var array
107
     */
108
    public $incomingProcInstructions = [];
109
110
    /**
111
     * @var array
112
     */
113
    public $incomingConfigurationSelection = [];
114
115
    /**
116
     * @var bool
117
     */
118
    public $registerQueueEntriesInternallyOnly = false;
119
120
    /**
121
     * @var array
122
     */
123
    public $queueEntries = [];
124
125
    /**
126
     * @var array
127
     */
128
    public $urlList = [];
129
130
    /**
131
     * @var array
132
     */
133
    public $extensionSettings = [];
134
135
    /**
136
     * Mount Point
137
     *
138
     * @var bool
139
     * Todo: Check what this is used for and adjust the type hint or code, as bool doesn't match the current code.
140
     */
141
    public $MP = false;
142
143
    /**
144
     * @var string
145
     */
146
    protected $processFilename;
147
148
    /**
149
     * Holds the internal access mode can be 'gui','cli' or 'cli_im'
150
     *
151
     * @var string
152
     */
153
    protected $accessMode;
154
155
    /**
156
     * @var QueueRepository
157
     */
158
    protected $queueRepository;
159
160
    /**
161
     * @var ProcessRepository
162
     */
163
    protected $processRepository;
164
165
    /**
166
     * @var ConfigurationRepository
167
     */
168
    protected $configurationRepository;
169
170
    /**
171
     * @var string
172
     */
173
    protected $tableName = 'tx_crawler_queue';
174
175
    /**
176
     * @var QueueExecutor
177
     */
178
    protected $queueExecutor;
179
180
    /**
181
     * @var int
182
     */
183
    protected $maximumUrlsToCompile = 10000;
184
185
    /**
186
     * @var IconFactory
187
     */
188
    protected $iconFactory;
189
190
    /**
191
     * @var string[]
192
     */
193
    private $deprecatedPublicMethods = [
0 ignored issues
show
introduced by
The private property $deprecatedPublicMethods is not used, and could be removed.
Loading history...
194
        'cleanUpOldQueueEntries' => 'Using CrawlerController::cleanUpOldQueueEntries() is deprecated since 9.0.1 and will be removed in v11.x, please use QueueRepository->cleanUpOldQueueEntries() instead.',
195
        'CLI_runHooks' => 'Using CrawlerController::CLI_runHooks() is deprecated since 9.0.1 and will be removed in v11.x',
196
        'flushQueue' => 'Using CrawlerController::flushQueue() is deprecated since 9.0.1 and will be removed in v11.x, please use QueueRepository->flushQueue() instead.',
197
        'getLogEntriesForSetId' => 'Using crawlerController::getLogEntriesForSetId() is deprecated since 9.0.1 and will be removed in v11.x',
198
    ];
199
200
    /**
201
     * @var BackendUserAuthentication|null
202
     */
203
    private $backendUser;
204
205
    /**
206
     * @var integer
207
     */
208
    private $scheduledTime = 0;
209
210
    /**
211
     * @var integer
212
     */
213
    private $reqMinute = 0;
214
215
    /**
216
     * @var bool
217
     */
218
    private $submitCrawlUrls = false;
219
220
    /**
221
     * @var bool
222
     */
223
    private $downloadCrawlUrls = false;
224
225
    /************************************
226
     *
227
     * Getting URLs based on Page TSconfig
228
     *
229
     ************************************/
230
231 43
    public function __construct()
    {
        // Collaborators are resolved through the Extbase ObjectManager / makeInstance.
        $objectManager = GeneralUtility::makeInstance(ObjectManager::class);
        $this->queueRepository = $objectManager->get(QueueRepository::class);
        $this->processRepository = $objectManager->get(ProcessRepository::class);
        $this->configurationRepository = $objectManager->get(ConfigurationRepository::class);
        $this->queueExecutor = $objectManager->get(QueueExecutor::class);
        $this->iconFactory = GeneralUtility::makeInstance(IconFactory::class);

        // Marker file whose existence is checked by getDisabled().
        $this->processFilename = Environment::getVarPath() . '/lock/tx_crawler.proc';

        /** @var ExtensionConfigurationProvider $configurationProvider */
        $configurationProvider = GeneralUtility::makeInstance(ExtensionConfigurationProvider::class);
        $settings = $configurationProvider->getExtensionConfiguration();
        $this->extensionSettings = is_array($settings) ? $settings : [];

        // set defaults:
        // The keys are read with "??" so that a configuration which lacks them
        // does not trigger "undefined index/array key" diagnostics.
        if (MathUtility::convertToPositiveInteger($this->extensionSettings['countInARun'] ?? 0) === 0) {
            $this->extensionSettings['countInARun'] = 100;
        }

        $this->extensionSettings['processLimit'] = MathUtility::forceIntegerInRange(
            $this->extensionSettings['processLimit'] ?? 0,
            1,
            99,
            1
        );
        $this->maximumUrlsToCompile = MathUtility::forceIntegerInRange(
            $this->extensionSettings['maxCompileUrls'] ?? 0,
            1,
            1000000000,
            10000
        );
    }
255
256
    /**
257
     * Method to get the accessMode; it can be gui, cli or cli_im
258
     *
259
     * @return string
260
     */
261 1
    public function getAccessMode()
    {
        // Plain accessor: hands back whatever setAccessMode() stored last.
        $currentMode = $this->accessMode;

        return $currentMode;
    }
265
266
    /**
267
     * @param string $accessMode
268
     */
269 1
    public function setAccessMode($accessMode): void
    {
        // The value is stored verbatim; it is not validated here.
        $this->accessMode = $accessMode;
    }
273
274
    /**
275
     * Set disabled status to prevent processes from being processed
276
     *
277
     * @param bool $disabled (optional, defaults to true)
278
     */
279 2
    public function setDisabled($disabled = true): void
    {
        $markerFile = $this->processFilename;

        if (! $disabled) {
            // Re-enable: remove the marker file, but only when it is really there.
            if (is_file($markerFile)) {
                unlink($markerFile);
            }

            return;
        }

        // Disable: an empty marker file is written; getDisabled() checks for its existence.
        GeneralUtility::writeFile($markerFile, '');
    }
289
290
    /**
291
     * Get disable status
292
     *
293
     * @return bool true if disabled
294
     */
295 2
    public function getDisabled()
    {
        // Disabled means: the marker file named by $processFilename exists.
        $markerExists = is_file($this->processFilename);

        return $markerExists;
    }
299
300
    /**
301
     * @param string $filenameWithPath
302
     */
303 3
    public function setProcessFilename($filenameWithPath): void
    {
        // Overrides the marker file location that the constructor derived from the var path.
        $this->processFilename = $filenameWithPath;
    }
307
308
    /**
309
     * @return string
310
     */
311 1
    public function getProcessFilename()
    {
        // Path of the marker file used by setDisabled() / getDisabled().
        $markerFile = $this->processFilename;

        return $markerFile;
    }
315
316
    /**
317
     * Sets the extensions settings (unserialized pendant of $TYPO3_CONF_VARS['EXT']['extConf']['crawler']).
318
     */
319 12
    public function setExtensionSettings(array $extensionSettings): void
    {
        // Replaces the complete settings array; the defaults applied in the
        // constructor are not re-applied here.
        $this->extensionSettings = $extensionSettings;
    }
323
324
    /**
325
     * Check if the given page should be crawled
326
     *
327
     * @return false|string false if the page should be crawled (not excluded), true / skipMessage if it should be skipped
328
     */
329 10
    public function checkIfPageShouldBeSkipped(array $pageRow)
    {
        // Every check returns as soon as a reason to skip is found; the order of
        // the checks is: hidden page, fixed doktypes, configured doktypes, veto hooks.

        // if page is hidden
        // empty() is used so that a settings array / page row without these keys
        // is treated like a falsy value instead of raising a diagnostic.
        if (empty($this->extensionSettings['crawlHiddenPages']) && ! empty($pageRow['hidden'])) {
            return 'Because page is hidden';
        }

        $doktype = $pageRow['doktype'] ?? null;

        if (GeneralUtility::inList('3,4', $doktype) || $doktype >= 199) {
            return 'Because doktype is not allowed';
        }

        foreach ($GLOBALS['TYPO3_CONF_VARS']['EXTCONF']['crawler']['excludeDoktype'] ?? [] as $key => $doktypeList) {
            if (GeneralUtility::inList($doktypeList, $doktype)) {
                return 'Doktype was excluded by "' . $key . '"';
            }
        }

        // veto hook
        foreach ($GLOBALS['TYPO3_CONF_VARS']['EXTCONF']['crawler']['pageVeto'] ?? [] as $key => $func) {
            $params = [
                'pageRow' => $pageRow,
            ];
            // expects "false" if page is ok and "true" or a skipMessage if this page should _not_ be crawled
            $veto = GeneralUtility::callUserFunction($func, $params, $this);
            if ($veto === false) {
                continue;
            }

            // no need to execute other hooks if a previous one return a veto
            if (is_string($veto)) {
                return $veto;
            }

            // The key is cast because hooks registered with "[] =" have integer keys,
            // and htmlspecialchars() rejects an int when strict_types is enabled.
            return 'Veto from hook "' . htmlspecialchars((string) $key) . '"';
        }

        return false;
    }
378
379
    /**
380
     * Wrapper method for getUrlsForPageId()
381
     * It returns an array of configurations and no urls!
382
     *
383
     * @param array $pageRow Page record with at least dok-type and uid columns.
384
     * @param string $skipMessage
385
     * @return array
386
     * @see getUrlsForPageId()
387
     */
388 6
    public function getUrlsForPageRow(array $pageRow, &$skipMessage = '')
    {
        $message = $this->checkIfPageShouldBeSkipped($pageRow);

        if ($message !== false) {
            // Page is skipped: report the reason through the reference parameter.
            $skipMessage = $message;

            return [];
        }

        // getUrlsForPageId() declares an int parameter and this file uses strict_types,
        // so a uid delivered as a string must be cast before the call.
        $res = $this->getUrlsForPageId((int) $pageRow['uid']);
        $skipMessage = '';

        return $res;
    }
402
403
    /**
404
     * Creates a list of URLs from input array (and submits them to queue if asked for)
405
     * See Web > Info module script + "indexed_search"'s crawler hook-client using this!
406
     *
407
     * @param array $vv Information about URLs from pageRow to crawl.
408
     * @param array $pageRow Page row
409
     * @param int $scheduledTime Unix time to schedule indexing to, typically time()
410
     * @param int $reqMinute Number of requests per minute (creates the interleave between requests)
411
     * @param bool $submitCrawlUrls If set, submits the URLs to queue
412
     * @param bool $downloadCrawlUrls If set (and $submitCrawlUrls is false), $downloadUrls will be filled with entries
413
     * @param array $duplicateTrack Array which is passed by reference and contains a key per URL to ensure we will not crawl duplicates
414
     * @param array $downloadUrls Array which will be filled with URLS for download if flag is set.
415
     * @param array $incomingProcInstructions Array of processing instructions
416
     * @return string List of URLs (meant for display in backend module)
417
     */
418 4
    public function urlListFromUrlArray(
        array $vv,
        array $pageRow,
        $scheduledTime,
        $reqMinute,
        $submitCrawlUrls,
        $downloadCrawlUrls,
        array &$duplicateTrack,
        array &$downloadUrls,
        array $incomingProcInstructions
    ) {
        if (! is_array($vv['URLs'] ?? null)) {
            return 'ERROR - no URL generated';
        }

        // Read the sub configuration once; missing entries are treated as empty strings.
        $subCfg = $vv['subCfg'] ?? [];
        $procInstrFilter = (string) ($subCfg['procInstrFilter'] ?? '');
        $userGroups = (string) ($subCfg['userGroups'] ?? '');

        $urlLog = [];
        $pageId = (int) $pageRow['uid'];
        $configurationHash = $this->getConfigurationHash($vv);
        $skipInnerCheck = $this->queueRepository->noUnprocessedQueueEntriesForPageWithConfigurationHashExist($pageId, $configurationHash);

        // The filter depends only on the configuration and the incoming instructions,
        // not on the individual URL, so it is evaluated once instead of per URL.
        if (! $this->drawURLs_PIfilter($procInstrFilter, $incomingProcInstructions)) {
            return implode('<br>', $urlLog);
        }

        // Seconds between two scheduled requests. A rate of 0 would be a division
        // by zero, so in that case no interleave is applied.
        $secondsPerRequest = empty($reqMinute) ? 0 : 60 / $reqMinute;

        foreach ($vv['URLs'] as $urlQuery) {
            $url = (string) $this->getUrlFromPageAndQueryParameters(
                $pageId,
                $urlQuery,
                $subCfg['baseUrl'] ?? null,
                $subCfg['force_ssl'] ?? 0
            );

            // Create key by which to determine unique-ness:
            $uKey = $url . '|' . $userGroups . '|' . $procInstrFilter;

            if (isset($duplicateTrack[$uKey])) {
                // if the url key is registered just display it and do not resubmit it
                $urlLog[] = '<em><span class="text-muted">' . htmlspecialchars($url) . '</span></em>';
            } else {
                // Scheduled time, rounded down to a full minute:
                $schTime = $scheduledTime + round(count($duplicateTrack) * $secondsPerRequest);
                $schTime = intval($schTime / 60) * 60;
                $formattedDate = BackendUtility::datetime($schTime);
                // $this->urlList keeps the raw URL, the returned log the HTML-escaped one.
                $this->urlList[] = '[' . $formattedDate . '] ' . $url;
                $urlList = '[' . $formattedDate . '] ' . htmlspecialchars($url);

                // Submit for crawling!
                if ($submitCrawlUrls) {
                    $added = $this->addUrl(
                        $pageId,
                        $url,
                        $subCfg,
                        $scheduledTime,
                        $configurationHash,
                        $skipInnerCheck
                    );
                    if ($added === false) {
                        $urlList .= ' (URL already existed)';
                    }
                } elseif ($downloadCrawlUrls) {
                    $downloadUrls[$url] = $url;
                }
                $urlLog[] = $urlList;
            }
            $duplicateTrack[$uKey] = true;
        }

        return implode('<br>', $urlLog);
    }
485
486
    /**
487
     * Returns true if input processing instruction is among registered ones.
488
     *
489
     * @param string $piString PI to test
490
     * @param array $incomingProcInstructions Processing instructions
491
     * @return boolean
492
     */
493 5
    public function drawURLs_PIfilter($piString, array $incomingProcInstructions)
    {
        // Without any incoming instruction nothing is filtered out.
        if ($incomingProcInstructions === []) {
            return true;
        }

        // One incoming instruction contained in the comma list is sufficient.
        $matched = false;
        foreach ($incomingProcInstructions as $instruction) {
            if (GeneralUtility::inList($piString, $instruction)) {
                $matched = true;
                break;
            }
        }

        return $matched;
    }
506
507 5
    /**
     * Returns the page TSconfig for the given page, or for the page taken from
     * $this->MP when that property is set, after passing it through the
     * "getPageTSconfigForId" hooks registered in EXTCONF.
     *
     * @param int $id Page uid (the callers in this class pass an int)
     */
    public function getPageTSconfigForId($id): array
    {
        if (! $this->MP) {
            $pageTSconfig = BackendUtility::getPagesTSconfig($id);
        } else {
            // The second "-"-separated segment of $this->MP is used as page id.
            // NOTE(review): the property is documented as bool, which does not fit
            // this code; the casts below keep it from failing under strict_types
            // and when no "-" is present. Confirm the intended MP format.
            $mountPointParts = explode('-', (string) $this->MP);
            $mountPointId = (int) ($mountPointParts[1] ?? 0);
            $pageTSconfig = BackendUtility::getPagesTSconfig($mountPointId);
        }

        // Call a hook to alter configuration
        $hooks = $GLOBALS['TYPO3_CONF_VARS']['EXTCONF']['crawler']['getPageTSconfigForId'] ?? null;
        if (is_array($hooks)) {
            // pageTSConfig is handed over by reference so that hooks can modify it.
            $params = [
                'pageId' => $id,
                'pageTSConfig' => &$pageTSconfig,
            ];
            foreach ($hooks as $userFunc) {
                GeneralUtility::callUserFunction($userFunc, $params, $this);
            }
        }

        return $pageTSconfig;
    }
529
530
    /**
531
     * This method returns an array of configurations.
532
     * Adds no urls!
533
     */
534 4
    public function getUrlsForPageId(int $pageId): array
    {
        // Get page TSconfig for page ID
        $pageTSconfig = $this->getPageTSconfigForId($pageId);

        $res = [];

        // Fetch Crawler Configuration from pageTSconfig
        $crawlerCfg = $pageTSconfig['tx_crawler.']['crawlerCfg.']['paramSets.'] ?? [];
        foreach ($crawlerCfg as $key => $values) {
            if (! is_array($values)) {
                continue;
            }
            $key = str_replace('.', '', $key);
            // Sub configuration for a single configuration string:
            $subCfg = (array) ($crawlerCfg[$key . '.'] ?? []);
            $subCfg['key'] = $key;

            // Values are cast to string before comparing: strcmp() on a missing
            // (null) entry is a TypeError when strict_types is enabled.
            $procInstrFilter = (string) ($subCfg['procInstrFilter'] ?? '');
            if ($procInstrFilter !== '') {
                $subCfg['procInstrFilter'] = implode(',', GeneralUtility::trimExplode(',', $procInstrFilter));
            }
            $pidsOnly = (string) ($subCfg['pidsOnly'] ?? '');
            $pidOnlyList = implode(',', GeneralUtility::trimExplode(',', $pidsOnly, true));

            // process configuration if it is not page-specific or if the specific page is the current page:
            // TODO: Check if $pidOnlyList can be kept as Array instead of imploded
            if ($pidsOnly !== '' && ! GeneralUtility::inList($pidOnlyList, strval($pageId))) {
                continue;
            }

            // Explode, process etc.:
            $res[$key] = [];
            $res[$key]['subCfg'] = $subCfg;
            $res[$key]['paramParsed'] = GeneralUtility::explodeUrl2Array((string) ($crawlerCfg[$key] ?? ''));
            $res[$key]['paramExpanded'] = $this->expandParameters($res[$key]['paramParsed'], $pageId);
            $res[$key]['origin'] = 'pagets';

            // recognize MP value
            $baseQuery = '?id=' . $pageId;
            if ($this->MP) {
                $baseQuery .= '&MP=' . $this->MP;
            }
            $res[$key]['URLs'] = $this->compileUrls($res[$key]['paramExpanded'], [$baseQuery]);
        }

        // Get configuration from tx_crawler_configuration records up the rootline
        $crawlerConfigurations = $this->configurationRepository->getCrawlerConfigurationRecordsFromRootLine($pageId);
        foreach ($crawlerConfigurations as $configurationRecord) {
            // check access to the configuration record
            $hasAccess = empty($configurationRecord['begroups'])
                || $this->getBackendUser()->isAdmin()
                || $this->hasGroupAccess($this->getBackendUser()->user['usergroup_cached_list'], $configurationRecord['begroups']);
            if (! $hasAccess) {
                continue;
            }

            $pidsOnly = (string) ($configurationRecord['pidsonly'] ?? '');
            $pidOnlyList = implode(',', GeneralUtility::trimExplode(',', $pidsOnly, true));

            // process configuration if it is not page-specific or if the specific page is the current page:
            // TODO: Check if $pidOnlyList can be kept as Array instead of imploded
            if ($pidsOnly !== '' && ! GeneralUtility::inList($pidOnlyList, strval($pageId))) {
                continue;
            }

            $key = $configurationRecord['name'];

            // don't overwrite previously defined paramSets
            if (isset($res[$key])) {
                continue;
            }

            /* @var $TSparserObject \TYPO3\CMS\Core\TypoScript\Parser\TypoScriptParser */
            $TSparserObject = GeneralUtility::makeInstance(TypoScriptParser::class);
            $TSparserObject->parse($configurationRecord['processing_instruction_parameters_ts']);

            $subCfg = [
                'procInstrFilter' => $configurationRecord['processing_instruction_filter'],
                'procInstrParams.' => $TSparserObject->setup,
                'baseUrl' => $configurationRecord['base_url'],
                'force_ssl' => (int) $configurationRecord['force_ssl'],
                'userGroups' => $configurationRecord['fegroups'],
                'exclude' => $configurationRecord['exclude'],
                'key' => $key,
            ];

            // Pages listed in the record's exclude string get no entry for this record.
            if (in_array($pageId, $this->expandExcludeString($subCfg['exclude']), true)) {
                continue;
            }

            $res[$key] = [];
            $res[$key]['subCfg'] = $subCfg;
            $res[$key]['paramParsed'] = GeneralUtility::explodeUrl2Array($configurationRecord['configuration']);
            $res[$key]['paramExpanded'] = $this->expandParameters($res[$key]['paramParsed'], $pageId);
            $res[$key]['URLs'] = $this->compileUrls($res[$key]['paramExpanded'], ['?id=' . $pageId]);
            $res[$key]['origin'] = 'tx_crawler_configuration_' . $configurationRecord['uid'];
        }

        // Hooks receive the result by reference and may alter it.
        foreach ($GLOBALS['TYPO3_CONF_VARS']['EXTCONF']['crawler']['processUrls'] ?? [] as $func) {
            $params = [
                'res' => &$res,
            ];
            GeneralUtility::callUserFunction($func, $params, $this);
        }

        return $res;
    }
628
629
    /**
630
     * Find all configurations of subpages of a page
631
     * TODO: Write Functional Tests
632
     */
633 1
    public function getConfigurationsForBranch(int $rootid, int $depth): array
    {
        $names = [];

        // Part 1: names of the paramSets found in the page TSconfig of the root page.
        $tsConfig = $this->getPageTSconfigForId($rootid);
        $paramSets = $tsConfig['tx_crawler.']['crawlerCfg.']['paramSets.'] ?? [];
        foreach ($paramSets as $setKey => $setValue) {
            if (is_array($setValue)) {
                // Only a single trailing dot is removed from the key.
                if (substr($setKey, -1) === '.') {
                    $names[] = substr($setKey, 0, -1);
                } else {
                    $names[] = $setKey;
                }
            }
        }

        // Part 2: collect page uids, first from the rootline of the root page ...
        $pageUids = [];
        foreach (BackendUtility::BEgetRootLine($rootid) as $rootLinePage) {
            $pageUids[] = $rootLinePage['uid'];
        }

        // ... then from the page tree below it, limited by the show-permission
        // clause of the backend user.
        /* @var PageTreeView $pageTree */
        $pageTree = GeneralUtility::makeInstance(PageTreeView::class);
        $permissionClause = $this->getBackendUser()->getPagePermsClause(Permission::PAGE_SHOW);
        if (empty($permissionClause)) {
            $pageTree->init('');
        } else {
            $pageTree->init('AND ' . $permissionClause);
        }
        $pageTree->getTree($rootid, $depth, '');
        foreach ($pageTree->tree as $treeNode) {
            $pageUids[] = $treeNode['row']['uid'];
        }

        // Part 3: names of all configuration records stored on one of these pages.
        $queryBuilder = $this->getQueryBuilder('tx_crawler_configuration');
        $pidConstraint = $queryBuilder->expr()->in(
            'pid',
            $queryBuilder->createNamedParameter($pageUids, Connection::PARAM_INT_ARRAY)
        );
        $statement = $queryBuilder
            ->select('name')
            ->from('tx_crawler_configuration')
            ->where($pidConstraint)
            ->execute();

        while ($record = $statement->fetch()) {
            $names[] = $record['name'];
        }

        return $names;
    }
672
673
    /**
674
     * Check if a user has access to an item
675
     * (e.g. get the group list of the current logged in user from $GLOBALS['TSFE']->gr_list)
676
     *
677
     * @param string $groupList Comma-separated list of (fe_)group UIDs from a user
678
     * @param string $accessList Comma-separated list of (fe_)group UIDs of the item to access
679
     * @return bool                 TRUE if at least one of the users group UIDs is in the access list or the access list is empty
680
     * @see \TYPO3\CMS\Frontend\Page\PageRepository::getMultipleGroupsWhereClause()
681
     */
682 3
    public function hasGroupAccess($groupList, $accessList)
    {
        // An empty access list grants access to everybody.
        if (empty($accessList)) {
            return true;
        }

        // Otherwise one of the user's group uids has to appear in the access list.
        $userGroupUids = GeneralUtility::intExplode(',', $groupList);
        $granted = false;
        foreach ($userGroupUids as $userGroupUid) {
            if (GeneralUtility::inList($accessList, $userGroupUid)) {
                $granted = true;
                break;
            }
        }

        return $granted;
    }
694
695
    /**
696
     * Will expand the parameters configuration to individual values. This follows a certain syntax of the value of each parameter.
697
     * Syntax of values:
698
     * - Basically: If the value is wrapped in [...] it will be expanded according to the following syntax, otherwise the value is taken literally
699
     * - Configuration is split by "|" and the parts are processed individually and finally added together
700
     * - For each configuration part:
701
     *         - "[int]-[int]" = Integer range, will be expanded to all values in between, values included, starting from low to high (max. 1000). Example "1-34" or "-40--30"
702
     *         - "_TABLE:[TCA table name];[_PID:[optional page id, default is current page]];[_ENABLELANG:1]" = Look up of table records from PID, filtering out deleted records. Example "_TABLE:tt_content; _PID:123"
703
     *        _ENABLELANG:1 picks only original records without their language overlays
704
     *         - Default: Literal value
705
     *
706
     * @param array $paramArray Array with key (GET var name) and values (value of GET var which is configuration for expansion)
707
     * @param integer $pid Current page ID
708
     * @return array
709
     *
710
     * TODO: Write Functional Tests
711
     */
712 11
    public function expandParameters($paramArray, $pid)
713
    {
714
        // Traverse parameter names:
715 11
        foreach ($paramArray as $p => $v) {
716 11
            $v = trim($v);
717
718
            // If value is encapsulated in square brackets it means there are some ranges of values to find, otherwise the value is literal
719 11
            if (strpos($v, '[') === 0 && substr($v, -1) === ']') {
720
                // So, find the value inside brackets and reset the paramArray value as an array.
721 11
                $v = substr($v, 1, -1);
722 11
                $paramArray[$p] = [];
723
724
                // Explode parts and traverse them:
725 11
                $parts = explode('|', $v);
726 11
                foreach ($parts as $pV) {
727
728
                    // Look for integer range: (fx. 1-34 or -40--30 // reads minus 40 to minus 30)
729 11
                    if (preg_match('/^(-?[0-9]+)\s*-\s*(-?[0-9]+)$/', trim($pV), $reg)) {
730 1
                        $reg = $this->swapIfFirstIsLargerThanSecond($reg);
731
732
                        // Traverse range, add values:
733 1
                        $runAwayBrake = 1000; // Limit to size of range!
734 1
                        for ($a = $reg[1]; $a <= $reg[2]; $a++) {
735 1
                            $paramArray[$p][] = $a;
736 1
                            $runAwayBrake--;
737 1
                            if ($runAwayBrake <= 0) {
738
                                break;
739
                            }
740
                        }
741 10
                    } elseif (strpos(trim($pV), '_TABLE:') === 0) {
742
743
                        // Parse parameters:
744 6
                        $subparts = GeneralUtility::trimExplode(';', $pV);
745 6
                        $subpartParams = [];
746 6
                        foreach ($subparts as $spV) {
747 6
                            [$pKey, $pVal] = GeneralUtility::trimExplode(':', $spV);
748 6
                            $subpartParams[$pKey] = $pVal;
749
                        }
750
751
                        // Table exists:
752 6
                        if (isset($GLOBALS['TCA'][$subpartParams['_TABLE']])) {
753 6
                            $lookUpPid = isset($subpartParams['_PID']) ? intval($subpartParams['_PID']) : intval($pid);
754 6
                            $recursiveDepth = isset($subpartParams['_RECURSIVE']) ? intval($subpartParams['_RECURSIVE']) : 0;
755 6
                            $pidField = isset($subpartParams['_PIDFIELD']) ? trim($subpartParams['_PIDFIELD']) : 'pid';
756 6
                            $where = $subpartParams['_WHERE'] ?? '';
757 6
                            $addTable = $subpartParams['_ADDTABLE'] ?? '';
758
759 6
                            $fieldName = $subpartParams['_FIELD'] ? $subpartParams['_FIELD'] : 'uid';
760 6
                            if ($fieldName === 'uid' || $GLOBALS['TCA'][$subpartParams['_TABLE']]['columns'][$fieldName]) {
761 6
                                $queryBuilder = $this->getQueryBuilder($subpartParams['_TABLE']);
762
763 6
                                if ($recursiveDepth > 0) {
764
                                    /** @var \TYPO3\CMS\Core\Database\QueryGenerator $queryGenerator */
765 2
                                    $queryGenerator = GeneralUtility::makeInstance(\TYPO3\CMS\Core\Database\QueryGenerator::class);
766 2
                                    $pidList = $queryGenerator->getTreeList($lookUpPid, $recursiveDepth, 0, 1);
767 2
                                    $pidArray = GeneralUtility::intExplode(',', $pidList);
768
                                } else {
769 4
                                    $pidArray = [(string) $lookUpPid];
770
                                }
771
772 6
                                $queryBuilder->getRestrictions()
773 6
                                    ->removeAll()
774 6
                                    ->add(GeneralUtility::makeInstance(DeletedRestriction::class));
775
776
                                $queryBuilder
777 6
                                    ->select($fieldName)
778 6
                                    ->from($subpartParams['_TABLE'])
779 6
                                    ->where(
780 6
                                        $queryBuilder->expr()->in($pidField, $queryBuilder->createNamedParameter($pidArray, Connection::PARAM_INT_ARRAY)),
781 6
                                        $where
782
                                    );
783
784 6
                                if (! empty($addTable)) {
785
                                    // TODO: Check if this works as intended!
786
                                    $queryBuilder->add('from', $addTable);
787
                                }
788 6
                                $transOrigPointerField = $GLOBALS['TCA'][$subpartParams['_TABLE']]['ctrl']['transOrigPointerField'];
789
790 6
                                if ($subpartParams['_ENABLELANG'] && $transOrigPointerField) {
791
                                    $queryBuilder->andWhere(
792
                                        $queryBuilder->expr()->lte(
793
                                            $transOrigPointerField,
794
                                            0
795
                                        )
796
                                    );
797
                                }
798
799 6
                                $statement = $queryBuilder->execute();
800
801 6
                                $rows = [];
802 6
                                while ($row = $statement->fetch()) {
803 6
                                    $rows[$row[$fieldName]] = $row;
804
                                }
805
806 6
                                if (is_array($rows)) {
807 6
                                    $paramArray[$p] = array_merge($paramArray[$p], array_keys($rows));
808
                                }
809
                            }
810
                        }
811
                    } else { // Just add value:
812 4
                        $paramArray[$p][] = $pV;
813
                    }
814
                    // Hook for processing own expandParameters place holder
815 11
                    if (is_array($GLOBALS['TYPO3_CONF_VARS']['SC_OPTIONS']['crawler/class.tx_crawler_lib.php']['expandParameters'])) {
816
                        $_params = [
817
                            'pObj' => &$this,
818
                            'paramArray' => &$paramArray,
819
                            'currentKey' => $p,
820
                            'currentValue' => $pV,
821
                            'pid' => $pid,
822
                        ];
823
                        foreach ($GLOBALS['TYPO3_CONF_VARS']['SC_OPTIONS']['crawler/class.tx_crawler_lib.php']['expandParameters'] as $_funcRef) {
824
                            GeneralUtility::callUserFunction($_funcRef, $_params, $this);
825
                        }
826
                    }
827
                }
828
829
                // Make unique set of values and sort array by key:
830 11
                $paramArray[$p] = array_unique($paramArray[$p]);
831 11
                ksort($paramArray);
832
            } else {
833
                // Set the literal value as only value in array:
834 4
                $paramArray[$p] = [$v];
835
            }
836
        }
837
838 11
        return $paramArray;
839
    }
840
841
    /**
     * Compiling URLs from parameter array (output of expandParameters())
     * The number of URLs will be the multiplication of the number of parameter values for each key,
     * capped at $this->maximumUrlsToCompile per recursion level.
     *
     * Each recursion step consumes the first parameter of $paramArray and appends "&name=value"
     * (raw-URL-encoded) to every URL for every value of that parameter. Empty values leave the URL untouched.
     *
     * @param array $paramArray Output of expandParameters(): Array with keys (GET var names) and for each an array of values
     * @param array $urls URLs accumulated in this array (for recursion)
     * @return array
     */
    public function compileUrls($paramArray, array $urls)
    {
        if (empty($paramArray)) {
            return $urls;
        }

        // shift first off stack:
        reset($paramArray);
        // Array keys may be integers; rawurlencode() only accepts strings under strict_types.
        $varName = (string) key($paramArray);
        $valueSet = array_shift($paramArray);

        // Traverse value set:
        $newUrls = [];
        foreach ($urls as $url) {
            foreach ($valueSet as $val) {
                // Stop both loops once the limit is reached; leaving only the inner loop would let
                // the list keep growing by one URL per remaining base URL.
                if (count($newUrls) >= $this->maximumUrlsToCompile) {
                    break 2;
                }

                $value = (string) $val;
                $newUrls[] = $url . ($value !== '' ? '&' . rawurlencode($varName) . '=' . rawurlencode($value) : '');
            }
        }

        return $this->compileUrls($paramArray, $newUrls);
    }
872
873
    /************************************
874
     *
875
     * Crawler log
876
     *
877
     ************************************/
878
879
    /**
     * Return array of records from crawler queue for input page ID
     *
     * The records are ordered by "scheduled" (descending). If $doFlush is set, the queue repository is asked
     * to flush before the records are read; the select is still executed afterwards and its result returned.
     *
     * NOTE(review): the flush call receives only the filter (or 'all'), not the page id - confirm that flushing
     * is really meant to be independent of $id.
     *
     * @param int $id Page ID for which to look up log entries.
     * @param string $filter Filter: "all" => all entries, "pending" => all that is not yet run, "finished" => all complete ones
     * @param bool $doFlush If TRUE, the queue is flushed (using $filter) before the entries are selected
     * @param bool $doFullFlush If TRUE (and $doFlush is set), the flush is done with 'all' instead of $filter
     * @param int $itemsPerPage Limit the amount of entries per page default is 10; values <= 0 disable the limit
     * @return array
     */
    public function getLogEntriesForPageId($id, $filter = '', $doFlush = false, $doFullFlush = false, $itemsPerPage = 10)
    {
        $queryBuilder = GeneralUtility::makeInstance(ConnectionPool::class)->getQueryBuilderForTable($this->tableName);
        $queryBuilder
            ->select('*')
            ->from($this->tableName)
            ->where(
                $queryBuilder->expr()->eq('page_id', $queryBuilder->createNamedParameter($id, \PDO::PARAM_INT))
            )
            ->orderBy('scheduled', 'DESC');

        // Any other filter value (including "all") selects every entry of the page.
        switch ($filter) {
            case 'pending':
                $queryBuilder->andWhere($queryBuilder->expr()->eq('exec_time', 0));
                break;
            case 'finished':
                $queryBuilder->andWhere($queryBuilder->expr()->gt('exec_time', 0));
                break;
        }

        if ($doFlush) {
            $this->queueRepository->flushQueue($doFullFlush ? 'all' : $filter);
        }

        if ($itemsPerPage > 0) {
            $queryBuilder->setMaxResults((int) $itemsPerPage);
        }

        return $queryBuilder->execute()->fetchAll();
    }
930
931
    /**
     * Return array of records from crawler queue for input set ID
     *
     * When $doFlush is set nothing is selected: the matching entries are flushed instead and an empty array
     * is returned.
     *
     * @param int $set_id Set ID for which to look up log entries.
     * @param string $filter Filter: "all" => all entries, "pending" => all that is not yet run, "finished" => all complete ones
     * @param bool $doFlush If TRUE, then entries selected at DELETED(!) instead of selected!
     * @param bool $doFullFlush If TRUE (and $doFlush is set), the flush is done without any condition
     * @param int $itemsPerPage Limit the amount of entries per page default is 10; values <= 0 disable the limit
     * @return array
     *
     * @deprecated
     */
    public function getLogEntriesForSetId(int $set_id, string $filter = '', bool $doFlush = false, bool $doFullFlush = false, int $itemsPerPage = 10)
    {
        if ($doFlush) {
            // Build the flush condition as one AND-composite: optional exec_time filter first, then the set id.
            $expressionBuilder = GeneralUtility::makeInstance(ConnectionPool::class)
                ->getConnectionForTable($this->tableName)
                ->getExpressionBuilder();
            $flushCondition = $expressionBuilder->andX();

            switch ($filter) {
                case 'pending':
                    $flushCondition->add($expressionBuilder->eq('exec_time', 0));
                    break;
                case 'finished':
                    $flushCondition->add($expressionBuilder->gt('exec_time', 0));
                    break;
            }

            // add() returns the composite itself, so $addWhere carries all conditions added so far.
            $addWhere = $flushCondition->add($expressionBuilder->eq('set_id', (int) $set_id));
            $this->flushQueue($doFullFlush ? '' : $addWhere);

            return [];
        }

        $queryBuilder = GeneralUtility::makeInstance(ConnectionPool::class)->getQueryBuilderForTable($this->tableName);
        $queryBuilder
            ->select('*')
            ->from($this->tableName)
            ->where(
                $queryBuilder->expr()->eq('set_id', $queryBuilder->createNamedParameter($set_id, \PDO::PARAM_INT))
            )
            ->orderBy('scheduled', 'DESC');

        // Any other filter value (including "all") selects every entry of the set.
        switch ($filter) {
            case 'pending':
                $queryBuilder->andWhere($queryBuilder->expr()->eq('exec_time', 0));
                break;
            case 'finished':
                $queryBuilder->andWhere($queryBuilder->expr()->gt('exec_time', 0));
                break;
        }

        if ($itemsPerPage > 0) {
            $queryBuilder->setMaxResults((int) $itemsPerPage);
        }

        return $queryBuilder->execute()->fetchAll();
    }
983
984
    /**
     * Inserts a call back entry into the crawler queue (typically called from hooks, see indexed search
     * class "class.crawler.php").
     *
     * The call back reference is stored inside the JSON encoded parameters under the key "_CALLBACKOBJ".
     *
     * @param int $setId Set ID
     * @param array $params Parameters to pass to call back function; anything that is not an array is replaced by an empty array
     * @param string $callBack Call back object reference, eg. 'EXT:indexed_search/class.crawler.php:&tx_indexedsearch_crawler'
     * @param int $page_id Page ID to attach it to
     * @param int $schedule Time at which to activate; 0 (or empty) means "now"
     */
    public function addQueueEntry_callBack($setId, $params, $callBack, $page_id = 0, $schedule = 0): void
    {
        $callBackParameters = is_array($params) ? $params : [];
        $callBackParameters['_CALLBACKOBJ'] = $callBack;

        $queueRecord = [
            'page_id' => (int) $page_id,
            'parameters' => json_encode($callBackParameters),
            // Fall back to the current time when no schedule time was given.
            'scheduled' => (int) $schedule ?: $this->getCurrentTime(),
            'exec_time' => 0,
            'set_id' => (int) $setId,
            'result_data' => '',
        ];

        $queueConnection = GeneralUtility::makeInstance(ConnectionPool::class)->getConnectionForTable('tx_crawler_queue');
        $queueConnection->insert('tx_crawler_queue', $queueRecord);
    }
1013
1014
    /************************************
1015
     *
1016
     * URL setting
1017
     *
1018
     ************************************/
1019
1020
    /**
     * Setting a URL for crawling:
     *
     * Builds the queue record for the URL and either registers it internally only
     * ($this->registerQueueEntriesInternallyOnly) or inserts it into tx_crawler_queue, unless an equal entry
     * is already found by the duplication check. A signal is emitted for both the "added" and the
     * "duplicate" case.
     *
     * @param int $id Page ID
     * @param string $url Complete URL
     * @param array $subCfg Sub configuration array (from TS config); the keys "userGroups", "procInstrFilter",
     *                      "procInstrParams." and "key" are read and are all optional
     * @param int $tstamp Scheduled-time
     * @param string $configurationHash (optional) configuration hash
     * @param bool $skipInnerDuplicationCheck (optional) skip inner duplication check
     * @return bool TRUE only if a new row was inserted into the queue table
     */
    public function addUrl(
        $id,
        $url,
        array $subCfg,
        $tstamp,
        $configurationHash = '',
        $skipInnerDuplicationCheck = false
    ) {
        $urlAdded = false;
        $rows = [];

        // Creating parameters:
        $parameters = [
            'url' => $url,
        ];

        // fe user group simulation:
        $uGs = implode(',', array_unique(GeneralUtility::intExplode(',', $subCfg['userGroups'] ?? '', true)));
        if ($uGs) {
            $parameters['feUserGroupList'] = $uGs;
        }

        // Setting processing instructions
        $parameters['procInstructions'] = GeneralUtility::trimExplode(',', $subCfg['procInstrFilter'] ?? '');
        $procInstrParams = $subCfg['procInstrParams.'] ?? null;
        if (is_array($procInstrParams)) {
            $parameters['procInstrParams'] = $procInstrParams;
        }

        // Compile value array:
        $parameters_serialized = json_encode($parameters);
        $fieldArray = [
            'page_id' => (int) $id,
            'parameters' => $parameters_serialized,
            'parameters_hash' => GeneralUtility::shortMD5($parameters_serialized),
            'configuration_hash' => $configurationHash,
            'scheduled' => $tstamp,
            'exec_time' => 0,
            'set_id' => (int) $this->setID,
            'result_data' => '',
            'configuration' => $subCfg['key'] ?? null,
        ];

        if ($this->registerQueueEntriesInternallyOnly) {
            //the entries will only be registered and not stored to the database
            $this->queueEntries[] = $fieldArray;

            return $urlAdded;
        }

        if (! $skipInnerDuplicationCheck) {
            // check if there is already an equal entry
            $rows = $this->getDuplicateRowsIfExist($tstamp, $fieldArray);
        }

        if (empty($rows)) {
            $connectionForCrawlerQueue = GeneralUtility::makeInstance(ConnectionPool::class)->getConnectionForTable('tx_crawler_queue');
            $connectionForCrawlerQueue->insert(
                'tx_crawler_queue',
                $fieldArray
            );
            $uid = $connectionForCrawlerQueue->lastInsertId('tx_crawler_queue', 'qid');
            $rows[] = $uid;
            $urlAdded = true;

            $signalPayload = ['uid' => $uid, 'fieldArray' => $fieldArray];
            SignalSlotUtility::emitSignal(
                self::class,
                SignalSlotUtility::SIGNAL_URL_ADDED_TO_QUEUE,
                $signalPayload
            );
        } else {
            $signalPayload = ['rows' => $rows, 'fieldArray' => $fieldArray];
            SignalSlotUtility::emitSignal(
                self::class,
                SignalSlotUtility::SIGNAL_DUPLICATE_URL_IN_QUEUE,
                $signalPayload
            );
        }

        return $urlAdded;
    }
1110
1111
    /**
     * Provides the current system time.
     *
     * @return int Unix timestamp as reported by PHP's time()
     */
    public function getCurrentTime()
    {
        $now = time();

        return $now;
    }
1120
1121
    /************************************
1122
     *
1123
     * URL reading
1124
     *
1125
     ************************************/
1126
1127
    /**
     * Read URL for single queue entry
     *
     * Loads the queue record, marks it as executed (exec_time), lets the queue executor process it and
     * finally stores the JSON encoded result in "result_data". Signals are emitted before and after processing.
     *
     * @param int $queueId
     * @param bool $force If set, will process even if exec_time has been set!
     * @return int|null 0, or self::CLI_STATUS_POLLABLE_PROCESSED if a registered pollable reported success;
     *                  NULL if no matching queue record was found
     */
    public function readUrl($queueId, $force = false)
    {
        $queryBuilder = GeneralUtility::makeInstance(ConnectionPool::class)->getQueryBuilderForTable($this->tableName);
        $ret = 0;
        $this->logger->debug('crawler-readurl start ' . microtime(true));

        // Get entry:
        $queryBuilder
            ->select('*')
            ->from('tx_crawler_queue')
            ->where(
                $queryBuilder->expr()->eq('qid', $queryBuilder->createNamedParameter($queueId, \PDO::PARAM_INT))
            );
        if (! $force) {
            // Without $force only entries that are not executed yet but assigned to a process are picked up.
            $queryBuilder
                ->andWhere('exec_time = 0')
                ->andWhere('process_scheduled > 0');
        }
        $queueRec = $queryBuilder->execute()->fetch();

        if (! is_array($queueRec)) {
            return;
        }

        SignalSlotUtility::emitSignal(
            self::class,
            SignalSlotUtility::SIGNAL_QUEUEITEM_PREPROCESS,
            [$queueId, &$queueRec]
        );

        // Set exec_time to lock record:
        $field_array = ['exec_time' => $this->getCurrentTime()];

        if (isset($this->processID)) {
            //if mulitprocessing is used we need to store the id of the process which has handled this entry
            $field_array['process_id_completed'] = $this->processID;
        }

        GeneralUtility::makeInstance(ConnectionPool::class)->getConnectionForTable('tx_crawler_queue')
            ->update(
                'tx_crawler_queue',
                $field_array,
                ['qid' => (int) $queueId]
            );

        $result = $this->queueExecutor->executeQueueItem($queueRec, $this);
        if ($result['content'] === null) {
            $resultData = 'An errors happened';
        } else {
            /** @var JsonCompatibilityConverter $jsonCompatibilityConverter */
            $jsonCompatibilityConverter = GeneralUtility::makeInstance(JsonCompatibilityConverter::class);
            $resultData = $jsonCompatibilityConverter->convert($result['content']);
        }

        //atm there's no need to point to specific pollable extensions
        $pollables = $GLOBALS['TYPO3_CONF_VARS']['EXTCONF']['crawler']['pollSuccess'] ?? null;
        // $resultData is only an array when the content could be converted; never index into the
        // error string (or a boolean) as if it were one.
        $procInstructions = is_array($resultData) ? ($resultData['parameters']['procInstructions'] ?? null) : null;
        if (is_array($pollables) && is_array($procInstructions)) {
            foreach ($pollables as $pollable) {
                // only check the success value if the instruction is runnig
                // it is important to name the pollSuccess key same as the procInstructions key
                if (in_array($pollable, $procInstructions, true) && ! empty($resultData['success'][$pollable])) {
                    $ret |= self::CLI_STATUS_POLLABLE_PROCESSED;
                }
            }
        }

        // Set result in log which also denotes the end of the processing of this entry.
        $field_array = ['result_data' => json_encode($result)];

        SignalSlotUtility::emitSignal(
            self::class,
            SignalSlotUtility::SIGNAL_QUEUEITEM_POSTPROCESS,
            [$queueId, &$field_array]
        );

        GeneralUtility::makeInstance(ConnectionPool::class)->getConnectionForTable('tx_crawler_queue')
            ->update(
                'tx_crawler_queue',
                $field_array,
                ['qid' => (int) $queueId]
            );

        $this->logger->debug('crawler-readurl stop ' . microtime(true));
        return $ret;
    }
1224
1225
    /**
     * Read URL for not-yet-inserted log-entry
     *
     * The entry is inserted into the queue table with exec_time set, executed right away, and the JSON
     * encoded result is written back to the "result_data" column of the new row.
     *
     * @param array $field_array Queue field array,
     *
     * @return array|bool|string Whatever the queue executor returned for the entry
     */
    public function readUrlFromArray($field_array)
    {
        // Set exec_time to lock record:
        $field_array['exec_time'] = $this->getCurrentTime();

        $queueConnection = GeneralUtility::makeInstance(ConnectionPool::class)->getConnectionForTable($this->tableName);
        $queueConnection->insert($this->tableName, $field_array);

        // The executor gets the record including its freshly assigned queue id.
        $queueId = $queueConnection->lastInsertId($this->tableName, 'qid');
        $field_array['qid'] = $queueId;

        $executionResult = $this->queueExecutor->executeQueueItem($field_array, $this);

        // Set result in log which also denotes the end of the processing of this entry.
        $resultFields = ['result_data' => json_encode($executionResult)];

        // Passed by reference so that slots can adjust what gets written back.
        $signalArguments = [$queueId, &$resultFields];
        SignalSlotUtility::emitSignal(
            self::class,
            SignalSlotUtility::SIGNAL_QUEUEITEM_POSTPROCESS,
            $signalArguments
        );

        $queueConnection->update($this->tableName, $resultFields, ['qid' => $queueId]);

        return $executionResult;
    }
1262
1263
    /*****************************
1264
     *
1265
     * Compiling URLs to crawl - tools
1266
     *
1267
     *****************************/
1268
1269
    /**
     * Builds the page tree below (and including) the given page and collects the output of
     * drawURLs_addRowsForPage() for every page in it. Pages of type "mount point" additionally
     * contribute the pages of the tree they mount.
     *
     * The incoming arguments are stored on the controller ($this->scheduledTime, $this->reqMinute, ...)
     * and the duplicate tracking / download URL lists are reset before the tree is traversed.
     *
     * @param int $id Root page id to start from.
     * @param int $depth Depth of tree, 0=only id-page, 1= on sublevel, 99 = infinite
     * @param int $scheduledTime Unix Time when the URL is timed to be visited when put in queue
     * @param int $reqMinute Number of requests per minute (creates the interleave between requests)
     * @param bool $submitCrawlUrls If set, submits the URLs to queue in database (real crawling)
     * @param bool $downloadCrawlUrls If set (and submitcrawlUrls is false) will fill $downloadUrls with entries)
     * @param array $incomingProcInstructions Array of processing instructions
     * @param array $configurationSelection Array of configuration keys
     * @return string Concatenated output of drawURLs_addRowsForPage() for all traversed pages
     */
    public function getPageTreeAndUrls(
        $id,
        $depth,
        $scheduledTime,
        $reqMinute,
        $submitCrawlUrls,
        $downloadCrawlUrls,
        array $incomingProcInstructions,
        array $configurationSelection
    ) {
        $this->scheduledTime = $scheduledTime;
        $this->reqMinute = $reqMinute;
        $this->submitCrawlUrls = $submitCrawlUrls;
        $this->downloadCrawlUrls = $downloadCrawlUrls;
        $this->incomingProcInstructions = $incomingProcInstructions;
        $this->incomingConfigurationSelection = $configurationSelection;

        $this->duplicateTrack = [];
        $this->downloadUrls = [];

        // Drawing tree:
        /* @var PageTreeView $tree */
        $tree = GeneralUtility::makeInstance(PageTreeView::class);
        $perms_clause = $this->getBackendUser()->getPagePermsClause(Permission::PAGE_SHOW);
        $tree->init('AND ' . $perms_clause);

        $pageInfo = BackendUtility::readPageAccess($id, $perms_clause);
        if (is_array($pageInfo)) {
            // Set root row:
            $tree->tree[] = [
                'row' => $pageInfo,
                'HTML' => $this->iconFactory->getIconForRecord('pages', $pageInfo, Icon::SIZE_SMALL),
            ];
        }

        // Get branch beneath:
        if ($depth) {
            $tree->getTree($id, $depth, '');
        }

        // Traverse page tree:
        $code = '';

        foreach ($tree->tree as $data) {
            $this->MP = false;

            // recognize mount points
            // The doktype may arrive as a numeric string depending on how the row was fetched,
            // so normalise it before the strict comparison with the integer constant.
            if ((int) $data['row']['doktype'] === PageRepository::DOKTYPE_MOUNTPOINT) {
                $queryBuilder = GeneralUtility::makeInstance(ConnectionPool::class)->getQueryBuilderForTable('pages');
                $queryBuilder->getRestrictions()->removeAll()->add(GeneralUtility::makeInstance(DeletedRestriction::class));
                $mountpage = $queryBuilder
                    ->select('*')
                    ->from('pages')
                    ->where(
                        $queryBuilder->expr()->eq('uid', $queryBuilder->createNamedParameter($data['row']['uid'], \PDO::PARAM_INT))
                    )
                    ->execute()
                    ->fetchAll();
                $queryBuilder->resetRestrictions();

                // Without a page record there is nothing to mount; the page is then handled like any other.
                $mountRow = $mountpage[0] ?? null;
                if (is_array($mountRow)) {
                    // fetch mounted pages
                    $this->MP = $mountRow['mount_pid'] . '-' . $data['row']['uid'];

                    $mountTree = GeneralUtility::makeInstance(PageTreeView::class);
                    $mountTree->init('AND ' . $perms_clause);
                    $mountTree->getTree($mountRow['mount_pid'], $depth);

                    foreach ($mountTree->tree as $mountData) {
                        $code .= $this->drawURLs_addRowsForPage(
                            $mountData['row'],
                            $mountData['HTML'] . BackendUtility::getRecordTitle('pages', $mountData['row'], true)
                        );
                    }

                    // replace page when mount_pid_ol is enabled
                    if ($mountRow['mount_pid_ol']) {
                        $data['row']['uid'] = $mountRow['mount_pid'];
                    } else {
                        // if the mount_pid_ol is not set the MP must not be used for the mountpoint page
                        $this->MP = false;
                    }
                }
            }

            $code .= $this->drawURLs_addRowsForPage(
                $data['row'],
                $data['HTML'] . BackendUtility::getRecordTitle('pages', $data['row'], true)
            );
        }

        return $code;
    }
1371
1372
    /**
     * Expands exclude string
     *
     * The string is a comma separated list of parts in the form "pid" or "pid+depth". A part without
     * depth stands for the page only; "pid+" (a plus sign without a depth) is expanded with depth 99.
     * For every part with a depth greater than 0 the uids of the pages found by the page tree are added.
     *
     * Results (and the trees read for them) are kept in static variables, i.e. they are reused by later
     * calls within the same PHP process.
     *
     * @param string $excludeString Exclude string
     * @return array List of unique page ids
     */
    public function expandExcludeString($excludeString)
    {
        // internal static caches;
        static $expandedExcludeStringCache;
        static $treeCache;

        if (empty($expandedExcludeStringCache[$excludeString])) {
            $pidList = [];

            if (! empty($excludeString)) {
                /** @var PageTreeView $tree */
                $tree = GeneralUtility::makeInstance(PageTreeView::class);
                $tree->init('AND ' . $this->getBackendUser()->getPagePermsClause(Permission::PAGE_SHOW));

                $excludeParts = GeneralUtility::trimExplode(',', $excludeString);

                foreach ($excludeParts as $excludePart) {
                    // Pad so that a part without "+" yields a null depth instead of an undefined offset.
                    [$pid, $depth] = array_pad(GeneralUtility::trimExplode('+', $excludePart), 2, null);

                    // default is "page only" = "depth=0"
                    if (empty($depth)) {
                        $depth = strpos($excludePart, '+') !== false ? 99 : 0;
                    }

                    $pidList[] = $pid;

                    if ($depth > 0) {
                        if (empty($treeCache[$pid][$depth])) {
                            $tree->reset();
                            $tree->getTree($pid, $depth);
                            $treeCache[$pid][$depth] = $tree->tree;
                        }

                        foreach ($treeCache[$pid][$depth] as $data) {
                            $pidList[] = $data['row']['uid'];
                        }
                    }
                }
            }

            $expandedExcludeStringCache[$excludeString] = array_unique($pidList);
        }

        return $expandedExcludeStringCache[$excludeString];
    }
1423
1424
    /**
     * Create the rows for display of the page tree
     * For each page a number of rows are shown displaying GET variable configuration
     *
     * One table row is produced per configuration returned by getUrlsForPageRow()
     * (optionally narrowed to $this->incomingConfigurationSelection). If no
     * configuration remains, a single "No entries" row is produced instead.
     *
     * @param array $pageRow Page record; its "uid" is checked against each configuration's exclude list
     * @param string $pageTitle Markup for the title cell; it is inserted as-is (not escaped here)
     * @return string The generated <tr> markup
     */
    public function drawURLs_addRowsForPage(array $pageRow, string $pageTitle): string
    {
        $skipMessage = '';

        // Get list of configurations
        $configurations = $this->getUrlsForPageRow($pageRow, $skipMessage);

        if (! empty($this->incomingConfigurationSelection)) {
            // remove configuration that does not match the current selection
            foreach ($configurations as $confKey => $confArray) {
                if (! in_array($confKey, $this->incomingConfigurationSelection, true)) {
                    unset($configurations[$confKey]);
                }
            }
        }

        if (empty($configurations)) {
            $message = ! empty($skipMessage) ? ' (' . $skipMessage . ')' : '';

            // Compile row:
            return '
                <tr>
                    <td>' . $pageTitle . '</td>
                    <td colspan="6"><em>No entries</em>' . $message . '</td>
                </tr>';
        }

        // Traverse parameter combinations:
        $content = '';
        $isFirstRow = true;
        foreach ($configurations as $confKey => $confArray) {
            // Title column: only the first row carries the title cell, spanning all configurations
            $titleClm = $isFirstRow
                ? '<td rowspan="' . count($configurations) . '">' . $pageTitle . '</td>'
                : '';
            $isFirstRow = false;

            // Numeric configuration names become int array keys; htmlspecialchars() needs a string
            $confKeyClm = '<td>' . htmlspecialchars((string) $confKey) . '</td>';

            // The sub configuration keys are optional, so read them defensively
            $subCfg = $confArray['subCfg'] ?? [];

            if (in_array($pageRow['uid'], $this->expandExcludeString($subCfg['exclude'] ?? ''), true)) {
                $content .= '<tr>
                            ' . $titleClm . '
                            ' . $confKeyClm . '
                            <td colspan="5"><em>No entries</em> (Page is excluded in this configuration)</td>
                        </tr>';
                continue;
            }

            // URL list:
            $urlList = $this->urlListFromUrlArray(
                $confArray,
                $pageRow,
                $this->scheduledTime,
                $this->reqMinute,
                $this->submitCrawlUrls,
                $this->downloadCrawlUrls,
                $this->duplicateTrack,
                $this->downloadUrls,
                $this->incomingProcInstructions // if empty the urls won't be filtered by processing instructions
            );

            // Expanded parameters:
            $paramExpanded = '';
            $calcAccu = [];
            $calcRes = 1;
            foreach (($confArray['paramExpanded'] ?? []) as $gVar => $gVal) {
                $valueCount = count($gVal);
                $paramExpanded .= '
                            <tr>
                                <td>' . htmlspecialchars('&' . $gVar . '=') . '<br/>' .
                    '(' . $valueCount . ')' .
                    '</td>
                                <td nowrap="nowrap">' . nl2br(htmlspecialchars(implode(chr(10), $gVal))) . '</td>
                            </tr>
                        ';
                $calcRes *= $valueCount;
                $calcAccu[] = $valueCount;
            }
            $paramExpanded = '<table>' . $paramExpanded . '</table>';
            $paramExpanded .= 'Comb: ' . implode('*', $calcAccu) . '=' . $calcRes;

            // Options
            $optionValues = '';
            if (! empty($subCfg['userGroups'])) {
                $optionValues .= 'User Groups: ' . $subCfg['userGroups'] . '<br/>';
            }
            if (! empty($subCfg['procInstrFilter'])) {
                $optionValues .= 'ProcInstr: ' . $subCfg['procInstrFilter'] . '<br/>';
            }

            $parsedParameters = GeneralUtility::implodeArrayForUrl('', $confArray['paramParsed'] ?? []);

            // Compile row:
            $content .= '
                        <tr>
                            ' . $titleClm . '
                            ' . $confKeyClm . '
                            <td>' . nl2br(htmlspecialchars(rawurldecode(trim(str_replace('&', chr(10) . '&', $parsedParameters))))) . '</td>
                            <td>' . $paramExpanded . '</td>
                            <td nowrap="nowrap">' . $urlList . '</td>
                            <td nowrap="nowrap">' . $optionValues . '</td>
                            <td nowrap="nowrap">' . DebugUtility::viewArray($subCfg['procInstrParams.'] ?? null) . '</td>
                        </tr>';
        }

        return $content;
    }
1534
1535
    /*****************************
1536
     *
1537
     * CLI functions
1538
     *
1539
     *****************************/
1540
1541
    /**
     * Running the functionality of the CLI (crawling URLs from queue)
     *
     * Fetches up to $countInARun queue records, assigns them to the current
     * process id and reads them one by one.
     *
     * @param int $countInARun Maximum number of queue records fetched for this run
     * @param int $sleepTime Microseconds passed to usleep() after each processed record
     * @param int $sleepAfterFinish Seconds passed to sleep() once the loop has ended
     * @return int Bitmask built from the readUrl() results and the CLI_STATUS_* flags
     */
    public function CLI_run(int $countInARun, int $sleepTime, int $sleepAfterFinish): int
    {
        $status = 0;
        $processedCount = 0;

        // First, run hooks:
        /** @var HookUtility $hooks */
        $hooks = GeneralUtility::makeInstance(HookUtility::class);
        $hooks->triggerCliHooks();

        // Clean up the queue
        $this->queueRepository->cleanupQueue();

        // Select entries:
        $queueRows = $this->queueRepository->fetchRecordsToBeCrawled($countInARun);

        if (empty($queueRows)) {
            $this->CLI_debug('Nothing within queue which needs to be processed (' . $this->CLI_buildProcessId() . ')');
            return $status;
        }

        $queueIds = array_column($queueRows, 'qid');
        $processId = $this->CLI_buildProcessId();

        // save the number of assigned queue entries to determine how many have been processed later
        $assignedCount = $this->queueRepository->updateProcessIdAndSchedulerForQueueIds($queueIds, $processId);
        $this->processRepository->updateProcessAssignItemsCount($assignedCount, $processId);

        if ($assignedCount !== count($queueIds)) {
            $this->CLI_debug('Nothing processed due to multi-process collision (' . $this->CLI_buildProcessId() . ')');
            return $status | self::CLI_STATUS_ABORTED;
        }

        foreach ($queueRows as $queueRow) {
            $status |= $this->readUrl($queueRow['qid']);
            $processedCount++;

            // Just to relax the system
            usleep($sleepTime);

            // if the cli has been disabled between the start and the current read url,
            // return right away and do NOT mark the process as ended.
            if ($this->getDisabled()) {
                return $status | self::CLI_STATUS_ABORTED;
            }

            if (! $this->processRepository->isProcessActive($this->CLI_buildProcessId())) {
                // possible timeout
                $this->CLI_debug('conflict / timeout (' . $this->CLI_buildProcessId() . ')');
                $status |= self::CLI_STATUS_ABORTED;
                break;
            }
        }

        sleep($sleepAfterFinish);

        $this->CLI_debug('Rows: ' . $processedCount . ' (' . $this->CLI_buildProcessId() . ')');

        if ($processedCount > 0) {
            $status |= self::CLI_STATUS_PROCESSED;
        }

        return $status;
    }
1612
1613
    /**
     * Triggers the CLI hooks through the HookUtility.
     *
     * @deprecated
     */
    public function CLI_runHooks(): void
    {
        GeneralUtility::makeInstance(HookUtility::class)->triggerCliHooks();
    }
1622
1623
    /**
     * Try to acquire a new process with the given id
     * also performs some auto-cleanup for orphan processes
     *
     * Active processes whose ttl lies before the current time are treated as
     * orphans and handed to CLI_releaseProcesses(); the rest count towards the
     * "processLimit" extension setting.
     *
     * @param string $id identification string for the process
     * @return bool true if a process record was inserted, false if the limit is reached
     *              or no system process id could be determined
     * @todo preemption might not be the most elegant way to clean up
     */
    public function CLI_checkAndAcquireNewProcess($id)
    {
        $queryBuilder = GeneralUtility::makeInstance(ConnectionPool::class)->getQueryBuilderForTable($this->tableName);

        $systemProcessId = getmypid();
        if ($systemProcessId < 1) {
            return false;
        }

        $activeProcesses = $queryBuilder
            ->select('process_id', 'ttl')
            ->from('tx_crawler_process')
            ->where(
                'active = 1 AND deleted = 0'
            )
            ->execute();

        $now = $this->getCurrentTime();

        $runningCount = 0;
        $expiredProcessIds = [];
        while ($process = $activeProcesses->fetch()) {
            if ($process['ttl'] < $now) {
                $expiredProcessIds[] = $process['process_id'];
                continue;
            }
            $runningCount++;
        }

        $processLimit = (int) $this->extensionSettings['processLimit'];

        // if there are less than allowed active processes then add a new one
        $acquired = $runningCount < $processLimit;
        if ($acquired) {
            $this->CLI_debug('add process ' . $this->CLI_buildProcessId() . ' (' . ($runningCount + 1) . '/' . $processLimit . ')');

            GeneralUtility::makeInstance(ConnectionPool::class)
                ->getConnectionForTable('tx_crawler_process')
                ->insert(
                    'tx_crawler_process',
                    [
                        'process_id' => $id,
                        'active' => 1,
                        'ttl' => $now + (int) $this->extensionSettings['processMaxRunTime'],
                        'system_process_id' => $systemProcessId,
                    ]
                );
        } else {
            $this->CLI_debug('Processlimit reached (' . $runningCount . '/' . $processLimit . ')');
        }

        $this->processRepository->deleteProcessesMarkedAsDeleted();
        $this->CLI_releaseProcesses($expiredProcessIds);

        return $acquired;
    }
1684
1685
    /**
     * Release a process and the required resources
     *
     * @param mixed $releaseIds string with a single process-id or array with multiple process-ids
     * @return bool false when there is nothing to release, true otherwise
     */
    public function CLI_releaseProcesses($releaseIds)
    {
        $queryBuilder = GeneralUtility::makeInstance(ConnectionPool::class)->getQueryBuilderForTable($this->tableName);

        $processIds = is_array($releaseIds) ? $releaseIds : [$releaseIds];

        if (empty($processIds)) {
            // nothing to release
            return false;
        }

        // some kind of 2nd chance algo - this way you need at least 2 processes to have a real cleanup
        // this ensures that a single process can't mess up the entire process table

        // Detach queue entries that still point at processes which are no longer active
        $queryBuilder
            ->update($this->tableName, 'q')
            ->where(
                'q.process_id IN(SELECT p.process_id FROM tx_crawler_process as p WHERE p.active = 0)'
            )
            ->set('q.process_scheduled', 0)
            ->set('q.process_id', '')
            ->execute();

        // FIXME: Not entirely sure that this is equivalent to the previous version
        // The same builder is reused for the next statement, so the "set" part is cleared first
        $queryBuilder->resetQueryPart('set');

        // Reset the system process id of inactive processes that are referenced by queue entries with exec_time = 0
        $queryBuilder
            ->update('tx_crawler_process')
            ->where(
                $queryBuilder->expr()->eq('active', 0),
                'process_id IN(SELECT q.process_id FROM tx_crawler_queue as q WHERE q.exec_time = 0)'
            )
            ->set('system_process_id', 0)
            ->execute();

        $this->processRepository->markRequestedProcessesAsNotActive($processIds);
        $this->queueRepository->unsetProcessScheduledAndProcessIdForQueueEntries($processIds);

        return true;
    }
1734
1735
    /**
     * Returns the id of the current process.
     *
     * The id is generated on first use from the current microtime and kept in
     * $this->processID, so later calls return the same value.
     *
     * @return string  the ID
     */
    public function CLI_buildProcessId()
    {
        if ($this->processID) {
            return $this->processID;
        }

        $this->processID = GeneralUtility::shortMD5(microtime(true));

        return $this->processID;
    }
1747
1748
    /**
     * Prints a message to the stdout (only if debug-mode is enabled)
     *
     * Debug-mode is read from the "processDebug" extension setting.
     *
     * @param string $msg the message
     */
    public function CLI_debug($msg): void
    {
        $debugEnabled = (int) $this->extensionSettings['processDebug'];
        if (! $debugEnabled) {
            return;
        }

        echo $msg . "\n";
        flush();
    }
1760
1761
    /**
     * Cleans up entries that stayed for too long in the queue. These are:
     * - processed entries older than the "cleanUpProcessedAge" extension setting (in days)
     * - entries scheduled longer ago than the "cleanUpScheduledAge" extension setting (in days)
     *
     * @deprecated
     */
    public function cleanUpOldQueueEntries(): void
    {
        // 24*60*60 Seconds in 24 hours
        $secondsPerDay = 86400;

        $processedAge = $this->extensionSettings['cleanUpProcessedAge'] * $secondsPerDay;
        $scheduledAge = $this->extensionSettings['cleanUpScheduledAge'] * $secondsPerDay;

        $now = time();
        $processedBefore = $now - $processedAge;
        $scheduledBefore = $now - $scheduledAge;

        $this->flushQueue(
            '(exec_time<>0 AND exec_time<' . $processedBefore . ') OR scheduled<=' . $scheduledBefore
        );
    }
1777
1778
    /**
     * Removes queue entries
     *
     * Before deleting, the matching entries are grouped by set_id and one
     * SIGNAL_QUEUE_ENTRY_FLUSH signal is emitted per group with its entries.
     *
     * @param string $where SQL related filter for the entries which should be removed;
     *                      an empty filter matches every entry
     *
     * @deprecated
     */
    protected function flushQueue($where = ''): void
    {
        $condition = (string) $where !== '' ? $where : '1=1';

        $queryBuilder = $this->getQueryBuilder($this->tableName);

        $setGroups = $queryBuilder
            ->selectLiteral('DISTINCT set_id')
            ->from($this->tableName)
            ->where($condition)
            ->execute()
            ->fetchAll();

        if (! is_array($setGroups)) {
            $setGroups = [];
        }

        foreach ($setGroups as $setGroup) {
            $entriesOfSet = $queryBuilder
                ->select('qid', 'set_id')
                ->from($this->tableName)
                ->where(
                    $condition,
                    $queryBuilder->expr()->eq('set_id', $setGroup['set_id'])
                )
                ->execute()
                ->fetchAll();

            SignalSlotUtility::emitSignal(
                self::class,
                SignalSlotUtility::SIGNAL_QUEUE_ENTRY_FLUSH,
                ['subSet' => $entriesOfSet]
            );
        }

        $queryBuilder
            ->delete($this->tableName)
            ->where($condition)
            ->execute();
    }
1823
1824
    /**
     * This method determines duplicates for a queue entry with the same parameters and this timestamp.
     * If the timestamp is in the past, it will check if there is any unprocessed queue entry in the past.
     * If the timestamp is in the future it will check, if the queued entry has exactly the same timestamp
     *
     * Only entries that are neither executed (exec_time = 0) nor assigned to a
     * process (empty process_id) are considered.
     *
     * @param int $tstamp
     * @param array $fieldArray must contain the keys "page_id" and "parameters_hash"
     *
     * @return array list of matching queue ids (qid)
     */
    protected function getDuplicateRowsIfExist($tstamp, $fieldArray)
    {
        $rows = [];

        $currentTime = $this->getCurrentTime();

        $queryBuilder = GeneralUtility::makeInstance(ConnectionPool::class)->getQueryBuilderForTable($this->tableName);
        $queryBuilder
            ->select('qid')
            ->from('tx_crawler_queue');

        if ($tstamp <= $currentTime) {
            // this entry is scheduled with "now" (or in the past)
            if ($this->extensionSettings['enableTimeslot']) {
                $timeBegin = $currentTime - 100;
                $timeEnd = $currentTime + 100;
                $queryBuilder
                    ->where(
                        'scheduled BETWEEN ' . $timeBegin . ' AND ' . $timeEnd . ''
                    )
                    ->orWhere(
                        $queryBuilder->expr()->lte('scheduled', $currentTime)
                    );
            } else {
                $queryBuilder
                    ->where(
                        $queryBuilder->expr()->lte('scheduled', $currentTime)
                    );
            }
        } else {
            // entry with a timestamp in the future need to have the same schedule time
            $queryBuilder
                ->where(
                    $queryBuilder->expr()->eq(
                        'scheduled',
                        $queryBuilder->createNamedParameter((int) $tstamp, \PDO::PARAM_INT)
                    )
                );
        }

        // Explicit comparisons instead of "NOT exec_time" / "NOT process_id": negating the
        // string column process_id relies on the database casting it to a number, so an id
        // such as "a1b2c3" would be treated like an empty one.
        $queryBuilder
            ->andWhere($queryBuilder->expr()->eq('exec_time', 0))
            ->andWhere($queryBuilder->expr()->eq('process_id', $queryBuilder->createNamedParameter('', \PDO::PARAM_STR)))
            ->andWhere($queryBuilder->expr()->eq('page_id', $queryBuilder->createNamedParameter($fieldArray['page_id'], \PDO::PARAM_INT)))
            ->andWhere($queryBuilder->expr()->eq('parameters_hash', $queryBuilder->createNamedParameter($fieldArray['parameters_hash'], \PDO::PARAM_STR)));

        $statement = $queryBuilder->execute();

        while ($row = $statement->fetch()) {
            $rows[] = $row['qid'];
        }

        return $rows;
    }
1884
1885
    /**
     * Returns a md5 hash generated from a serialized configuration array.
     *
     * The "paramExpanded" and "URLs" entries are left out of the hash.
     *
     * @return string
     */
    protected function getConfigurationHash(array $configuration)
    {
        unset($configuration['paramExpanded'], $configuration['URLs']);

        $serializedConfiguration = serialize($configuration);

        return md5($serializedConfiguration);
    }
1896
1897
    /**
     * Build a URL from a Page and the Query String. If the page has a Site configuration, it can be built by using
     * the Site instance.
     *
     * With a Site, the "id" query parameter is dropped, "L" is passed on as "_language",
     * and host, scheme, port and user info of $alternativeBaseUrl (if given) replace those
     * of the generated URL. Without a Site, an index.php URL with an appended cHash is built.
     *
     * @param int $pageId Page to build the URL for
     * @param string $queryString Query string; leading "?" and "&" are stripped in the Site case
     * @param string|null $alternativeBaseUrl Optional base URL overriding the generated one
     * @param int $httpsOrHttp see tx_crawler_configuration.force_ssl (-1 forces http, 1 forces https,
     *                         any other value keeps the scheme)
     * @throws \TYPO3\CMS\Core\Exception\SiteNotFoundException
     * @throws \TYPO3\CMS\Core\Routing\InvalidRouteArgumentsException
     */
    protected function getUrlFromPageAndQueryParameters(int $pageId, string $queryString, ?string $alternativeBaseUrl, int $httpsOrHttp): UriInterface
    {
        $site = GeneralUtility::makeInstance(SiteMatcher::class)->matchByPageId($pageId);
        if ($site instanceof Site) {
            $queryString = ltrim($queryString, '?&');
            $queryParts = [];
            parse_str($queryString, $queryParts);
            unset($queryParts['id']);
            // workaround as long as we don't have native language support in crawler configurations
            if (isset($queryParts['L'])) {
                $queryParts['_language'] = $queryParts['L'];
                unset($queryParts['L']);
                // The result is not needed; the lookup is kept so an unknown language id behaves
                // as before. NOTE(review): presumably this throws for unknown ids - confirm.
                $site->getLanguageById((int) $queryParts['_language']);
            }
            $url = $site->getRouter()->generateUri($pageId, $queryParts);
            if (! empty($alternativeBaseUrl)) {
                $alternativeBaseUri = new Uri($alternativeBaseUrl);
                $url = $url
                    ->withHost($alternativeBaseUri->getHost())
                    ->withScheme($alternativeBaseUri->getScheme())
                    ->withPort($alternativeBaseUri->getPort());
                $userInfo = $alternativeBaseUri->getUserInfo();
                if ($userInfo) {
                    $url = $url->withUserInfo($userInfo);
                }
            }
        } else {
            // Technically this is not possible with site handling, but kept for backwards-compatibility reasons
            // Once EXT:crawler is v10-only compatible, this should be removed completely
            $baseUrl = ($alternativeBaseUrl ?: GeneralUtility::getIndpEnv('TYPO3_SITE_URL'));
            $cacheHashCalculator = GeneralUtility::makeInstance(CacheHashCalculator::class);
            $queryString .= '&cHash=' . $cacheHashCalculator->generateForParameters($queryString);
            $url = new Uri(rtrim($baseUrl, '/') . '/index.php' . $queryString);
        }

        if ($httpsOrHttp === -1) {
            $url = $url->withScheme('http');
        } elseif ($httpsOrHttp === 1) {
            $url = $url->withScheme('https');
        }

        return $url;
    }
1949
1950 1
    /**
     * Makes sure that element 1 of $reg is not larger than element 2 by
     * exchanging the two when necessary; all other elements stay untouched.
     */
    protected function swapIfFirstIsLargerThanSecond(array $reg): array
    {
        if ($reg[1] <= $reg[2]) {
            return $reg;
        }

        // Swap if first is larger than last:
        [$reg[1], $reg[2]] = [$reg[2], $reg[1]];

        return $reg;
    }
1961
1962
    /**
     * Returns the backend user, taking it from $GLOBALS['BE_USER'] the first
     * time and keeping it in $this->backendUser afterwards.
     *
     * @return BackendUserAuthentication
     */
    private function getBackendUser()
    {
        // Make sure the _cli_ user is loaded
        Bootstrap::initializeBackendAuthentication();

        if ($this->backendUser !== null) {
            return $this->backendUser;
        }

        $this->backendUser = $GLOBALS['BE_USER'];

        return $this->backendUser;
    }
1974
1975
    /**
     * Get querybuilder for given table
     *
     * @param string $table Table name the query builder is requested for
     * @return \TYPO3\CMS\Core\Database\Query\QueryBuilder
     */
    private function getQueryBuilder(string $table)
    {
        $connectionPool = GeneralUtility::makeInstance(ConnectionPool::class);

        return $connectionPool->getQueryBuilderForTable($table);
    }
1984
}
1985