Passed
Push — ci/infection ( 2591ac...a25359 )
by Tomas Norre
07:35
created

CrawlerController::getPageTreeAndUrls()   B

Complexity

Conditions 7
Paths 16

Size

Total Lines 90
Code Lines 48

Duplication

Lines 0
Ratio 0 %

Code Coverage

Tests 0
CRAP Score 56

Importance

Changes 1
Bugs 0 Features 0
Metric Value
cc 7
eloc 48
c 1
b 0
f 0
nc 16
nop 8
dl 0
loc 90
ccs 0
cts 47
cp 0
crap 56
rs 8.2012

How to fix   Long Method    Many Parameters   

Long Method

Small methods make your code easier to understand, in particular if combined with a good name. Besides, if your method is small, finding a good name is usually much easier.

For example, if you find yourself adding comments to a method's body, this is usually a good sign to extract the commented part to a new method, and use the comment as a starting point when coming up with a good name for this new method.

Commonly applied refactorings include: Extract Method.

Many Parameters

Methods with many parameters are not only hard to understand, but their parameters also often become inconsistent when you need more, or different data.

There are several approaches to avoid long parameter lists: Replace Parameter with Method Call, Preserve Whole Object, and Introduce Parameter Object.

1
<?php
2
3
declare(strict_types=1);
4
5
namespace AOE\Crawler\Controller;
6
7
/***************************************************************
8
 *  Copyright notice
9
 *
10
 *  (c) 2020 AOE GmbH <[email protected]>
11
 *
12
 *  All rights reserved
13
 *
14
 *  This script is part of the TYPO3 project. The TYPO3 project is
15
 *  free software; you can redistribute it and/or modify
16
 *  it under the terms of the GNU General Public License as published by
17
 *  the Free Software Foundation; either version 3 of the License, or
18
 *  (at your option) any later version.
19
 *
20
 *  The GNU General Public License can be found at
21
 *  http://www.gnu.org/copyleft/gpl.html.
22
 *
23
 *  This script is distributed in the hope that it will be useful,
24
 *  but WITHOUT ANY WARRANTY; without even the implied warranty of
25
 *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
26
 *  GNU General Public License for more details.
27
 *
28
 *  This copyright notice MUST APPEAR in all copies of the script!
29
 ***************************************************************/
30
31
use AOE\Crawler\Configuration\ExtensionConfigurationProvider;
32
use AOE\Crawler\Converter\JsonCompatibilityConverter;
33
use AOE\Crawler\Domain\Repository\ConfigurationRepository;
34
use AOE\Crawler\Domain\Repository\ProcessRepository;
35
use AOE\Crawler\Domain\Repository\QueueRepository;
36
use AOE\Crawler\QueueExecutor;
37
use AOE\Crawler\Utility\SignalSlotUtility;
38
use Psr\Http\Message\UriInterface;
39
use Psr\Log\LoggerAwareInterface;
40
use Psr\Log\LoggerAwareTrait;
41
use TYPO3\CMS\Backend\Tree\View\PageTreeView;
42
use TYPO3\CMS\Backend\Utility\BackendUtility;
43
use TYPO3\CMS\Core\Authentication\BackendUserAuthentication;
44
use TYPO3\CMS\Core\Compatibility\PublicMethodDeprecationTrait;
45
use TYPO3\CMS\Core\Core\Bootstrap;
46
use TYPO3\CMS\Core\Core\Environment;
47
use TYPO3\CMS\Core\Database\Connection;
48
use TYPO3\CMS\Core\Database\ConnectionPool;
49
use TYPO3\CMS\Core\Database\Query\Restriction\DeletedRestriction;
50
use TYPO3\CMS\Core\Http\Uri;
51
use TYPO3\CMS\Core\Imaging\Icon;
52
use TYPO3\CMS\Core\Imaging\IconFactory;
53
use TYPO3\CMS\Core\Routing\SiteMatcher;
54
use TYPO3\CMS\Core\Site\Entity\Site;
55
use TYPO3\CMS\Core\Type\Bitmask\Permission;
56
use TYPO3\CMS\Core\TypoScript\Parser\TypoScriptParser;
57
use TYPO3\CMS\Core\Utility\DebugUtility;
58
use TYPO3\CMS\Core\Utility\GeneralUtility;
59
use TYPO3\CMS\Core\Utility\MathUtility;
60
use TYPO3\CMS\Extbase\Object\ObjectManager;
61
use TYPO3\CMS\Frontend\Page\CacheHashCalculator;
62
use TYPO3\CMS\Frontend\Page\PageRepository;
63
64
/**
65
 * Class CrawlerController
66
 *
67
 * @package AOE\Crawler\Controller
68
 */
69
class CrawlerController implements LoggerAwareInterface
70
{
71
    use LoggerAwareTrait;
72
    use PublicMethodDeprecationTrait;
73
74
    public const CLI_STATUS_NOTHING_PROCCESSED = 0;
75
76
    public const CLI_STATUS_REMAIN = 1; //queue not empty
77
78
    public const CLI_STATUS_PROCESSED = 2; //(some) queue items where processed
79
80
    public const CLI_STATUS_ABORTED = 4; //instance didn't finish
81
82
    public const CLI_STATUS_POLLABLE_PROCESSED = 8;
83
84
    /**
85
     * @var integer
86
     */
87
    public $setID = 0;
88
89
    /**
90
     * @var string
91
     */
92
    public $processID = '';
93
94
    /**
95
     * @var array
96
     */
97
    public $duplicateTrack = [];
98
99
    /**
100
     * @var array
101
     */
102
    public $downloadUrls = [];
103
104
    /**
105
     * @var array
106
     */
107
    public $incomingProcInstructions = [];
108
109
    /**
110
     * @var array
111
     */
112
    public $incomingConfigurationSelection = [];
113
114
    /**
115
     * @var bool
116
     */
117
    public $registerQueueEntriesInternallyOnly = false;
118
119
    /**
120
     * @var array
121
     */
122
    public $queueEntries = [];
123
124
    /**
125
     * @var array
126
     */
127
    public $urlList = [];
128
129
    /**
130
     * @var array
131
     */
132
    public $extensionSettings = [];
133
134
    /**
135
     * Mount Point
136
     *
137
     * @var bool
138
     * Todo: Check what this is used for and adjust the type hint or code, as bool doesn't match the current code.
139
     */
140
    public $MP = false;
141
142
    /**
143
     * @var string
144
     */
145
    protected $processFilename;
146
147
    /**
148
     * Holds the internal access mode can be 'gui','cli' or 'cli_im'
149
     *
150
     * @var string
151
     */
152
    protected $accessMode;
153
154
    /**
155
     * @var QueueRepository
156
     */
157
    protected $queueRepository;
158
159
    /**
160
     * @var ProcessRepository
161
     */
162
    protected $processRepository;
163
164
    /**
165
     * @var ConfigurationRepository
166
     */
167
    protected $configurationRepository;
168
169
    /**
170
     * @var string
171
     */
172
    protected $tableName = 'tx_crawler_queue';
173
174
    /**
175
     * @var QueueExecutor
176
     */
177
    protected $queueExecutor;
178
179
    /**
180
     * @var int
181
     */
182
    protected $maximumUrlsToCompile = 10000;
183
184
    /**
185
     * @var IconFactory
186
     */
187
    protected $iconFactory;
188
189
    /**
190
     * @var string[]
191
     */
192
    private $deprecatedPublicMethods = [
0 ignored issues
show
introduced by
The private property $deprecatedPublicMethods is not used, and could be removed.
Loading history...
193
        'getLogEntriesForSetId' => 'Using crawlerController::getLogEntriesForSetId() is deprecated since 9.0.1 and will be removed in v11.x',
194
        'flushQueue' => 'Using CrawlerController::flushQueue() is deprecated since 9.0.1 and will be removed in v11.x, please use QueueRepository->flushQueue() instead.',
195
        'cleanUpOldQueueEntries' => 'Using CrawlerController::cleanUpOldQueueEntries() is deprecated since 9.0.1 and will be removed in v11.x, please use QueueRepository->cleanUpOldQueueEntries() instead.',
196
    ];
197
198
    /**
199
     * @var BackendUserAuthentication|null
200
     */
201
    private $backendUser;
202
203
    /**
204
     * @var integer
205
     */
206
    private $scheduledTime = 0;
207
208
    /**
209
     * @var integer
210
     */
211
    private $reqMinute = 0;
212
213
    /**
214
     * @var bool
215
     */
216
    private $submitCrawlUrls = false;
217
218
    /**
219
     * @var bool
220
     */
221
    private $downloadCrawlUrls = false;
222
223
    /************************************
224
     *
225
     * Getting URLs based on Page TSconfig
226
     *
227
     ************************************/
228
229 37
    /**
     * Resolves the repositories and helpers used by the controller and loads
     * the extension settings, falling back to defaults for missing or
     * out-of-range values.
     */
    public function __construct()
    {
        $objectManager = GeneralUtility::makeInstance(ObjectManager::class);
        $this->queueRepository = $objectManager->get(QueueRepository::class);
        $this->processRepository = $objectManager->get(ProcessRepository::class);
        $this->configurationRepository = $objectManager->get(ConfigurationRepository::class);
        $this->queueExecutor = $objectManager->get(QueueExecutor::class);
        $this->iconFactory = GeneralUtility::makeInstance(IconFactory::class);

        $this->processFilename = Environment::getVarPath() . '/lock/tx_crawler.proc';

        /** @var ExtensionConfigurationProvider $configurationProvider */
        $configurationProvider = GeneralUtility::makeInstance(ExtensionConfigurationProvider::class);
        $settings = $configurationProvider->getExtensionConfiguration();
        $this->extensionSettings = is_array($settings) ? $settings : [];

        // Set defaults. Missing keys are read as 0 so the same fallbacks apply
        // as for explicitly empty values, without raising "undefined index".
        if (MathUtility::convertToPositiveInteger($this->extensionSettings['countInARun'] ?? 0) === 0) {
            $this->extensionSettings['countInARun'] = 100;
        }

        $this->extensionSettings['processLimit'] = MathUtility::forceIntegerInRange(
            $this->extensionSettings['processLimit'] ?? 0,
            1,
            99,
            1
        );
        $this->maximumUrlsToCompile = MathUtility::forceIntegerInRange(
            $this->extensionSettings['maxCompileUrls'] ?? 0,
            1,
            1000000000,
            10000
        );
    }
253
254
    /**
     * Returns the current value of the maximumUrlsToCompile limit.
     */
    public function getMaximumUrlsToCompile(): int
    {
        return $this->maximumUrlsToCompile;
    }
258
259 4
    /**
     * Overrides the maximumUrlsToCompile limit that the constructor derived
     * from the extension settings.
     */
    public function setMaximumUrlsToCompile(int $maximumUrlsToCompile): void
    {
        $this->maximumUrlsToCompile = $maximumUrlsToCompile;
    }
263
264
    /**
     * Method to get the accessMode, can be gui, cli or cli_im
     *
     * @return string
     */
    public function getAccessMode()
    {
        return $this->accessMode;
    }
273
274
    /**
     * Method to set the accessMode, can be gui, cli or cli_im
     *
     * @param string $accessMode
     */
    public function setAccessMode($accessMode): void
    {
        $this->accessMode = $accessMode;
    }
281
282
    /**
     * Set disabled status to prevent processes from being processed.
     *
     * The status is represented by the existence of the process file:
     * disabling writes it, enabling removes it if present.
     *
     * @param bool|null $disabled Any falsy value (including null) enables processing again
     */
    public function setDisabled(?bool $disabled = true): void
    {
        if ($disabled) {
            GeneralUtility::writeFile($this->processFilename, 'disabled');
            return;
        }

        if (is_file($this->processFilename)) {
            unlink($this->processFilename);
        }
    }
293
294
    /**
     * Get disable status
     *
     * @return bool true when the process file exists
     */
    public function getDisabled(): bool
    {
        return is_file($this->processFilename);
    }
301
302
    /**
     * Sets the path of the file whose existence marks the crawler as disabled.
     *
     * @param string $filenameWithPath
     */
    public function setProcessFilename($filenameWithPath): void
    {
        $this->processFilename = $filenameWithPath;
    }
309
310
    /**
     * Returns the path of the file whose existence marks the crawler as disabled.
     *
     * @return string
     */
    public function getProcessFilename()
    {
        return $this->processFilename;
    }
317
318
    /**
     * Sets the extensions settings (unserialized pendant of $TYPO3_CONF_VARS['EXT']['extConf']['crawler']).
     *
     * The array replaces the current settings as-is; the defaults applied in
     * the constructor are not re-applied here.
     */
    public function setExtensionSettings(array $extensionSettings): void
    {
        $this->extensionSettings = $extensionSettings;
    }
325
326
    /**
     * Check if the given page should be crawled
     *
     * The checks run in this order and stop at the first match:
     *  1. the page is hidden and the 'crawlHiddenPages' setting is not enabled
     *  2. the doktype is one of the hard-coded excluded doktypes
     *  3. the doktype is listed in an EXTCONF 'excludeDoktype' entry
     *  4. an EXTCONF 'pageVeto' hook returns anything other than false
     *
     * @param array $pageRow Page record; the 'hidden' and 'doktype' columns are read
     * @return false|string false if the page should be crawled (not excluded), otherwise the skip message
     */
    public function checkIfPageShouldBeSkipped(array $pageRow)
    {
        $crawlHiddenPages = (bool) ($this->extensionSettings['crawlHiddenPages'] ?? false);
        if (! $crawlHiddenPages && ! empty($pageRow['hidden'])) {
            return 'Because page is hidden';
        }

        $doktype = $pageRow['doktype'] ?? '';
        if (GeneralUtility::inList('3,4,199,254,255', $doktype)) {
            return 'Because doktype is not allowed';
        }

        foreach ($GLOBALS['TYPO3_CONF_VARS']['EXTCONF']['crawler']['excludeDoktype'] ?? [] as $key => $doktypeList) {
            if (GeneralUtility::inList($doktypeList, $doktype)) {
                return 'Doktype was excluded by "' . $key . '"';
            }
        }

        // veto hook
        foreach ($GLOBALS['TYPO3_CONF_VARS']['EXTCONF']['crawler']['pageVeto'] ?? [] as $key => $func) {
            $params = [
                'pageRow' => $pageRow,
            ];
            // expects "false" if page is ok and "true" or a skipMessage if this page should _not_ be crawled
            $veto = GeneralUtility::callUserFunction($func, $params, $this);
            if ($veto === false) {
                continue;
            }

            // No need to execute other hooks once one of them returned a veto.
            // The key is cast because numerically registered hooks yield int keys,
            // which htmlspecialchars() rejects under strict_types.
            return is_string($veto)
                ? $veto
                : 'Veto from hook "' . htmlspecialchars((string) $key) . '"';
        }

        return false;
    }
384
385
    /**
     * Wrapper method for getUrlsForPageId()
     * It returns an array of configurations and no urls!
     *
     * @param array $pageRow Page record with at least dok-type and uid columns.
     * @param string $skipMessage Set by reference: the skip reason, or '' when the page is not skipped
     * @return array Configurations for the page, or an empty array when the page is skipped
     * @see getUrlsForPageId()
     */
    public function getUrlsForPageRow(array $pageRow, &$skipMessage = '')
    {
        $message = $this->checkIfPageShouldBeSkipped($pageRow);
        if ($message !== false) {
            $skipMessage = $message;
            return [];
        }

        $skipMessage = '';
        // getUrlsForPageId() is declared with an int parameter; cast so that a
        // uid delivered as a string does not raise a TypeError under strict_types.
        return $this->getUrlsForPageId((int) $pageRow['uid']);
    }
407
408
    /**
     * Creates a list of URLs from input array (and submits them to queue if asked for)
     * See Web > Info module script + "indexed_search"'s crawler hook-client using this!
     *
     * @param array $vv Information about URLs from pageRow to crawl.
     * @param array $pageRow Page row
     * @param int $scheduledTime Unix time to schedule indexing to, typically time()
     * @param int $reqMinute Number of requests per minute (creates the interleave between requests)
     * @param bool $submitCrawlUrls If set, submits the URLs to queue
     * @param bool $downloadCrawlUrls If set (and submitcrawlUrls is false) will fill $downloadUrls with entries)
     * @param array $duplicateTrack Array which is passed by reference and contains the an id per url to secure we will not crawl duplicates
     * @param array $downloadUrls Array which will be filled with URLS for download if flag is set.
     * @param array $incomingProcInstructions Array of processing instructions
     * @return string List of URLs (meant for display in backend module)
     */
    public function urlListFromUrlArray(
        array $vv,
        array $pageRow,
        $scheduledTime,
        $reqMinute,
        $submitCrawlUrls,
        $downloadCrawlUrls,
        array &$duplicateTrack,
        array &$downloadUrls,
        array $incomingProcInstructions
    ) {
        if (! is_array($vv['URLs'] ?? null)) {
            return 'ERROR - no URL generated';
        }

        $subCfg = (array) ($vv['subCfg'] ?? []);
        $procInstrFilter = $subCfg['procInstrFilter'] ?? '';
        $userGroups = $subCfg['userGroups'] ?? '';

        $urlLog = [];
        $pageId = (int) $pageRow['uid'];
        $configurationHash = $this->getConfigurationHash($vv);
        $skipInnerCheck = $this->queueRepository->noUnprocessedQueueEntriesForPageWithConfigurationHashExist($pageId, $configurationHash);

        // The filter result does not depend on the individual URL, so it is
        // evaluated once: when it rejects, no URL of this set is listed.
        if (! $this->drawURLs_PIfilter($procInstrFilter, $incomingProcInstructions)) {
            return '';
        }

        // Seconds between two requests; 0 when no positive rate is given,
        // which avoids a division by zero.
        $secondsPerRequest = $reqMinute > 0 ? 60 / $reqMinute : 0;

        foreach ($vv['URLs'] as $urlQuery) {
            $url = (string) $this->getUrlFromPageAndQueryParameters(
                $pageId,
                $urlQuery,
                $subCfg['baseUrl'] ?? null,
                $subCfg['force_ssl'] ?? 0
            );

            // Create key by which to determine unique-ness:
            $uKey = $url . '|' . $userGroups . '|' . $procInstrFilter;

            if (isset($duplicateTrack[$uKey])) {
                // If the url key is registered just display it and do not resubmit it
                $urlLog[] = '<em><span class="text-muted">' . htmlspecialchars($url) . '</span></em>';
            } else {
                // Scheduled time, interleaved by the number of URLs seen so far
                // and rounded down to the full minute:
                $schTime = $scheduledTime + round(count($duplicateTrack) * $secondsPerRequest);
                $schTime = intval($schTime / 60) * 60;
                $formattedDate = BackendUtility::datetime($schTime);
                $this->urlList[] = '[' . $formattedDate . '] ' . $url;
                $urlLogEntry = '[' . $formattedDate . '] ' . htmlspecialchars($url);

                // Submit for crawling!
                if ($submitCrawlUrls) {
                    $added = $this->addUrl(
                        $pageId,
                        $url,
                        $subCfg,
                        $scheduledTime,
                        $configurationHash,
                        $skipInnerCheck
                    );
                    if ($added === false) {
                        $urlLogEntry .= ' (URL already existed)';
                    }
                } elseif ($downloadCrawlUrls) {
                    $downloadUrls[$url] = $url;
                }
                $urlLog[] = $urlLogEntry;
            }
            $duplicateTrack[$uKey] = true;
        }

        return implode('<br>', $urlLog);
    }
490
491
    /**
     * Returns true if input processing instruction is among registered ones.
     *
     * An empty list of incoming processing instructions accepts everything.
     *
     * @param string $piString Comma-separated list of processing instructions to test against
     * @param array $incomingProcInstructions Processing instructions
     * @return bool true when no instructions are given or at least one of them is contained in $piString
     */
    public function drawURLs_PIfilter($piString, array $incomingProcInstructions)
    {
        if (empty($incomingProcInstructions)) {
            return true;
        }

        foreach ($incomingProcInstructions as $incomingInstruction) {
            if (GeneralUtility::inList($piString, $incomingInstruction)) {
                return true;
            }
        }

        return false;
    }
511
512 1
    /**
     * Returns the page TSconfig for the given page, or for the mount point
     * page when $this->MP is set, after passing it through the EXTCONF
     * 'getPageTSconfigForId' hooks.
     *
     * @param int $id Page id; always handed to the hooks as 'pageId', even when MP is set
     * @return array Page TSconfig, possibly modified by the hooks (they receive it by reference)
     */
    public function getPageTSconfigForId($id): array
    {
        if (! $this->MP) {
            $pageTSconfig = BackendUtility::getPagesTSconfig($id);
        } else {
            // TODO: $MP is declared as bool but used as a "-"-separated string
            // whose second segment is taken as the page id; clarify the type.
            // The casts keep explode() and getPagesTSconfig() from failing under
            // strict_types, and a missing second segment falls back to 0.
            $mountPointParts = explode('-', (string) $this->MP);
            $mountPointId = (int) ($mountPointParts[1] ?? 0);
            $pageTSconfig = BackendUtility::getPagesTSconfig($mountPointId);
        }

        // Call a hook to alter configuration
        $hooks = $GLOBALS['TYPO3_CONF_VARS']['EXTCONF']['crawler']['getPageTSconfigForId'] ?? null;
        if (is_array($hooks)) {
            $params = [
                'pageId' => $id,
                'pageTSConfig' => &$pageTSconfig,
            ];
            foreach ($hooks as $userFunc) {
                GeneralUtility::callUserFunction($userFunc, $params, $this);
            }
        }

        return $pageTSconfig;
    }
534
535
    /**
     * This methods returns an array of configurations.
     * Adds no urls!
     *
     * Configurations are collected from two places, in this order:
     *  1. page TSconfig (tx_crawler.crawlerCfg.paramSets)
     *  2. tx_crawler_configuration records found via the rootline; these never
     *     overwrite a page TSconfig set with the same name
     * Finally the EXTCONF 'processUrls' hooks may modify the result by reference.
     *
     * @return array Configurations keyed by name; each entry holds 'subCfg',
     *               'paramParsed', 'paramExpanded', 'URLs' and 'origin'
     */
    public function getUrlsForPageId(int $pageId): array
    {
        // Get page TSconfig for page ID
        $pageTSconfig = $this->getPageTSconfigForId($pageId);

        $res = [];

        // Fetch Crawler Configuration from pageTSconfig
        $crawlerCfg = $pageTSconfig['tx_crawler.']['crawlerCfg.']['paramSets.'] ?? [];
        foreach ($crawlerCfg as $key => $values) {
            if (! is_array($values)) {
                continue;
            }
            $key = str_replace('.', '', $key);

            // Sub configuration for a single configuration string:
            $subCfg = $values;
            $subCfg['key'] = $key;

            if (($subCfg['procInstrFilter'] ?? '') !== '') {
                $subCfg['procInstrFilter'] = implode(',', GeneralUtility::trimExplode(',', $subCfg['procInstrFilter']));
            }

            // 'pidsOnly' is optional; treat a missing value as "not page-specific".
            $pidsOnly = (string) ($subCfg['pidsOnly'] ?? '');
            $pidOnlyList = implode(',', GeneralUtility::trimExplode(',', $pidsOnly, true));

            // process configuration if it is not page-specific or if the specific page is the current page:
            // TODO: Check if $pidOnlyList can be kept as Array instead of imploded
            if ($pidsOnly !== '' && ! GeneralUtility::inList($pidOnlyList, strval($pageId))) {
                continue;
            }

            // recognize MP value
            $baseQuery = '?id=' . $pageId;
            if ($this->MP) {
                $baseQuery .= '&MP=' . $this->MP;
            }

            // Explode, process etc.:
            $res[$key] = [];
            $res[$key]['subCfg'] = $subCfg;
            // The parameter string lives under the key without trailing dot and may be absent.
            $res[$key]['paramParsed'] = GeneralUtility::explodeUrl2Array($crawlerCfg[$key] ?? '');
            $res[$key]['paramExpanded'] = $this->expandParameters($res[$key]['paramParsed'], $pageId);
            $res[$key]['origin'] = 'pagets';
            $res[$key]['URLs'] = $this->compileUrls($res[$key]['paramExpanded'], [$baseQuery]);
        }

        // Get configuration from tx_crawler_configuration records up the rootline
        $crawlerConfigurations = $this->configurationRepository->getCrawlerConfigurationRecordsFromRootLine($pageId);
        foreach ($crawlerConfigurations as $configurationRecord) {
            // check access to the configuration record
            $hasAccess = empty($configurationRecord['begroups'])
                || $this->getBackendUser()->isAdmin()
                || $this->hasGroupAccess($this->getBackendUser()->user['usergroup_cached_list'], $configurationRecord['begroups']);
            if (! $hasAccess) {
                continue;
            }

            // Cast so that a null column value does not break under strict_types.
            $pidsOnly = (string) ($configurationRecord['pidsonly'] ?? '');
            $pidOnlyList = implode(',', GeneralUtility::trimExplode(',', $pidsOnly, true));

            // process configuration if it is not page-specific or if the specific page is the current page:
            // TODO: Check if $pidOnlyList can be kept as Array instead of imploded
            if ($pidsOnly !== '' && ! GeneralUtility::inList($pidOnlyList, strval($pageId))) {
                continue;
            }

            $key = $configurationRecord['name'];

            // don't overwrite previously defined paramSets
            if (isset($res[$key])) {
                continue;
            }

            /* @var $TSparserObject \TYPO3\CMS\Core\TypoScript\Parser\TypoScriptParser */
            $TSparserObject = GeneralUtility::makeInstance(TypoScriptParser::class);
            $TSparserObject->parse($configurationRecord['processing_instruction_parameters_ts']);

            $subCfg = [
                'procInstrFilter' => $configurationRecord['processing_instruction_filter'],
                'procInstrParams.' => $TSparserObject->setup,
                'baseUrl' => $configurationRecord['base_url'],
                'force_ssl' => (int) $configurationRecord['force_ssl'],
                'userGroups' => $configurationRecord['fegroups'],
                'exclude' => $configurationRecord['exclude'],
                'key' => $key,
            ];

            if (in_array($pageId, $this->expandExcludeString($subCfg['exclude']), true)) {
                continue;
            }

            $res[$key] = [];
            $res[$key]['subCfg'] = $subCfg;
            $res[$key]['paramParsed'] = GeneralUtility::explodeUrl2Array($configurationRecord['configuration']);
            $res[$key]['paramExpanded'] = $this->expandParameters($res[$key]['paramParsed'], $pageId);
            $res[$key]['URLs'] = $this->compileUrls($res[$key]['paramExpanded'], ['?id=' . $pageId]);
            $res[$key]['origin'] = 'tx_crawler_configuration_' . $configurationRecord['uid'];
        }

        foreach ($GLOBALS['TYPO3_CONF_VARS']['EXTCONF']['crawler']['processUrls'] ?? [] as $func) {
            $params = [
                'res' => &$res,
            ];
            GeneralUtility::callUserFunction($func, $params, $this);
        }

        return $res;
    }
633
634
    /**
     * Find all configurations of subpages of a page
     * TODO: Write Functional Tests
     *
     * Collects the names of the page TSconfig paramSets of the root page plus
     * the names of all tx_crawler_configuration records stored on a page of
     * the rootline or of the page tree below the root page (down to $depth).
     *
     * @param int $rootid Page id the branch starts at
     * @param int $depth Depth passed to the page tree when collecting subpages
     * @return array List of configuration names (may contain duplicates)
     */
    public function getConfigurationsForBranch(int $rootid, int $depth): array
    {
        $configurationsForBranch = [];

        // Names of the paramSets defined in page TSconfig (trailing dot stripped).
        $pageTSconfig = $this->getPageTSconfigForId($rootid);
        $sets = $pageTSconfig['tx_crawler.']['crawlerCfg.']['paramSets.'] ?? [];
        foreach ($sets as $key => $value) {
            if (! is_array($value)) {
                continue;
            }
            $configurationsForBranch[] = substr($key, -1) === '.' ? substr($key, 0, -1) : $key;
        }

        // Page ids of the rootline ...
        $pids = [];
        $rootLine = BackendUtility::BEgetRootLine($rootid);
        foreach ($rootLine as $node) {
            $pids[] = $node['uid'];
        }

        // ... and of the pages below the root page the backend user may see.
        /* @var PageTreeView $tree */
        $tree = GeneralUtility::makeInstance(PageTreeView::class);
        $perms_clause = $this->getBackendUser()->getPagePermsClause(Permission::PAGE_SHOW);
        $tree->init(empty($perms_clause) ? '' : ('AND ' . $perms_clause));
        $tree->getTree($rootid, $depth, '');
        foreach ($tree->tree as $node) {
            $pids[] = $node['row']['uid'];
        }

        // Names of the configuration records stored on any of those pages.
        $queryBuilder = $this->getQueryBuilder('tx_crawler_configuration');
        $statement = $queryBuilder
            ->select('name')
            ->from('tx_crawler_configuration')
            ->where(
                $queryBuilder->expr()->in('pid', $queryBuilder->createNamedParameter($pids, Connection::PARAM_INT_ARRAY))
            )
            ->execute();

        while ($row = $statement->fetch()) {
            $configurationsForBranch[] = $row['name'];
        }

        return $configurationsForBranch;
    }
677
678
    /**
     * Check if a user has access to an item
     * (e.g. get the group list of the current logged in user from $GLOBALS['TSFE']->gr_list)
     *
     * @param string $groupList Comma-separated list of (fe_)group UIDs from a user
     * @param string $accessList Comma-separated list of (fe_)group UIDs of the item to access
     * @return bool TRUE if at least one of the users group UIDs is in the access list or the access list is empty
     * @see \TYPO3\CMS\Frontend\Page\PageRepository::getMultipleGroupsWhereClause()
     */
    public function hasGroupAccess($groupList, $accessList)
    {
        // An item without access restriction is accessible to everyone.
        if (empty($accessList)) {
            return true;
        }

        $userGroupUids = GeneralUtility::intExplode(',', $groupList);
        foreach ($userGroupUids as $groupUid) {
            if (GeneralUtility::inList($accessList, $groupUid)) {
                return true;
            }
        }

        return false;
    }
699
700
    /**
     * Will expand the parameters configuration to individual values. This follows a certain syntax of the value of each parameter.
     * Syntax of values:
     * - Basically: If the value is wrapped in [...] it will be expanded according to the following syntax, otherwise the value is taken literally
     * - Configuration is split by "|" and the parts are processed individually and finally added together
     * - For each configuration part:
     *         - "[int]-[int]" = Integer range, will be expanded to all values in between, values included, starting from low to high (max. 1000). Example "1-34" or "-40--30"
     *         - "_TABLE:[TCA table name];[_PID:[optional page id, default is current page]];[_ENABLELANG:1]" = Look up of table records from PID, filtering out deleted records. Example "_TABLE:tt_content; _PID:123"
     *        _ENABLELANG:1 picks only original records without their language overlays
     *        Further optional sub-parameters read by this method: _RECURSIVE, _PIDFIELD, _WHERE, _ADDTABLE, _FIELD
     *         - Default: Literal value
     *
     * After each part, the functions registered in
     * $GLOBALS['TYPO3_CONF_VARS']['SC_OPTIONS']['crawler/class.tx_crawler_lib.php']['expandParameters']
     * are called and may alter the parameter array by reference.
     *
     * @param array $paramArray Array with key (GET var name) and values (value of GET var which is configuration for expansion)
     * @param integer $pid Current page ID
     * @return array Same keys as the input; every value is replaced by an array of expanded values
     *
     * TODO: Write Functional Tests
     */
    public function expandParameters($paramArray, $pid)
    {
        // Traverse parameter names:
        foreach ($paramArray as $p => $v) {
            $v = trim($v);

            // If value is encapsulated in square brackets it means there are some ranges of values to find, otherwise the value is literal
            if (strpos($v, '[') === 0 && substr($v, -1) === ']') {
                // So, find the value inside brackets and reset the paramArray value as an array.
                $v = substr($v, 1, -1);
                $paramArray[$p] = [];

                // Explode parts and traverse them:
                $parts = explode('|', $v);
                foreach ($parts as $pV) {
                    // Look for integer range: (fx. 1-34 or -40--30 // reads minus 40 to minus 30)
                    if (preg_match('/^(-?[0-9]+)\s*-\s*(-?[0-9]+)$/', trim($pV), $reg)) {
                        $reg = $this->swapIfFirstIsLargerThanSecond($reg);

                        // Traverse range, add values:
                        $runAwayBrake = 1000; // Limit to size of range!
                        for ($a = $reg[1]; $a <= $reg[2]; $a++) {
                            $paramArray[$p][] = $a;
                            $runAwayBrake--;
                            if ($runAwayBrake <= 0) {
                                break;
                            }
                        }
                    } elseif (strpos(trim($pV), '_TABLE:') === 0) {
                        // Parse parameters ("key:value" pairs separated by ";"):
                        $subparts = GeneralUtility::trimExplode(';', $pV);
                        $subpartParams = [];
                        foreach ($subparts as $spV) {
                            $keyAndValue = GeneralUtility::trimExplode(':', $spV);
                            // A sub-part without ":" has no value; store null instead of reading a missing offset.
                            $subpartParams[$keyAndValue[0]] = $keyAndValue[1] ?? null;
                        }

                        // Table exists:
                        if (isset($GLOBALS['TCA'][$subpartParams['_TABLE']])) {
                            $lookUpPid = isset($subpartParams['_PID']) ? intval($subpartParams['_PID']) : intval($pid);
                            $recursiveDepth = isset($subpartParams['_RECURSIVE']) ? intval($subpartParams['_RECURSIVE']) : 0;
                            $pidField = isset($subpartParams['_PIDFIELD']) ? trim($subpartParams['_PIDFIELD']) : 'pid';
                            $where = $subpartParams['_WHERE'] ?? '';
                            $addTable = $subpartParams['_ADDTABLE'] ?? '';

                            // _FIELD is optional; fall back to the uid column when it is missing or empty.
                            $fieldName = ! empty($subpartParams['_FIELD']) ? $subpartParams['_FIELD'] : 'uid';
                            if ($fieldName === 'uid' || ! empty($GLOBALS['TCA'][$subpartParams['_TABLE']]['columns'][$fieldName])) {
                                $queryBuilder = $this->getQueryBuilder($subpartParams['_TABLE']);

                                if ($recursiveDepth > 0) {
                                    /** @var \TYPO3\CMS\Core\Database\QueryGenerator $queryGenerator */
                                    $queryGenerator = GeneralUtility::makeInstance(\TYPO3\CMS\Core\Database\QueryGenerator::class);
                                    $pidList = $queryGenerator->getTreeList($lookUpPid, $recursiveDepth, 0, 1);
                                    $pidArray = GeneralUtility::intExplode(',', $pidList);
                                } else {
                                    $pidArray = [(string) $lookUpPid];
                                }

                                // Only the "deleted" restriction is applied to the lookup.
                                $queryBuilder->getRestrictions()
                                    ->removeAll()
                                    ->add(GeneralUtility::makeInstance(DeletedRestriction::class));

                                $queryBuilder
                                    ->select($fieldName)
                                    ->from($subpartParams['_TABLE'])
                                    ->where(
                                        $queryBuilder->expr()->in($pidField, $queryBuilder->createNamedParameter($pidArray, Connection::PARAM_INT_ARRAY)),
                                        $where
                                    );

                                if (! empty($addTable)) {
                                    // TODO: Check if this works as intended!
                                    $queryBuilder->add('from', $addTable);
                                }
                                $transOrigPointerField = $GLOBALS['TCA'][$subpartParams['_TABLE']]['ctrl']['transOrigPointerField'] ?? '';

                                // _ENABLELANG is optional; only restrict the lookup when it is set and the table has a translation pointer field.
                                if (! empty($subpartParams['_ENABLELANG']) && $transOrigPointerField) {
                                    $queryBuilder->andWhere(
                                        $queryBuilder->expr()->lte(
                                            $transOrigPointerField,
                                            0
                                        )
                                    );
                                }

                                $statement = $queryBuilder->execute();

                                // Index by field value so that duplicates collapse before merging.
                                $rows = [];
                                while ($row = $statement->fetch()) {
                                    $rows[$row[$fieldName]] = $row;
                                }

                                $paramArray[$p] = array_merge($paramArray[$p], array_keys($rows));
                            }
                        }
                    } else { // Just add value:
                        $paramArray[$p][] = $pV;
                    }

                    // Hook for processing own expandParameters place holder
                    $expandParametersHooks = $GLOBALS['TYPO3_CONF_VARS']['SC_OPTIONS']['crawler/class.tx_crawler_lib.php']['expandParameters'] ?? null;
                    if (is_array($expandParametersHooks)) {
                        $_params = [
                            'pObj' => &$this,
                            'paramArray' => &$paramArray,
                            'currentKey' => $p,
                            'currentValue' => $pV,
                            'pid' => $pid,
                        ];
                        foreach ($expandParametersHooks as $_funcRef) {
                            GeneralUtility::callUserFunction($_funcRef, $_params, $this);
                        }
                    }
                }

                // Make unique set of values and sort the parameter array by parameter name:
                $paramArray[$p] = array_unique($paramArray[$p]);
                ksort($paramArray);
            } else {
                // Set the literal value as only value in array:
                $paramArray[$p] = [$v];
            }
        }

        return $paramArray;
    }
845
846
    /**
     * Compiling URLs from parameter array (output of expandParameters())
     * The number of URLs will be the multiplication of the number of parameter values for each key,
     * capped at $this->maximumUrlsToCompile per recursion level.
     *
     * @param array $paramArray Output of expandParameters(): Array with keys (GET var names) and for each an array of values
     * @param array $urls URLs accumulated in this array (for recursion)
     * @return array
     */
    public function compileUrls($paramArray, array $urls)
    {
        if (empty($paramArray)) {
            return $urls;
        }

        // Take the first parameter off the stack. unset() is used instead of array_shift()
        // because array_shift() re-indexes numeric keys, which would rename numeric GET vars.
        reset($paramArray);
        $varName = key($paramArray);
        $valueSet = $paramArray[$varName];
        unset($paramArray[$varName]);

        // PHP turns numeric string keys into integers; rawurlencode() needs a string under strict_types.
        $encodedVarName = rawurlencode((string) $varName);

        // Traverse value set:
        $newUrls = [];
        foreach ($urls as $url) {
            foreach ($valueSet as $val) {
                if (count($newUrls) >= $this->maximumUrlsToCompile) {
                    // Limit reached, nothing more can be added on this level.
                    break 2;
                }
                $val = (string) $val;
                // Empty values do not add a query parameter, the URL is kept as is.
                $newUrls[] = $url . ($val !== '' ? '&' . $encodedVarName . '=' . rawurlencode($val) : '');
            }
        }

        return $this->compileUrls($paramArray, $newUrls);
    }
875
876
    /************************************
877
     *
878
     * Crawler log
879
     *
880
     ************************************/
881
882
    /**
     * Return array of records from crawler queue for input page ID
     *
     * @param integer $id Page ID for which to look up log entries.
     * @param string $filter Filter: "pending" => entries with exec_time = 0, "finished" => entries with exec_time > 0, anything else => no exec_time condition
     * @param boolean $doFlush If TRUE, the queue repository is asked to flush the queue before the entries are selected
     * @param boolean $doFullFlush If TRUE (together with $doFlush), the flush is done with filter "all" instead of $filter
     * @param integer $itemsPerPage Limit the amount of entries per page default is 10; values <= 0 disable the limit
     * @return array
     */
    public function getLogEntriesForPageId($id, $filter = '', $doFlush = false, $doFullFlush = false, $itemsPerPage = 10)
    {
        $queryBuilder = GeneralUtility::makeInstance(ConnectionPool::class)->getQueryBuilderForTable($this->tableName);
        $queryBuilder
            ->select('*')
            ->from($this->tableName)
            ->where(
                $queryBuilder->expr()->eq('page_id', $queryBuilder->createNamedParameter($id, \PDO::PARAM_INT))
            )
            ->orderBy('scheduled', 'DESC');

        switch ($filter) {
            case 'pending':
                $queryBuilder->andWhere($queryBuilder->expr()->eq('exec_time', 0));
                break;
            case 'finished':
                $queryBuilder->andWhere($queryBuilder->expr()->gt('exec_time', 0));
                break;
        }

        if ($doFlush) {
            $this->queueRepository->flushQueue($doFullFlush ? 'all' : $filter);
        }

        if ($itemsPerPage > 0) {
            $queryBuilder
                ->setMaxResults((int) $itemsPerPage);
        }

        return $queryBuilder->execute()->fetchAll();
    }
933
934
    /**
     * Return array of records from crawler queue for input set ID
     *
     * @param int $set_id Set ID for which to look up log entries.
     * @param string $filter Filter: "pending" => entries with exec_time = 0, "finished" => entries with exec_time > 0, anything else => no exec_time condition
     * @param bool $doFlush If TRUE, the matching entries are flushed instead of selected and an empty array is returned
     * @param bool $doFullFlush If TRUE (together with $doFlush), flushQueue() is called with an empty condition instead of the compiled one
     * @param int $itemsPerPage Limit the amount of entries per page default is 10; values <= 0 disable the limit
     * @return array
     *
     * @deprecated
     */
    public function getLogEntriesForSetId(int $set_id, string $filter = '', bool $doFlush = false, bool $doFullFlush = false, int $itemsPerPage = 10)
    {
        $queryBuilder = GeneralUtility::makeInstance(ConnectionPool::class)->getQueryBuilderForTable($this->tableName);
        $queryBuilder
            ->select('*')
            ->from($this->tableName)
            ->where(
                $queryBuilder->expr()->eq('set_id', $queryBuilder->createNamedParameter($set_id, \PDO::PARAM_INT))
            )
            ->orderBy('scheduled', 'DESC');

        // $query collects the conditions (joined with AND) that are handed to flushQueue() below.
        $expressionBuilder = GeneralUtility::makeInstance(ConnectionPool::class)
            ->getConnectionForTable($this->tableName)
            ->getExpressionBuilder();
        $query = $expressionBuilder->andX();

        switch ($filter) {
            case 'pending':
                $queryBuilder->andWhere($queryBuilder->expr()->eq('exec_time', 0));
                $query->add($expressionBuilder->eq('exec_time', 0));
                break;
            case 'finished':
                $queryBuilder->andWhere($queryBuilder->expr()->gt('exec_time', 0));
                $query->add($expressionBuilder->gt('exec_time', 0));
                break;
        }

        if ($doFlush) {
            $addWhere = $query->add($expressionBuilder->eq('set_id', (int) $set_id));
            $this->flushQueue($doFullFlush ? '' : $addWhere);
            return [];
        }

        if ($itemsPerPage > 0) {
            $queryBuilder
                ->setMaxResults((int) $itemsPerPage);
        }

        return $queryBuilder->execute()->fetchAll();
    }
986
987
    /**
     * Inserts a call back entry into the crawler queue (typically called from hooks,
     * see indexed search class "class.crawler.php").
     *
     * @param integer $setId Set ID
     * @param array $params Parameters to pass to call back function; a non-array value is replaced by an empty array
     * @param string $callBack Call back object reference, eg. 'EXT:indexed_search/class.crawler.php:&tx_indexedsearch_crawler'
     * @param integer $page_id Page ID to attach it to
     * @param integer $schedule Time at which to activate; the current time is used when this is 0
     */
    public function addQueueEntry_callBack($setId, $params, $callBack, $page_id = 0, $schedule = 0): void
    {
        $callBackParameters = is_array($params) ? $params : [];
        // The call back reference travels inside the serialized parameters.
        $callBackParameters['_CALLBACKOBJ'] = $callBack;

        $queueConnection = GeneralUtility::makeInstance(ConnectionPool::class)
            ->getConnectionForTable('tx_crawler_queue');

        $queueRecord = [
            'page_id' => (int) $page_id,
            'parameters' => json_encode($callBackParameters),
            'scheduled' => (int) $schedule ?: $this->getCurrentTime(),
            'exec_time' => 0,
            'set_id' => (int) $setId,
            'result_data' => '',
        ];

        $queueConnection->insert('tx_crawler_queue', $queueRecord);
    }
1016
1017
    /************************************
1018
     *
1019
     * URL setting
1020
     *
1021
     ************************************/
1022
1023
    /**
     * Setting a URL for crawling:
     *
     * Builds the queue record for the URL and either registers it internally only
     * ($this->registerQueueEntriesInternallyOnly) or inserts it into tx_crawler_queue,
     * unless the duplication check finds an equal entry. A signal is emitted for both
     * the "added" and the "duplicate" case.
     *
     * @param integer $id Page ID
     * @param string $url Complete URL
     * @param array $subCfg Sub configuration array (from TS config); the keys "userGroups", "procInstrFilter", "procInstrParams." and "key" are read and may be missing
     * @param integer $tstamp Scheduled-time
     * @param string $configurationHash (optional) configuration hash
     * @param bool $skipInnerDuplicationCheck (optional) skip inner duplication check
     * @return bool TRUE only if a new row was inserted into the queue table
     */
    public function addUrl(
        $id,
        $url,
        array $subCfg,
        $tstamp,
        $configurationHash = '',
        $skipInnerDuplicationCheck = false
    ) {
        $urlAdded = false;
        $rows = [];

        // Creating parameters:
        $parameters = [
            'url' => $url,
        ];

        // fe user group simulation:
        $uGs = implode(',', array_unique(GeneralUtility::intExplode(',', $subCfg['userGroups'] ?? '', true)));
        if ($uGs) {
            $parameters['feUserGroupList'] = $uGs;
        }

        // Setting processing instructions
        $parameters['procInstructions'] = GeneralUtility::trimExplode(',', $subCfg['procInstrFilter'] ?? '');
        if (is_array($subCfg['procInstrParams.'] ?? null)) {
            $parameters['procInstrParams'] = $subCfg['procInstrParams.'];
        }

        // Compile value array:
        $parameters_serialized = json_encode($parameters);
        $fieldArray = [
            'page_id' => (int) $id,
            'parameters' => $parameters_serialized,
            'parameters_hash' => GeneralUtility::shortMD5($parameters_serialized),
            'configuration_hash' => $configurationHash,
            'scheduled' => $tstamp,
            'exec_time' => 0,
            'set_id' => (int) $this->setID,
            'result_data' => '',
            'configuration' => $subCfg['key'] ?? null,
        ];

        if ($this->registerQueueEntriesInternallyOnly) {
            //the entries will only be registered and not stored to the database
            $this->queueEntries[] = $fieldArray;

            return $urlAdded;
        }

        if (! $skipInnerDuplicationCheck) {
            // check if there is already an equal entry
            $rows = $this->getDuplicateRowsIfExist($tstamp, $fieldArray);
        }

        if (empty($rows)) {
            $connectionForCrawlerQueue = GeneralUtility::makeInstance(ConnectionPool::class)->getConnectionForTable('tx_crawler_queue');
            $connectionForCrawlerQueue->insert(
                'tx_crawler_queue',
                $fieldArray
            );
            $uid = $connectionForCrawlerQueue->lastInsertId('tx_crawler_queue', 'qid');
            $urlAdded = true;

            $signalPayload = ['uid' => $uid, 'fieldArray' => $fieldArray];
            SignalSlotUtility::emitSignal(
                self::class,
                SignalSlotUtility::SIGNAL_URL_ADDED_TO_QUEUE,
                $signalPayload
            );
        } else {
            $signalPayload = ['rows' => $rows, 'fieldArray' => $fieldArray];
            SignalSlotUtility::emitSignal(
                self::class,
                SignalSlotUtility::SIGNAL_DUPLICATE_URL_IN_QUEUE,
                $signalPayload
            );
        }

        return $urlAdded;
    }
1113
1114
    /**
     * Provides the current system time as returned by time().
     *
     * @return int Unix timestamp
     */
    public function getCurrentTime()
    {
        $currentTimestamp = time();

        return $currentTimestamp;
    }
1123
1124
    /************************************
1125
     *
1126
     * URL reading
1127
     *
1128
     ************************************/
1129
1130
    /**
     * Read URL for single queue entry
     *
     * Loads the queue record, sets its exec_time, lets the queue executor process it
     * and finally stores the JSON encoded result in result_data. Signals are emitted before and
     * after processing.
     *
     * @param integer $queueId
     * @param boolean $force If set, will process even if exec_time has been set!
     * @return integer|null Bit mask (CLI_STATUS_POLLABLE_PROCESSED is set when a configured pollable
     *                      instruction reported success, otherwise 0); null if no matching queue record was found
     */
    public function readUrl($queueId, $force = false)
    {
        $queryBuilder = GeneralUtility::makeInstance(ConnectionPool::class)->getQueryBuilderForTable($this->tableName);
        $ret = 0;
        $this->logger->debug('crawler-readurl start ' . microtime(true));
        // Get entry:
        $queryBuilder
            ->select('*')
            ->from('tx_crawler_queue')
            ->where(
                $queryBuilder->expr()->eq('qid', $queryBuilder->createNamedParameter($queueId, \PDO::PARAM_INT))
            );
        if (! $force) {
            $queryBuilder
                ->andWhere('exec_time = 0')
                ->andWhere('process_scheduled > 0');
        }
        $queueRec = $queryBuilder->execute()->fetch();

        if (! is_array($queueRec)) {
            // Nothing to process (unknown qid, or entry not eligible without $force).
            return null;
        }

        SignalSlotUtility::emitSignal(
            self::class,
            SignalSlotUtility::SIGNAL_QUEUEITEM_PREPROCESS,
            [$queueId, &$queueRec]
        );

        // Set exec_time to lock record:
        $field_array = ['exec_time' => $this->getCurrentTime()];

        if (isset($this->processID)) {
            //if multiprocessing is used we need to store the id of the process which has handled this entry
            $field_array['process_id_completed'] = $this->processID;
        }

        GeneralUtility::makeInstance(ConnectionPool::class)->getConnectionForTable('tx_crawler_queue')
            ->update(
                'tx_crawler_queue',
                $field_array,
                ['qid' => (int) $queueId]
            );

        $result = $this->queueExecutor->executeQueueItem($queueRec, $this);
        if ($result['content'] === null) {
            $resultData = 'An errors happened';
        } else {
            /** @var JsonCompatibilityConverter $jsonCompatibilityConverter */
            $jsonCompatibilityConverter = GeneralUtility::makeInstance(JsonCompatibilityConverter::class);
            $resultData = $jsonCompatibilityConverter->convert($result['content']);
        }

        //atm there's no need to point to specific pollable extensions
        $pollSuccess = $GLOBALS['TYPO3_CONF_VARS']['EXTCONF']['crawler']['pollSuccess'] ?? null;
        if (is_array($pollSuccess)) {
            // $resultData is a plain string in the error case above; only inspect it when it is an array.
            $procInstructions = is_array($resultData)
                ? ($resultData['parameters']['procInstructions'] ?? null)
                : null;

            foreach ($pollSuccess as $pollable) {
                // only check the success value if the instruction is running
                // it is important to name the pollSuccess key same as the procInstructions key
                if (is_array($procInstructions)
                    && in_array($pollable, $procInstructions, true)
                    && ! empty($resultData['success'][$pollable])
                ) {
                    $ret |= self::CLI_STATUS_POLLABLE_PROCESSED;
                }
            }
        }

        // Set result in log which also denotes the end of the processing of this entry.
        $field_array = ['result_data' => json_encode($result)];

        SignalSlotUtility::emitSignal(
            self::class,
            SignalSlotUtility::SIGNAL_QUEUEITEM_POSTPROCESS,
            [$queueId, &$field_array]
        );

        GeneralUtility::makeInstance(ConnectionPool::class)->getConnectionForTable('tx_crawler_queue')
            ->update(
                'tx_crawler_queue',
                $field_array,
                ['qid' => (int) $queueId]
            );

        $this->logger->debug('crawler-readurl stop ' . microtime(true));
        return $ret;
    }
1227
1228
    /**
     * Processes a queue entry that has not been stored yet: the entry is inserted with the
     * current time as exec_time, executed right away, and its JSON encoded result is written
     * back to the new row.
     *
     * @param array $field_array Queue field array,
     *
     * @return array|bool|string The value returned by the queue executor, passed through unmodified
     */
    public function readUrlFromArray($field_array)
    {
        $queueConnection = null;

        // Set exec_time to lock record:
        $field_array['exec_time'] = $this->getCurrentTime();
        $queueConnection = GeneralUtility::makeInstance(ConnectionPool::class)->getConnectionForTable($this->tableName);
        $queueConnection->insert($this->tableName, $field_array);

        // The executor receives the record including its freshly generated qid.
        $queueId = $queueConnection->lastInsertId($this->tableName, 'qid');
        $field_array['qid'] = $queueId;

        $result = $this->queueExecutor->executeQueueItem($field_array, $this);

        // Set result in log which also denotes the end of the processing of this entry.
        $resultFields = ['result_data' => json_encode($result)];

        // Slots receive the fields by reference and may adjust them before they are stored.
        $signalArguments = [$queueId, &$resultFields];
        SignalSlotUtility::emitSignal(
            self::class,
            SignalSlotUtility::SIGNAL_QUEUEITEM_POSTPROCESS,
            $signalArguments
        );

        $queueConnection->update($this->tableName, $resultFields, ['qid' => $queueId]);

        return $result;
    }
1265
1266
    /*****************************
1267
     *
1268
     * Compiling URLs to crawl - tools
1269
     *
1270
     *****************************/
1271
1272
    /**
     * Walks the page tree below (and including) the given page and collects the output of
     * drawURLs_addRowsForPage() for every page. Mount point pages additionally get the
     * tree of their mounted page processed.
     *
     * The arguments are stored on the controller ($this->scheduledTime, $this->reqMinute, ...)
     * before the tree is traversed; $this->duplicateTrack and $this->downloadUrls are reset.
     *
     * @param integer $id Root page id to start from.
     * @param integer $depth Depth of tree, 0=only id-page, 1= on sublevel, 99 = infinite
     * @param integer $scheduledTime Unix Time when the URL is timed to be visited when put in queue
     * @param integer $reqMinute Number of requests per minute (creates the interleave between requests)
     * @param boolean $submitCrawlUrls If set, submits the URLs to queue in database (real crawling)
     * @param boolean $downloadCrawlUrls If set (and submitcrawlUrls is false) will fill $downloadUrls with entries)
     * @param array $incomingProcInstructions Array of processing instructions
     * @param array $configurationSelection Array of configuration keys
     * @return string Concatenated return values of drawURLs_addRowsForPage()
     */
    public function getPageTreeAndUrls(
        $id,
        $depth,
        $scheduledTime,
        $reqMinute,
        $submitCrawlUrls,
        $downloadCrawlUrls,
        array $incomingProcInstructions,
        array $configurationSelection
    ) {
        $this->scheduledTime = $scheduledTime;
        $this->reqMinute = $reqMinute;
        $this->submitCrawlUrls = $submitCrawlUrls;
        $this->downloadCrawlUrls = $downloadCrawlUrls;
        $this->incomingProcInstructions = $incomingProcInstructions;
        $this->incomingConfigurationSelection = $configurationSelection;

        $this->duplicateTrack = [];
        $this->downloadUrls = [];

        // Drawing tree:
        /** @var PageTreeView $tree */
        $tree = GeneralUtility::makeInstance(PageTreeView::class);
        $perms_clause = $this->getBackendUser()->getPagePermsClause(Permission::PAGE_SHOW);
        $tree->init('AND ' . $perms_clause);

        $pageInfo = BackendUtility::readPageAccess($id, $perms_clause);
        if (is_array($pageInfo)) {
            // Set root row:
            $tree->tree[] = [
                'row' => $pageInfo,
                'HTML' => $this->iconFactory->getIconForRecord('pages', $pageInfo, Icon::SIZE_SMALL),
            ];
        }

        // Get branch beneath:
        if ($depth) {
            $tree->getTree($id, $depth, '');
        }

        // Traverse page tree:
        $code = '';

        foreach ($tree->tree as $data) {
            $this->MP = false;

            // recognize mount points
            // (database rows usually carry doktype as a string, so cast before the strict comparison)
            if ((int) $data['row']['doktype'] === PageRepository::DOKTYPE_MOUNTPOINT) {
                $queryBuilder = GeneralUtility::makeInstance(ConnectionPool::class)->getQueryBuilderForTable('pages');
                $queryBuilder->getRestrictions()->removeAll()->add(GeneralUtility::makeInstance(DeletedRestriction::class));
                $mountpage = $queryBuilder
                    ->select('*')
                    ->from('pages')
                    ->where(
                        $queryBuilder->expr()->eq('uid', $queryBuilder->createNamedParameter($data['row']['uid'], \PDO::PARAM_INT))
                    )
                    ->execute()
                    ->fetchAll();
                $queryBuilder->resetRestrictions();

                // fetch mounted pages
                $this->MP = $mountpage[0]['mount_pid'] . '-' . $data['row']['uid'];

                $mountTree = GeneralUtility::makeInstance(PageTreeView::class);
                $mountTree->init('AND ' . $perms_clause);
                $mountTree->getTree($mountpage[0]['mount_pid'], $depth);

                foreach ($mountTree->tree as $mountData) {
                    $code .= $this->drawURLs_addRowsForPage(
                        $mountData['row'],
                        $mountData['HTML'] . BackendUtility::getRecordTitle('pages', $mountData['row'], true)
                    );
                }

                // replace page when mount_pid_ol is enabled
                if ($mountpage[0]['mount_pid_ol']) {
                    $data['row']['uid'] = $mountpage[0]['mount_pid'];
                } else {
                    // if the mount_pid_ol is not set the MP must not be used for the mountpoint page
                    $this->MP = false;
                }
            }

            $code .= $this->drawURLs_addRowsForPage(
                $data['row'],
                $data['HTML'] . BackendUtility::getRecordTitle('pages', $data['row'], true)
            );
        }

        return $code;
    }
1374
1375
    /**
     * Expands an exclude string into the list of page uids it covers.
     *
     * The string is a comma separated list of entries of the form "<pid>" (page only),
     * "<pid>+<depth>" (page plus <depth> levels of its page tree) or "<pid>+" (page plus
     * its page tree fetched with a depth of 99). Results are memoised in function-static
     * caches, both per exclude string and per pid/depth combination.
     *
     * @param string $excludeString Exclude string
     * @return int[] Unique page uids, all normalised to integers
     */
    public function expandExcludeString($excludeString)
    {
        // internal static caches
        static $expandedExcludeStringCache = [];
        static $treeCache = [];

        $cacheKey = (string) $excludeString;

        // isset() rather than empty(): an exclude string expanding to "no pages" is a valid cached result
        if (isset($expandedExcludeStringCache[$cacheKey])) {
            return $expandedExcludeStringCache[$cacheKey];
        }

        $pidList = [];

        if (! empty($excludeString)) {
            /** @var PageTreeView $tree */
            $tree = GeneralUtility::makeInstance(PageTreeView::class);
            $tree->init('AND ' . $this->getBackendUser()->getPagePermsClause(Permission::PAGE_SHOW));

            // third argument: skip empty list entries such as the one produced by a trailing comma
            $excludeParts = GeneralUtility::trimExplode(',', $cacheKey, true);

            foreach ($excludeParts as $excludePart) {
                $pidAndDepth = GeneralUtility::trimExplode('+', $excludePart);

                // Normalise to integers: the result is used with strict comparisons and
                // getTree() expects an integer uid. A part without "+" has no depth element.
                $pid = (int) ($pidAndDepth[0] ?? 0);
                $depth = (int) ($pidAndDepth[1] ?? 0);

                // default is "page only" = "depth=0"; a "+" without a usable depth selects depth 99
                if ($depth === 0) {
                    $depth = strpos($excludePart, '+') !== false ? 99 : 0;
                }

                $pidList[] = $pid;

                if ($depth > 0) {
                    if (! isset($treeCache[$pid][$depth])) {
                        $tree->reset();
                        $tree->getTree($pid, $depth);
                        $treeCache[$pid][$depth] = $tree->tree;
                    }

                    foreach ($treeCache[$pid][$depth] as $data) {
                        $pidList[] = (int) $data['row']['uid'];
                    }
                }
            }
        }

        $expandedExcludeStringCache[$cacheKey] = array_unique($pidList);

        return $expandedExcludeStringCache[$cacheKey];
    }
1426
1427
    /**
     * Create the rows for display of the page tree.
     *
     * For each page a number of rows are shown displaying GET variable configuration:
     * one table row per crawler configuration returned for the page, or a single
     * "No entries" row when there is none (left).
     *
     * @param array $pageRow Page record; its "uid" is matched against each configuration's exclude list
     * @param string $pageTitle Content of the title cell; inserted as-is, so it must already be safe HTML
     * @return string The generated <tr> elements
     */
    public function drawURLs_addRowsForPage(array $pageRow, string $pageTitle): string
    {
        // NOTE(review): presumably filled by reference inside getUrlsForPageRow() - confirm against that method
        $skipMessage = '';

        // Get list of configurations
        $configurations = $this->getUrlsForPageRow($pageRow, $skipMessage);

        if (! empty($this->incomingConfigurationSelection)) {
            // remove configuration that does not match the current selection
            foreach ($configurations as $confKey => $confArray) {
                if (! in_array($confKey, $this->incomingConfigurationSelection, true)) {
                    unset($configurations[$confKey]);
                }
            }
        }

        if (empty($configurations)) {
            $message = ! empty($skipMessage) ? ' (' . $skipMessage . ')' : '';

            // Compile row:
            return '
                <tr>
                    <td>' . $pageTitle . '</td>
                    <td colspan="6"><em>No entries</em>' . $message . '</td>
                </tr>';
        }

        $pageUid = (int) $pageRow['uid'];
        $rowCount = count($configurations);
        $isFirstRow = true;
        $content = '';

        // Traverse parameter combinations:
        foreach ($configurations as $confKey => $confArray) {
            // Title column: emitted once, spanning all configuration rows of this page
            $titleClm = $isFirstRow ? '<td rowspan="' . $rowCount . '">' . $pageTitle . '</td>' : '';
            $isFirstRow = false;

            // Numeric array keys are integers in PHP; htmlspecialchars() requires a string under strict_types
            $confKeyHtml = htmlspecialchars((string) $confKey);

            // The sub configuration keys read below are optional
            $subCfg = $confArray['subCfg'] ?? [];

            // Compare uids as integers so that "5" and 5 are recognised as the same page
            $excludedPageUids = array_map('intval', $this->expandExcludeString($subCfg['exclude'] ?? ''));

            if (in_array($pageUid, $excludedPageUids, true)) {
                $content .= '<tr>
                            ' . $titleClm . '
                            <td>' . $confKeyHtml . '</td>
                            <td colspan="5"><em>No entries</em> (Page is excluded in this configuration)</td>
                        </tr>';
                continue;
            }

            // URL list:
            $urlList = $this->urlListFromUrlArray(
                $confArray,
                $pageRow,
                $this->scheduledTime,
                $this->reqMinute,
                $this->submitCrawlUrls,
                $this->downloadCrawlUrls,
                $this->duplicateTrack,
                $this->downloadUrls,
                $this->incomingProcInstructions // if empty the urls won't be filtered by processing instructions
            );

            // Expanded parameters:
            $paramExpanded = '';
            $calcAccu = [];
            $calcRes = 1;
            foreach ($confArray['paramExpanded'] ?? [] as $gVar => $gVal) {
                $valueCount = count($gVal);
                $paramExpanded .= '
                        <tr>
                            <td>' . htmlspecialchars('&' . $gVar . '=') . '<br/>' .
                    '(' . $valueCount . ')' .
                    '</td>
                            <td nowrap="nowrap">' . nl2br(htmlspecialchars(implode(chr(10), $gVal))) . '</td>
                        </tr>
                    ';
                $calcRes *= $valueCount;
                $calcAccu[] = $valueCount;
            }
            $paramExpanded = '<table>' . $paramExpanded . '</table>';
            $paramExpanded .= 'Comb: ' . implode('*', $calcAccu) . '=' . $calcRes;

            // Options (configuration values are escaped before they are written into the HTML output)
            $optionValues = '';
            if (! empty($subCfg['userGroups'])) {
                $optionValues .= 'User Groups: ' . htmlspecialchars((string) $subCfg['userGroups']) . '<br/>';
            }
            if (! empty($subCfg['procInstrFilter'])) {
                $optionValues .= 'ProcInstr: ' . htmlspecialchars((string) $subCfg['procInstrFilter']) . '<br/>';
            }

            // One "&name=value" pair per line
            $parsedParameters = nl2br(htmlspecialchars(rawurldecode(trim(str_replace(
                '&',
                chr(10) . '&',
                GeneralUtility::implodeArrayForUrl('', $confArray['paramParsed'] ?? [])
            )))));

            // Compile row:
            $content .= '
                    <tr>
                        ' . $titleClm . '
                        <td>' . $confKeyHtml . '</td>
                        <td>' . $parsedParameters . '</td>
                        <td>' . $paramExpanded . '</td>
                        <td nowrap="nowrap">' . $urlList . '</td>
                        <td nowrap="nowrap">' . $optionValues . '</td>
                        <td nowrap="nowrap">' . DebugUtility::viewArray($subCfg['procInstrParams.'] ?? null) . '</td>
                    </tr>';
        }

        return $content;
    }
1537
1538
    /*****************************
1539
     *
1540
     * CLI functions
1541
     *
1542
     *****************************/
1543
1544
    /**
     * Running the functionality of the CLI (crawling URLs from queue).
     *
     * Runs the CLI hooks, cleans up the queue, fetches up to $countInARun queue entries,
     * assigns them to the current process id and reads their URLs one after another.
     *
     * @param int $countInARun Maximum number of queue entries to fetch for this run
     * @param int $sleepTime Value passed to usleep() after every processed entry
     * @param int $sleepAfterFinish Value passed to sleep() after the batch
     * @return int Bitmask built from the readUrl() results, self::CLI_STATUS_ABORTED and self::CLI_STATUS_PROCESSED
     */
    public function CLI_run(int $countInARun, int $sleepTime, int $sleepAfterFinish): int
    {
        $status = 0;
        $processedCount = 0;

        // Hooks first, then queue clean-up
        $this->CLI_runHooks();
        $this->queueRepository->cleanupQueue();

        $queueRows = $this->queueRepository->fetchRecordsToBeCrawled($countInARun);

        if (empty($queueRows)) {
            $this->CLI_debug('Nothing within queue which needs to be processed (' . $this->CLI_buildProcessId() . ')');

            return $status;
        }

        $queueIds = [];
        foreach ($queueRows as $queueRow) {
            $queueIds[] = $queueRow['qid'];
        }

        $processId = $this->CLI_buildProcessId();

        // remember how many queue entries were assigned, to determine later how many have been processed
        $assignedCount = $this->queueRepository->updateProcessIdAndSchedulerForQueueIds($queueIds, $processId);
        $this->processRepository->updateProcessAssignItemsCount($assignedCount, $processId);

        // fewer (or more) rows updated than requested: treated as a collision with another process
        if ($assignedCount !== count($queueIds)) {
            $this->CLI_debug('Nothing processed due to multi-process collision (' . $this->CLI_buildProcessId() . ')');

            return $status | self::CLI_STATUS_ABORTED;
        }

        foreach ($queueRows as $queueRow) {
            $status |= $this->readUrl($queueRow['qid']);
            $processedCount++;

            // Just to relax the system
            usleep($sleepTime);

            // the CLI has been disabled since the run started: leave immediately,
            // the process is NOT marked as ended
            if ($this->getDisabled()) {
                return $status | self::CLI_STATUS_ABORTED;
            }

            // possible timeout
            if (! $this->processRepository->isProcessActive($this->CLI_buildProcessId())) {
                $this->CLI_debug('conflict / timeout (' . $this->CLI_buildProcessId() . ')');
                $status |= self::CLI_STATUS_ABORTED;
                break;
            }
        }

        sleep($sleepAfterFinish);

        $this->CLI_debug('Rows: ' . $processedCount . ' (' . $this->CLI_buildProcessId() . ')');

        if ($processedCount > 0) {
            $status |= self::CLI_STATUS_PROCESSED;
        }

        return $status;
    }
1612
1613
    /**
     * Activate hooks.
     *
     * Instantiates every class registered in
     * $GLOBALS['TYPO3_CONF_VARS']['EXTCONF']['crawler']['cli_hooks'] and calls its
     * crawler_init() method with this controller as argument.
     */
    public function CLI_runHooks(): void
    {
        $registeredHooks = $GLOBALS['TYPO3_CONF_VARS']['EXTCONF']['crawler']['cli_hooks'] ?? [];

        foreach ($registeredHooks as $hookClassName) {
            $hook = GeneralUtility::makeInstance($hookClassName);

            if (! is_object($hook)) {
                continue;
            }

            $hook->crawler_init($this);
        }
    }
1625
1626
    /**
     * Try to acquire a new process with the given id.
     *
     * Counts the active, non-deleted process records whose ttl has not expired yet and
     * registers a new record for $id when that count is below the "processLimit" setting.
     * Expired records are handed to CLI_releaseProcesses() as an auto-cleanup for orphans.
     *
     * @param string $id identification string for the process
     * @return bool true if the process record was inserted, false if no system process id
     *              is available or the process limit is reached
     * @todo preemption might not be the most elegant way to clean up
     */
    public function CLI_checkAndAcquireNewProcess($id)
    {
        $queryBuilder = GeneralUtility::makeInstance(ConnectionPool::class)->getQueryBuilderForTable($this->tableName);

        $systemProcessId = getmypid();
        if ($systemProcessId < 1) {
            return false;
        }

        $activeProcesses = $queryBuilder
            ->select('process_id', 'ttl')
            ->from('tx_crawler_process')
            ->where('active = 1 AND deleted = 0')
            ->execute()
            ->fetchAll();

        $currentTime = $this->getCurrentTime();

        // Split into processes that are still within their ttl and expired (orphaned) ones
        $runningCount = 0;
        $orphanProcesses = [];
        foreach ($activeProcesses as $process) {
            if ($process['ttl'] < $currentTime) {
                $orphanProcesses[] = $process['process_id'];
                continue;
            }
            $runningCount++;
        }

        $processLimit = (int) $this->extensionSettings['processLimit'];
        $acquired = $runningCount < $processLimit;

        if ($acquired) {
            // there are less than allowed active processes, so add a new one
            $this->CLI_debug('add process ' . $this->CLI_buildProcessId() . ' (' . ($runningCount + 1) . '/' . $processLimit . ')');

            $newProcess = [
                'process_id' => $id,
                'active' => 1,
                'ttl' => $currentTime + (int) $this->extensionSettings['processMaxRunTime'],
                'system_process_id' => $systemProcessId,
            ];
            GeneralUtility::makeInstance(ConnectionPool::class)
                ->getConnectionForTable('tx_crawler_process')
                ->insert('tx_crawler_process', $newProcess);
        } else {
            $this->CLI_debug('Processlimit reached (' . $runningCount . '/' . $processLimit . ')');
        }

        $this->processRepository->deleteProcessesMarkedAsDeleted();
        $this->CLI_releaseProcesses($orphanProcesses);

        return $acquired;
    }
1687
1688
    /**
     * Release a process and the required resources.
     *
     * Besides deactivating the given processes this also performs two general clean-up
     * statements (see inline comments). Each statement gets its own query builder so that
     * no query parts or bound parameters leak from one statement into the next.
     *
     * @param mixed $releaseIds string with a single process-id or array with multiple process-ids
     * @return bool false if there was nothing to release, true otherwise
     */
    public function CLI_releaseProcesses($releaseIds)
    {
        if (! is_array($releaseIds)) {
            $releaseIds = [$releaseIds];
        }

        if (empty($releaseIds)) {
            return false;   //nothing to release
        }

        $connectionPool = GeneralUtility::makeInstance(ConnectionPool::class);

        // some kind of 2nd chance algo - this way you need at least 2 processes to have a real cleanup
        // this ensures that a single process can't mess up the entire process table

        // detach all queue entries that are still assigned to a process which is not active (anymore)
        $queueQueryBuilder = $connectionPool->getQueryBuilderForTable($this->tableName);
        $queueQueryBuilder
            ->update($this->tableName, 'q')
            ->where(
                'q.process_id IN(SELECT p.process_id FROM tx_crawler_process as p WHERE p.active = 0)'
            )
            ->set('q.process_scheduled', 0)
            ->set('q.process_id', '')
            ->execute();

        // reset the system process id of inactive processes that are referenced by
        // queue entries which have not been executed yet (exec_time = 0)
        $processQueryBuilder = $connectionPool->getQueryBuilderForTable('tx_crawler_process');
        $processQueryBuilder
            ->update('tx_crawler_process')
            ->where(
                $processQueryBuilder->expr()->eq('active', 0),
                'process_id IN(SELECT q.process_id FROM tx_crawler_queue as q WHERE q.exec_time = 0)'
            )
            ->set('system_process_id', 0)
            ->execute();

        $this->processRepository->markRequestedProcessesAsNotActive($releaseIds);
        $this->queueRepository->unsetProcessScheduledAndProcessIdForQueueEntries($releaseIds);

        return true;
    }
1737
1738
    /**
     * Create a unique Id for the current process.
     *
     * The id is derived once from the current microtime and then kept in $this->processID,
     * so repeated calls return the same value.
     *
     * @return string  the ID
     */
    public function CLI_buildProcessId()
    {
        if ($this->processID) {
            return $this->processID;
        }

        $this->processID = GeneralUtility::shortMD5(microtime(true));

        return $this->processID;
    }
1750
1751
    /**
     * Prints a message to the stdout (only if debug-mode is enabled).
     *
     * Debug mode is controlled by the "processDebug" extension setting.
     *
     * @param string $msg the message
     */
    public function CLI_debug($msg): void
    {
        $debugEnabled = (int) $this->extensionSettings['processDebug'] !== 0;
        if (! $debugEnabled) {
            return;
        }

        echo $msg . "\n";
        flush();
    }
1763
1764
    /**
     * Cleans up entries that stayed for too long in the queue. These are:
     * - processed entries older than the "cleanUpProcessedAge" setting (in days)
     * - scheduled entries older than the "cleanUpScheduledAge" setting (in days)
     *
     * @deprecated
     */
    public function cleanUpOldQueueEntries(): void
    {
        $secondsPerDay = 86400; // 24*60*60 Seconds in 24 hours

        // The settings may be strings (possibly fractional days). They end up in a raw SQL
        // condition, so they are reduced to plain integers before being concatenated.
        $processedAgeInSeconds = (int) ((float) ($this->extensionSettings['cleanUpProcessedAge'] ?? 0) * $secondsPerDay);
        $scheduledAgeInSeconds = (int) ((float) ($this->extensionSettings['cleanUpScheduledAge'] ?? 0) * $secondsPerDay);

        $now = time();
        $condition = '(exec_time<>0 AND exec_time<' . ($now - $processedAgeInSeconds) . ') OR scheduled<=' . ($now - $scheduledAgeInSeconds);
        $this->flushQueue($condition);
    }
1780
1781
    /**
     * Removes queue entries.
     *
     * Before deleting, the SIGNAL_QUEUE_ENTRY_FLUSH signal is emitted once per affected
     * set_id with the matching rows (qid, set_id) as "subSet" payload.
     *
     * Every statement uses its own query builder: re-using a single builder would carry
     * query parts and bound parameters over from one statement to the next.
     *
     * @param string $where SQL related filter for the entries which should be removed;
     *                      an empty filter removes all entries
     *
     * @deprecated
     */
    protected function flushQueue($where = ''): void
    {
        $whereString = (string) $where;
        $realWhere = $whereString !== '' ? $whereString : '1=1';

        $groups = $this->getQueryBuilder($this->tableName)
            ->selectLiteral('DISTINCT set_id')
            ->from($this->tableName)
            ->where($realWhere)
            ->execute()
            ->fetchAll();

        foreach ($groups as $group) {
            $subSetQueryBuilder = $this->getQueryBuilder($this->tableName);
            $subSet = $subSetQueryBuilder
                ->select('qid', 'set_id')
                ->from($this->tableName)
                ->where(
                    $realWhere,
                    // bound as parameter instead of being written into the SQL string
                    $subSetQueryBuilder->expr()->eq(
                        'set_id',
                        $subSetQueryBuilder->createNamedParameter((int) $group['set_id'], \PDO::PARAM_INT)
                    )
                )
                ->execute()
                ->fetchAll();

            $payLoad = ['subSet' => $subSet];
            SignalSlotUtility::emitSignal(
                self::class,
                SignalSlotUtility::SIGNAL_QUEUE_ENTRY_FLUSH,
                $payLoad
            );
        }

        $this->getQueryBuilder($this->tableName)
            ->delete($this->tableName)
            ->where($realWhere)
            ->execute();
    }
1826
1827
    /**
     * This method determines duplicates for a queue entry with the same parameters and this timestamp.
     * If the timestamp is in the past, it will check if there is any unprocessed queue entry in the past.
     * If the timestamp is in the future it will check, if the queued entry has exactly the same timestamp
     *
     * Only entries that have not been executed (exec_time = 0) and are not assigned to a
     * process (empty process_id) are considered.
     *
     * @param int $tstamp
     * @param array $fieldArray Must contain the keys "page_id" and "parameters_hash"
     *
     * @return array List of qid values of the duplicate queue entries
     */
    protected function getDuplicateRowsIfExist($tstamp, $fieldArray)
    {
        $currentTime = $this->getCurrentTime();

        $queryBuilder = GeneralUtility::makeInstance(ConnectionPool::class)->getQueryBuilderForTable($this->tableName);
        $queryBuilder
            ->select('qid')
            ->from('tx_crawler_queue');

        if ($tstamp > $currentTime) {
            // entry with a timestamp in the future need to have the same schedule time
            $queryBuilder->where(
                $queryBuilder->expr()->eq('scheduled', $queryBuilder->createNamedParameter((int) $tstamp, \PDO::PARAM_INT))
            );
        } elseif (! empty($this->extensionSettings['enableTimeslot'])) {
            // entry is scheduled with "now": also accept entries within +/- 100 seconds around the current time
            $timeBegin = $currentTime - 100;
            $timeEnd = $currentTime + 100;
            $queryBuilder
                ->where(
                    'scheduled BETWEEN ' . $timeBegin . ' AND ' . $timeEnd . ''
                )
                ->orWhere(
                    $queryBuilder->expr()->lte('scheduled', $currentTime)
                );
        } else {
            // entry is scheduled with "now"
            $queryBuilder->where(
                $queryBuilder->expr()->lte('scheduled', $currentTime)
            );
        }

        $queryBuilder
            ->andWhere($queryBuilder->expr()->eq('exec_time', 0))
            // Explicit comparison with the empty string: "NOT process_id" depends on the database
            // converting the string to a number, which treats ids starting with a letter as empty
            ->andWhere($queryBuilder->expr()->eq('process_id', $queryBuilder->createNamedParameter('', \PDO::PARAM_STR)))
            ->andWhere($queryBuilder->expr()->eq('page_id', $queryBuilder->createNamedParameter($fieldArray['page_id'], \PDO::PARAM_INT)))
            ->andWhere($queryBuilder->expr()->eq('parameters_hash', $queryBuilder->createNamedParameter($fieldArray['parameters_hash'], \PDO::PARAM_STR)));

        return array_column($queryBuilder->execute()->fetchAll(), 'qid');
    }
1887
1888
    /**
     * Returns a md5 hash generated from a serialized configuration array.
     *
     * The entries "paramExpanded" and "URLs" are left out, so they do not influence the hash.
     *
     * @return string
     */
    protected function getConfigurationHash(array $configuration)
    {
        $ignoredKeys = ['paramExpanded' => true, 'URLs' => true];
        $hashRelevantConfiguration = array_diff_key($configuration, $ignoredKeys);

        return md5(serialize($hashRelevantConfiguration));
    }
1899
1900
    /**
     * Build a URL from a Page and the Query String. If the page has a Site configuration, it can be built by using
     * the Site instance.
     *
     * @param int $pageId Page the URL is built for
     * @param string $queryString Query string; a leading "?" or "&" is stripped when a Site is found
     * @param string|null $alternativeBaseUrl If not empty, its scheme, host, port and user info replace those of the URL
     * @param int $httpsOrHttp see tx_crawler_configuration.force_ssl: -1 forces "http", 1 forces "https",
     *                         any other value keeps the scheme
     * @throws \TYPO3\CMS\Core\Exception\SiteNotFoundException
     * @throws \TYPO3\CMS\Core\Routing\InvalidRouteArgumentsException
     */
    protected function getUrlFromPageAndQueryParameters(int $pageId, string $queryString, ?string $alternativeBaseUrl, int $httpsOrHttp): UriInterface
    {
        $site = GeneralUtility::makeInstance(SiteMatcher::class)->matchByPageId($pageId);
        if ($site instanceof Site) {
            $queryString = ltrim($queryString, '?&');
            $queryParts = [];
            parse_str($queryString, $queryParts);
            unset($queryParts['id']);
            // workaround as long as we don't have native language support in crawler configurations:
            // the language id is handed to the router via the "_language" argument
            if (isset($queryParts['L'])) {
                $queryParts['_language'] = $queryParts['L'];
                unset($queryParts['L']);
            }
            $url = $site->getRouter()->generateUri($pageId, $queryParts);
            if (! empty($alternativeBaseUrl)) {
                $alternativeBaseUrl = new Uri($alternativeBaseUrl);
                $url = $url->withHost($alternativeBaseUrl->getHost());
                $url = $url->withScheme($alternativeBaseUrl->getScheme());
                $url = $url->withPort($alternativeBaseUrl->getPort());
                if ($userInfo = $alternativeBaseUrl->getUserInfo()) {
                    $url = $url->withUserInfo($userInfo);
                }
            }
        } else {
            // Technically this is not possible with site handling, but kept for backwards-compatibility reasons
            // Once EXT:crawler is v10-only compatible, this should be removed completely
            $baseUrl = ($alternativeBaseUrl ?: GeneralUtility::getIndpEnv('TYPO3_SITE_URL'));
            $cacheHashCalculator = GeneralUtility::makeInstance(CacheHashCalculator::class);
            $queryString .= '&cHash=' . $cacheHashCalculator->generateForParameters($queryString);
            $url = rtrim($baseUrl, '/') . '/index.php' . $queryString;
            $url = new Uri($url);
        }

        if ($httpsOrHttp === -1) {
            $url = $url->withScheme('http');
        } elseif ($httpsOrHttp === 1) {
            $url = $url->withScheme('https');
        }

        return $url;
    }
1952
1953 1
    /**
     * Makes sure the values at index 1 and 2 of the given array are in ascending order.
     *
     * @param array $reg Array whose elements 1 and 2 are compared with ">"
     * @return array The array, with elements 1 and 2 exchanged if the first was larger
     */
    protected function swapIfFirstIsLargerThanSecond(array $reg): array
    {
        $firstIsLarger = $reg[1] > $reg[2];

        if ($firstIsLarger) {
            [$reg[1], $reg[2]] = [$reg[2], $reg[1]];
        }

        return $reg;
    }
1964
1965
    /**
     * Returns the backend user, taken from $GLOBALS['BE_USER'] on first use and kept in
     * $this->backendUser afterwards.
     *
     * @return BackendUserAuthentication
     */
    private function getBackendUser()
    {
        // Make sure the _cli_ user is loaded
        Bootstrap::initializeBackendAuthentication();

        if ($this->backendUser !== null) {
            return $this->backendUser;
        }

        $this->backendUser = $GLOBALS['BE_USER'];

        return $this->backendUser;
    }
1977
1978
    /**
     * Get querybuilder for given table
     *
     * @param string $table Name of the table the query builder is requested for
     * @return \TYPO3\CMS\Core\Database\Query\QueryBuilder
     */
    private function getQueryBuilder(string $table)
    {
        $connectionPool = GeneralUtility::makeInstance(ConnectionPool::class);

        return $connectionPool->getQueryBuilderForTable($table);
    }
1987
}
1988