Completed
Push — ci/infection ( c77b24...2591ac )
by Tomas Norre
14:39
created

CrawlerController::urlListFromUrlArray()   B

Complexity

Conditions 8
Paths 8

Size

Total Lines 66
Code Lines 38

Duplication

Lines 0
Ratio 0 %

Code Coverage

Tests 1
CRAP Score 67.0819

Importance

Changes 1
Bugs 0 Features 0
Metric Value
cc 8
eloc 38
nc 8
nop 9
dl 0
loc 66
ccs 1
cts 38
cp 0.0263
crap 67.0819
rs 8.0675
c 1
b 0
f 0

How to fix   Long Method    Many Parameters   

Long Method

Small methods make your code easier to understand, in particular if combined with a good name. Besides, if your method is small, finding a good name is usually much easier.

For example, if you find yourself adding comments to a method's body, this is usually a good sign to extract the commented part to a new method, and use the comment as a starting point when coming up with a good name for this new method.

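Commonly applied refactorings include Extract Method and, when many parameters or temporary variables are present, Replace Temp with Query, Introduce Parameter Object, and Preserve Whole Object.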

Many Parameters

Methods with many parameters are not only hard to understand, but their parameters also often become inconsistent when you need more, or different data.

There are several approaches to avoid long parameter lists: Replace Parameter with Method Call, Preserve Whole Object, and Introduce Parameter Object.

1
<?php
2
3
declare(strict_types=1);
4
5
namespace AOE\Crawler\Controller;
6
7
/***************************************************************
8
 *  Copyright notice
9
 *
10
 *  (c) 2020 AOE GmbH <[email protected]>
11
 *
12
 *  All rights reserved
13
 *
14
 *  This script is part of the TYPO3 project. The TYPO3 project is
15
 *  free software; you can redistribute it and/or modify
16
 *  it under the terms of the GNU General Public License as published by
17
 *  the Free Software Foundation; either version 3 of the License, or
18
 *  (at your option) any later version.
19
 *
20
 *  The GNU General Public License can be found at
21
 *  http://www.gnu.org/copyleft/gpl.html.
22
 *
23
 *  This script is distributed in the hope that it will be useful,
24
 *  but WITHOUT ANY WARRANTY; without even the implied warranty of
25
 *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
26
 *  GNU General Public License for more details.
27
 *
28
 *  This copyright notice MUST APPEAR in all copies of the script!
29
 ***************************************************************/
30
31
use AOE\Crawler\Configuration\ExtensionConfigurationProvider;
32
use AOE\Crawler\Converter\JsonCompatibilityConverter;
33
use AOE\Crawler\Domain\Repository\ConfigurationRepository;
34
use AOE\Crawler\Domain\Repository\ProcessRepository;
35
use AOE\Crawler\Domain\Repository\QueueRepository;
36
use AOE\Crawler\QueueExecutor;
37
use AOE\Crawler\Utility\SignalSlotUtility;
38
use Psr\Http\Message\UriInterface;
39
use Psr\Log\LoggerAwareInterface;
40
use Psr\Log\LoggerAwareTrait;
41
use TYPO3\CMS\Backend\Tree\View\PageTreeView;
42
use TYPO3\CMS\Backend\Utility\BackendUtility;
43
use TYPO3\CMS\Core\Authentication\BackendUserAuthentication;
44
use TYPO3\CMS\Core\Compatibility\PublicMethodDeprecationTrait;
45
use TYPO3\CMS\Core\Core\Bootstrap;
46
use TYPO3\CMS\Core\Core\Environment;
47
use TYPO3\CMS\Core\Database\Connection;
48
use TYPO3\CMS\Core\Database\ConnectionPool;
49
use TYPO3\CMS\Core\Database\Query\Restriction\DeletedRestriction;
50
use TYPO3\CMS\Core\Http\Uri;
51
use TYPO3\CMS\Core\Imaging\Icon;
52
use TYPO3\CMS\Core\Imaging\IconFactory;
53
use TYPO3\CMS\Core\Routing\SiteMatcher;
54
use TYPO3\CMS\Core\Site\Entity\Site;
55
use TYPO3\CMS\Core\Type\Bitmask\Permission;
56
use TYPO3\CMS\Core\TypoScript\Parser\TypoScriptParser;
57
use TYPO3\CMS\Core\Utility\DebugUtility;
58
use TYPO3\CMS\Core\Utility\GeneralUtility;
59
use TYPO3\CMS\Core\Utility\MathUtility;
60
use TYPO3\CMS\Extbase\Object\ObjectManager;
61
use TYPO3\CMS\Frontend\Page\CacheHashCalculator;
62
use TYPO3\CMS\Frontend\Page\PageRepository;
63
64
/**
65
 * Class CrawlerController
66
 *
67
 * @package AOE\Crawler\Controller
68
 */
69
class CrawlerController implements LoggerAwareInterface
70
{
71
    use LoggerAwareTrait;
72
    use PublicMethodDeprecationTrait;
73
74
    public const CLI_STATUS_NOTHING_PROCCESSED = 0;
75
76
    public const CLI_STATUS_REMAIN = 1; //queue not empty
77
78
    public const CLI_STATUS_PROCESSED = 2; //(some) queue items where processed
79
80
    public const CLI_STATUS_ABORTED = 4; //instance didn't finish
81
82
    public const CLI_STATUS_POLLABLE_PROCESSED = 8;
83
84
    /**
85
     * @var integer
86
     */
87
    public $setID = 0;
88
89
    /**
90
     * @var string
91
     */
92
    public $processID = '';
93
94
    /**
95
     * @var array
96
     */
97
    public $duplicateTrack = [];
98
99
    /**
100
     * @var array
101
     */
102
    public $downloadUrls = [];
103
104
    /**
105
     * @var array
106
     */
107
    public $incomingProcInstructions = [];
108
109
    /**
110
     * @var array
111
     */
112
    public $incomingConfigurationSelection = [];
113
114
    /**
115
     * @var bool
116
     */
117
    public $registerQueueEntriesInternallyOnly = false;
118
119
    /**
120
     * @var array
121
     */
122
    public $queueEntries = [];
123
124
    /**
125
     * @var array
126
     */
127
    public $urlList = [];
128
129
    /**
130
     * @var array
131
     */
132
    public $extensionSettings = [];
133
134
    /**
135
     * Mount Point
136
     *
137
     * @var bool
138
     * Todo: Check what this is used for and adjust the type hint or code, as bool doesn't match the current code.
139
     */
140
    public $MP = false;
141
142
    /**
143
     * @var string
144
     */
145
    protected $processFilename;
146
147
    /**
148
     * Holds the internal access mode; can be 'gui', 'cli' or 'cli_im'
149
     *
150
     * @var string
151
     */
152
    protected $accessMode;
153
154
    /**
155
     * @var QueueRepository
156
     */
157
    protected $queueRepository;
158
159
    /**
160
     * @var ProcessRepository
161
     */
162
    protected $processRepository;
163
164
    /**
165
     * @var ConfigurationRepository
166
     */
167
    protected $configurationRepository;
168
169
    /**
170
     * @var string
171
     */
172
    protected $tableName = 'tx_crawler_queue';
173
174
    /**
175
     * @var QueueExecutor
176
     */
177
    protected $queueExecutor;
178
179
    /**
180
     * @var int
181
     */
182
    protected $maximumUrlsToCompile = 10000;
183
184
    /**
185
     * @var IconFactory
186
     */
187
    protected $iconFactory;
188
189
    /**
190
     * @var string[]
191
     */
192
    private $deprecatedPublicMethods = [
0 ignored issues
show
introduced by
The private property $deprecatedPublicMethods is not used, and could be removed.
Loading history...
193
        'getLogEntriesForSetId' => 'Using crawlerController::getLogEntriesForSetId() is deprecated since 9.0.1 and will be removed in v11.x',
194
        'flushQueue' => 'Using CrawlerController::flushQueue() is deprecated since 9.0.1 and will be removed in v11.x, please use QueueRepository->flushQueue() instead.',
195
        'cleanUpOldQueueEntries' => 'Using CrawlerController::cleanUpOldQueueEntries() is deprecated since 9.0.1 and will be removed in v11.x, please use QueueRepository->cleanUpOldQueueEntries() instead.',
196
    ];
197
198
    /**
199
     * @var BackendUserAuthentication|null
200
     */
201
    private $backendUser;
202
203
    /**
204
     * @var integer
205
     */
206
    private $scheduledTime = 0;
207
208
    /**
209
     * @var integer
210
     */
211
    private $reqMinute = 0;
212
213
    /**
214
     * @var bool
215
     */
216
    private $submitCrawlUrls = false;
217
218
    /**
219
     * @var bool
220
     */
221
    private $downloadCrawlUrls = false;
222
223
    /************************************
224
     *
225
     * Getting URLs based on Page TSconfig
226
     *
227
     ************************************/
228
229 37
    /**
     * Resolves the repositories and helpers this controller works with, sets the
     * path of the process lock file and loads the extension configuration.
     *
     * Settings that are missing from the configuration are treated as 0 so the
     * defaults below apply instead of triggering an undefined-index notice.
     */
    public function __construct()
    {
        $objectManager = GeneralUtility::makeInstance(ObjectManager::class);
        $this->queueRepository = $objectManager->get(QueueRepository::class);
        $this->processRepository = $objectManager->get(ProcessRepository::class);
        $this->configurationRepository = $objectManager->get(ConfigurationRepository::class);
        $this->queueExecutor = $objectManager->get(QueueExecutor::class);
        $this->iconFactory = GeneralUtility::makeInstance(IconFactory::class);

        $this->processFilename = Environment::getVarPath() . '/lock/tx_crawler.proc';

        /** @var ExtensionConfigurationProvider $configurationProvider */
        $configurationProvider = GeneralUtility::makeInstance(ExtensionConfigurationProvider::class);
        $settings = $configurationProvider->getExtensionConfiguration();
        $this->extensionSettings = is_array($settings) ? $settings : [];

        // set defaults:
        if (MathUtility::convertToPositiveInteger($this->extensionSettings['countInARun'] ?? 0) === 0) {
            $this->extensionSettings['countInARun'] = 100;
        }

        $this->extensionSettings['processLimit'] = MathUtility::forceIntegerInRange(
            $this->extensionSettings['processLimit'] ?? 0,
            1,
            99,
            1
        );
        $this->maximumUrlsToCompile = MathUtility::forceIntegerInRange(
            $this->extensionSettings['maxCompileUrls'] ?? 0,
            1,
            1000000000,
            10000
        );
    }
253
254
    /**
     * Returns the current value of the maximumUrlsToCompile property, which the
     * constructor initialises from the 'maxCompileUrls' extension setting.
     */
    public function getMaximumUrlsToCompile(): int
    {
        $limit = $this->maximumUrlsToCompile;

        return $limit;
    }
258
259 1
    /**
     * Overrides the maximumUrlsToCompile property with the given value.
     */
    public function setMaximumUrlsToCompile(int $maximumUrlsToCompile): void
    {
        $this->maximumUrlsToCompile = $maximumUrlsToCompile;
    }
263
264
    /**
     * Returns the access mode stored on this controller.
     *
     * The property documentation lists 'gui', 'cli' and 'cli_im' as possible values.
     *
     * @return string
     */
    public function getAccessMode()
    {
        $currentMode = $this->accessMode;

        return $currentMode;
    }
273
274
    /**
     * Stores the given access mode on this controller.
     *
     * @param string $accessMode
     */
    public function setAccessMode($accessMode): void
    {
        $this->accessMode = $accessMode;
    }
281 1
282 1
    /**
     * Switches the disabled state by creating or removing the process file.
     *
     * When $disabled is truthy an empty file is written to the process filename;
     * otherwise the file is deleted if it exists.
     *
     * @param bool $disabled (optional, defaults to true)
     */
    public function setDisabled($disabled = true): void
    {
        if ($disabled) {
            GeneralUtility::writeFile($this->processFilename, '');

            return;
        }

        if (is_file($this->processFilename)) {
            unlink($this->processFilename);
        }
    }
295
296
    /**
     * Reports the disabled state, which is represented by the existence of the process file.
     *
     * @return bool true if disabled
     */
    public function getDisabled()
    {
        $processFileExists = is_file($this->processFilename);

        return $processFileExists;
    }
305
306
    /**
     * Overrides the path of the process file that setDisabled() and getDisabled() operate on.
     *
     * @param string $filenameWithPath
     */
    public function setProcessFilename($filenameWithPath): void
    {
        $this->processFilename = $filenameWithPath;
    }
313
314
    /**
     * Returns the path of the process file currently configured on this controller.
     *
     * @return string
     */
    public function getProcessFilename()
    {
        $filenameWithPath = $this->processFilename;

        return $filenameWithPath;
    }
321
322
    /**
     * Replaces the extension settings held by this controller with the given array.
     *
     * No defaults are applied here; the array is stored as passed in.
     */
    public function setExtensionSettings(array $extensionSettings): void
    {
        $this->extensionSettings = $extensionSettings;
    }
329
330
    /**
     * Check if the given page should be crawled
     *
     * The checks run in this order and the first one that matches decides the result:
     * hidden page (unless 'crawlHiddenPages' is enabled), built-in doktype block list,
     * doktype lists registered under EXTCONF/crawler/excludeDoktype, and finally the
     * veto hooks registered under EXTCONF/crawler/pageVeto.
     *
     * @param array $pageRow Page record; 'doktype' is read directly, 'hidden' is optional
     * @return false|string false if the page should be crawled (not excluded), otherwise the skip message
     */
    public function checkIfPageShouldBeSkipped(array $pageRow)
    {
        // Hidden pages are skipped unless crawling them is enabled in the extension settings.
        // Both keys are optional, so a missing key counts as "not set".
        if (empty($this->extensionSettings['crawlHiddenPages']) && ! empty($pageRow['hidden'])) {
            return 'Because page is hidden';
        }

        if (GeneralUtility::inList('3,4,199,254,255', $pageRow['doktype'])) {
            return 'Because doktype is not allowed';
        }

        foreach ($GLOBALS['TYPO3_CONF_VARS']['EXTCONF']['crawler']['excludeDoktype'] ?? [] as $key => $doktypeList) {
            if (GeneralUtility::inList($doktypeList, $pageRow['doktype'])) {
                return 'Doktype was excluded by "' . $key . '"';
            }
        }

        // veto hook
        foreach ($GLOBALS['TYPO3_CONF_VARS']['EXTCONF']['crawler']['pageVeto'] ?? [] as $key => $func) {
            $params = [
                'pageRow' => $pageRow,
            ];
            // expects "false" if page is ok and "true" or a skipMessage if this page should _not_ be crawled
            $veto = GeneralUtility::callUserFunction($func, $params, $this);
            if ($veto === false) {
                continue;
            }

            // no need to execute other hooks if a previous one returned a veto
            if (is_string($veto)) {
                return $veto;
            }

            // Hook keys may be integers; cast so htmlspecialchars() accepts them under strict_types.
            return 'Veto from hook "' . htmlspecialchars((string) $key) . '"';
        }

        return false;
    }
388 2
389
    /**
     * Wrapper method for getUrlsForPageId()
     * It returns an array of configurations and no urls!
     *
     * The page is first passed to checkIfPageShouldBeSkipped(); when that returns a
     * message, the message is written to $skipMessage and an empty array is returned.
     *
     * @param array $pageRow Page record with at least dok-type and uid columns.
     * @param string $skipMessage Set to the skip reason, or to '' when the page is not skipped
     * @return array
     * @see getUrlsForPageId()
     */
    public function getUrlsForPageRow(array $pageRow, &$skipMessage = '')
    {
        $message = $this->checkIfPageShouldBeSkipped($pageRow);
        if ($message !== false) {
            $skipMessage = $message;

            return [];
        }

        // getUrlsForPageId() is declared with an int parameter and this file uses strict_types,
        // so a uid that arrives as a string has to be cast explicitly.
        $configurations = $this->getUrlsForPageId((int) $pageRow['uid']);
        $skipMessage = '';

        return $configurations;
    }
411
412
    /**
     * Creates a list of URLs from input array (and submits them to queue if asked for)
     * See Web > Info module script + "indexed_search"'s crawler hook-client using this!
     *
     * @param array $vv Information about URLs from pageRow to crawl.
     * @param array $pageRow Page row
     * @param int $scheduledTime Unix time to schedule indexing to, typically time()
     * @param int $reqMinute Number of requests per minute (creates the interleave between requests); values <= 0 disable the interleave
     * @param bool $submitCrawlUrls If set, submits the URLs to queue
     * @param bool $downloadCrawlUrls If set (and submitcrawlUrls is false) will fill $downloadUrls with entries)
     * @param array $duplicateTrack Array which is passed by reference and contains an id per url to ensure we will not crawl duplicates
     * @param array $downloadUrls Array which will be filled with URLS for download if flag is set.
     * @param array $incomingProcInstructions Array of processing instructions
     * @return string List of URLs (meant for display in backend module)
     */
    public function urlListFromUrlArray(
        array $vv,
        array $pageRow,
        $scheduledTime,
        $reqMinute,
        $submitCrawlUrls,
        $downloadCrawlUrls,
        array &$duplicateTrack,
        array &$downloadUrls,
        array $incomingProcInstructions
    ) {
        if (! is_array($vv['URLs'] ?? null)) {
            return 'ERROR - no URL generated';
        }
        $urlLog = [];
        $pageId = (int) $pageRow['uid'];
        $configurationHash = $this->getConfigurationHash($vv);
        $skipInnerCheck = $this->queueRepository->noUnprocessedQueueEntriesForPageWithConfigurationHashExist($pageId, $configurationHash);

        // Optional sub-configuration values default to an empty string instead of raising notices.
        $subCfg = (array) ($vv['subCfg'] ?? []);
        $procInstrFilter = (string) ($subCfg['procInstrFilter'] ?? '');
        $userGroups = (string) ($subCfg['userGroups'] ?? '');

        // The filter does not depend on the individual URL, so it is evaluated once.
        if (! $this->drawURLs_PIfilter($procInstrFilter, $incomingProcInstructions)) {
            return '';
        }

        // Seconds between two scheduled requests; a rate of zero (or less) would divide by zero,
        // so no interleave is applied in that case.
        $requestsPerMinute = (float) $reqMinute;
        $secondsPerRequest = $requestsPerMinute > 0 ? 60 / $requestsPerMinute : 0;

        foreach ($vv['URLs'] as $urlQuery) {
            $url = (string) $this->getUrlFromPageAndQueryParameters(
                $pageId,
                $urlQuery,
                $subCfg['baseUrl'] ?? null,
                $subCfg['force_ssl'] ?? 0
            );

            // Create key by which to determine unique-ness:
            $uKey = $url . '|' . $userGroups . '|' . $procInstrFilter;

            if (isset($duplicateTrack[$uKey])) {
                // if the url key is registered just display it and do not resubmit it
                $urlLog[] = '<em><span class="text-muted">' . htmlspecialchars($url) . '</span></em>';
            } else {
                // Scheduled time, rounded down to a full minute:
                $schTime = $scheduledTime + round(count($duplicateTrack) * $secondsPerRequest);
                $schTime = intval($schTime / 60) * 60;
                $formattedDate = BackendUtility::datetime($schTime);
                $this->urlList[] = '[' . $formattedDate . '] ' . $url;
                $urlList = '[' . $formattedDate . '] ' . htmlspecialchars($url);

                // Submit for crawling!
                if ($submitCrawlUrls) {
                    $added = $this->addUrl(
                        $pageId,
                        $url,
                        $subCfg,
                        $scheduledTime,
                        $configurationHash,
                        $skipInnerCheck
                    );
                    if ($added === false) {
                        $urlList .= ' (URL already existed)';
                    }
                } elseif ($downloadCrawlUrls) {
                    $downloadUrls[$url] = $url;
                }
                $urlLog[] = $urlList;
            }
            $duplicateTrack[$uKey] = true;
        }

        return implode('<br>', $urlLog);
    }
494 5
495 1
    /**
     * Tells whether at least one incoming processing instruction is contained in
     * the comma-separated list $piString. An empty set of incoming instructions
     * always passes.
     *
     * @param string $piString PI to test
     * @param array $incomingProcInstructions Processing instructions
     * @return boolean
     */
    public function drawURLs_PIfilter($piString, array $incomingProcInstructions)
    {
        if ($incomingProcInstructions === []) {
            return true;
        }

        $isRegistered = false;
        foreach ($incomingProcInstructions as $instruction) {
            if (GeneralUtility::inList($piString, $instruction)) {
                $isRegistered = true;
                break;
            }
        }

        return $isRegistered;
    }
515
516
    /**
     * Loads the page TSconfig for the given page, or for the mount point page when
     * $this->MP is set, and lets the hooks registered under
     * EXTCONF/crawler/getPageTSconfigForId alter it.
     *
     * @param int $id Page id
     */
    public function getPageTSconfigForId($id): array
    {
        if (! $this->MP) {
            $pageTSconfig = BackendUtility::getPagesTSconfig($id);
        } else {
            // $this->MP is declared as bool, but this branch reads the part after the first "-".
            // NOTE(review): presumably MP holds a "<x>-<mountPointId>" string - confirm.
            // The cast keeps explode() valid under strict_types; a missing second part falls back to 0.
            $mountPointParts = explode('-', (string) $this->MP);
            $mountPointId = (int) ($mountPointParts[1] ?? 0);
            $pageTSconfig = BackendUtility::getPagesTSconfig($mountPointId);
        }

        // Call a hook to alter configuration
        $hooks = $GLOBALS['TYPO3_CONF_VARS']['EXTCONF']['crawler']['getPageTSconfigForId'] ?? null;
        if (is_array($hooks)) {
            $params = [
                'pageId' => $id,
                // passed by reference so that hooks can modify the configuration in place
                'pageTSConfig' => &$pageTSconfig,
            ];
            foreach ($hooks as $userFunc) {
                GeneralUtility::callUserFunction($userFunc, $params, $this);
            }
        }

        return $pageTSconfig;
    }
538
539
    /**
     * This method returns an array of configurations.
     * Adds no urls!
     *
     * Configurations are collected from two sources, in this order:
     * 1. the "tx_crawler.crawlerCfg.paramSets." section of the page TSconfig,
     * 2. the tx_crawler_configuration records returned by the configuration repository
     *    for the page (records never overwrite a paramSet with the same key).
     * Finally the hooks registered under EXTCONF/crawler/processUrls may alter the result.
     *
     * @return array Configurations keyed by configuration name
     */
    public function getUrlsForPageId(int $pageId): array
    {
        // Get page TSconfig for page ID
        $pageTSconfig = $this->getPageTSconfigForId($pageId);

        $res = [];

        // Fetch Crawler Configuration from pageTSconfig
        $crawlerCfg = $pageTSconfig['tx_crawler.']['crawlerCfg.']['paramSets.'] ?? [];
        foreach ($crawlerCfg as $key => $values) {
            if (! is_array($values)) {
                continue;
            }
            $key = str_replace('.', '', $key);
            // Sub configuration for a single configuration string:
            $subCfg = (array) $crawlerCfg[$key . '.'];
            $subCfg['key'] = $key;

            if (strcmp($subCfg['procInstrFilter'] ?? '', '')) {
                $subCfg['procInstrFilter'] = implode(',', GeneralUtility::trimExplode(',', $subCfg['procInstrFilter']));
            }

            // 'pidsOnly' is optional; a missing value means the configuration is not page-specific.
            $pidsOnly = (string) ($subCfg['pidsOnly'] ?? '');
            $pidOnlyList = implode(',', GeneralUtility::trimExplode(',', $pidsOnly, true));

            // process configuration if it is not page-specific or if the specific page is the current page:
            // TODO: Check if $pidOnlyList can be kept as Array instead of imploded
            if ($pidsOnly !== '' && ! GeneralUtility::inList($pidOnlyList, strval($pageId))) {
                continue;
            }

            // Explode, process etc.:
            $res[$key] = [];
            $res[$key]['subCfg'] = $subCfg;
            // The parameter string lives under the key without the trailing dot and may be absent.
            $res[$key]['paramParsed'] = GeneralUtility::explodeUrl2Array((string) ($crawlerCfg[$key] ?? ''));
            $res[$key]['paramExpanded'] = $this->expandParameters($res[$key]['paramParsed'], $pageId);
            $res[$key]['origin'] = 'pagets';

            // recognize MP value
            if (! $this->MP) {
                $res[$key]['URLs'] = $this->compileUrls($res[$key]['paramExpanded'], ['?id=' . $pageId]);
            } else {
                $res[$key]['URLs'] = $this->compileUrls($res[$key]['paramExpanded'], ['?id=' . $pageId . '&MP=' . $this->MP]);
            }
        }

        // Get configuration from tx_crawler_configuration records up the rootline
        $crawlerConfigurations = $this->configurationRepository->getCrawlerConfigurationRecordsFromRootLine($pageId);
        foreach ($crawlerConfigurations as $configurationRecord) {
            // check access to the configuration record
            $hasAccess = empty($configurationRecord['begroups'])
                || $this->getBackendUser()->isAdmin()
                || $this->hasGroupAccess($this->getBackendUser()->user['usergroup_cached_list'], $configurationRecord['begroups']);
            if (! $hasAccess) {
                continue;
            }

            // Cast so a non-string value cannot break the comparison under strict_types.
            $pidsOnly = (string) ($configurationRecord['pidsonly'] ?? '');
            $pidOnlyList = implode(',', GeneralUtility::trimExplode(',', $pidsOnly, true));

            // process configuration if it is not page-specific or if the specific page is the current page:
            // TODO: Check if $pidOnlyList can be kept as Array instead of imploded
            if ($pidsOnly !== '' && ! GeneralUtility::inList($pidOnlyList, strval($pageId))) {
                continue;
            }

            $key = $configurationRecord['name'];

            // don't overwrite previously defined paramSets
            if (isset($res[$key])) {
                continue;
            }

            /* @var $TSparserObject \TYPO3\CMS\Core\TypoScript\Parser\TypoScriptParser */
            $TSparserObject = GeneralUtility::makeInstance(TypoScriptParser::class);
            $TSparserObject->parse($configurationRecord['processing_instruction_parameters_ts']);

            $subCfg = [
                'procInstrFilter' => $configurationRecord['processing_instruction_filter'],
                'procInstrParams.' => $TSparserObject->setup,
                'baseUrl' => $configurationRecord['base_url'],
                'force_ssl' => (int) $configurationRecord['force_ssl'],
                'userGroups' => $configurationRecord['fegroups'],
                'exclude' => $configurationRecord['exclude'],
                'key' => $key,
            ];

            if (in_array($pageId, $this->expandExcludeString($subCfg['exclude']), true)) {
                continue;
            }

            $res[$key] = [];
            $res[$key]['subCfg'] = $subCfg;
            $res[$key]['paramParsed'] = GeneralUtility::explodeUrl2Array($configurationRecord['configuration']);
            $res[$key]['paramExpanded'] = $this->expandParameters($res[$key]['paramParsed'], $pageId);
            $res[$key]['URLs'] = $this->compileUrls($res[$key]['paramExpanded'], ['?id=' . $pageId]);
            $res[$key]['origin'] = 'tx_crawler_configuration_' . $configurationRecord['uid'];
        }

        foreach ($GLOBALS['TYPO3_CONF_VARS']['EXTCONF']['crawler']['processUrls'] ?? [] as $func) {
            $params = [
                // passed by reference so that hooks can modify the result in place
                'res' => &$res,
            ];
            GeneralUtility::callUserFunction($func, $params, $this);
        }

        return $res;
    }
637 1
638
    /**
     * Collects the names of all crawler configurations that apply to a page branch.
     *
     * Names come from the paramSets in the page TSconfig of $rootid and from
     * tx_crawler_configuration records whose pid is either on the rootline of
     * $rootid or inside the page tree below it (down to $depth levels).
     *
     * TODO: Write Functional Tests
     */
    public function getConfigurationsForBranch(int $rootid, int $depth): array
    {
        $configurationNames = [];

        // paramSets defined in page TSconfig; the trailing dot of the TypoScript key is dropped
        $tsConfig = $this->getPageTSconfigForId($rootid);
        $paramSets = $tsConfig['tx_crawler.']['crawlerCfg.']['paramSets.'] ?? [];
        foreach ($paramSets as $setKey => $setValue) {
            if (is_array($setValue)) {
                $configurationNames[] = substr($setKey, -1) === '.' ? substr($setKey, 0, -1) : $setKey;
            }
        }

        // page ids on the rootline of the root page ...
        $pageIds = [];
        foreach (BackendUtility::BEgetRootLine($rootid) as $rootLinePage) {
            $pageIds[] = $rootLinePage['uid'];
        }

        // ... plus the page ids of the tree below it, restricted to pages the backend user may see
        /* @var PageTreeView $pageTree */
        $pageTree = GeneralUtility::makeInstance(PageTreeView::class);
        $permissionClause = $this->getBackendUser()->getPagePermsClause(Permission::PAGE_SHOW);
        $pageTree->init(empty($permissionClause) ? '' : ('AND ' . $permissionClause));
        $pageTree->getTree($rootid, $depth, '');
        foreach ($pageTree->tree as $treeNode) {
            $pageIds[] = $treeNode['row']['uid'];
        }

        $queryBuilder = $this->getQueryBuilder('tx_crawler_configuration');
        $statement = $queryBuilder
            ->select('name')
            ->from('tx_crawler_configuration')
            ->where(
                $queryBuilder->expr()->in(
                    'pid',
                    $queryBuilder->createNamedParameter($pageIds, Connection::PARAM_INT_ARRAY)
                )
            )
            ->execute();

        while ($record = $statement->fetch()) {
            $configurationNames[] = $record['name'];
        }

        return $configurationNames;
    }
681 3
682
    /**
     * Decides whether a user's groups grant access to an item.
     * (e.g. get the group list of the current logged in user from $GLOBALS['TSFE']->gr_list)
     *
     * @param string $groupList Comma-separated list of (fe_)group UIDs from a user
     * @param string $accessList Comma-separated list of (fe_)group UIDs of the item to access
     * @return bool TRUE if the access list is empty or contains at least one of the user's group UIDs
     * @see \TYPO3\CMS\Frontend\Page\PageRepository::getMultipleGroupsWhereClause()
     */
    public function hasGroupAccess($groupList, $accessList)
    {
        // an item without access restriction is available to everyone
        if (empty($accessList)) {
            return true;
        }

        $userGroupUids = GeneralUtility::intExplode(',', $groupList);
        $accessGranted = false;
        foreach ($userGroupUids as $userGroupUid) {
            if (GeneralUtility::inList($accessList, $userGroupUid)) {
                $accessGranted = true;
                break;
            }
        }

        return $accessGranted;
    }
703
704
    /**
     * Expands the parameter configuration into the individual values of each parameter.
     *
     * Syntax of values:
     * - If the value is wrapped in [...] it is expanded according to the rules below, otherwise it is taken literally
     * - The configuration is split by "|"; the parts are processed individually and their values are added together
     * - For each configuration part:
     *         - "[int]-[int]" = Integer range, expanded to all values in between (bounds included), from low to high
     *           (max. 1000 values). Example "1-34" or "-40--30"
     *         - "_TABLE:[TCA table name];[_PID:[optional page id, default is current page]];[_ENABLELANG:1]" = Look up
     *           of table records from PID, filtering out deleted records. Example "_TABLE:tt_content; _PID:123"
     *           _ENABLELANG:1 picks only original records without their language overlays.
     *           Further optional sub-parameters read here: _RECURSIVE (tree depth below the PID), _PIDFIELD
     *           (default "pid"), _FIELD (default "uid"), _WHERE and _ADDTABLE.
     *         - Default: Literal value
     * - After every part the functions registered in
     *   $GLOBALS['TYPO3_CONF_VARS']['SC_OPTIONS']['crawler/class.tx_crawler_lib.php']['expandParameters'] are called,
     *   so that extensions can process their own placeholders.
     *
     * All optional sub-parameters and TCA/hook settings are read defensively, so a missing key no longer raises an
     * "undefined index/array key" notice or warning.
     *
     * @param array $paramArray Array with key (GET var name) and values (value of GET var which is configuration for expansion)
     * @param int $pid Current page ID
     * @return array The input array where every value has been replaced by the list of its expanded values
     *
     * TODO: Write Functional Tests
     */
    public function expandParameters($paramArray, $pid)
    {
        // Traverse parameter names:
        foreach ($paramArray as $p => $v) {
            $v = trim($v);

            // Only values encapsulated in square brackets describe ranges/look-ups; everything else is literal.
            if (strpos($v, '[') !== 0 || substr($v, -1) !== ']') {
                // Set the literal value as only value in array:
                $paramArray[$p] = [$v];
                continue;
            }

            // Take the value inside the brackets and reset the paramArray value as an array.
            $v = substr($v, 1, -1);
            $paramArray[$p] = [];

            // Explode parts and traverse them:
            $parts = explode('|', $v);
            foreach ($parts as $pV) {
                // Look for integer range: (fx. 1-34 or -40--30 // reads minus 40 to minus 30)
                if (preg_match('/^(-?[0-9]+)\s*-\s*(-?[0-9]+)$/', trim($pV), $reg)) {
                    $reg = $this->swapIfFirstIsLargerThanSecond($reg);

                    // Traverse range, add values:
                    $runAwayBrake = 1000; // Limit to size of range!
                    for ($a = $reg[1]; $a <= $reg[2]; $a++) {
                        $paramArray[$p][] = $a;
                        $runAwayBrake--;
                        if ($runAwayBrake <= 0) {
                            break;
                        }
                    }
                } elseif (strpos(trim($pV), '_TABLE:') === 0) {
                    // Parse the ";"-separated "key:value" sub-parameters:
                    $subparts = GeneralUtility::trimExplode(';', $pV);
                    $subpartParams = [];
                    foreach ($subparts as $spV) {
                        // array_pad() keeps the value NULL for a sub-part without ":" instead of reading a missing offset.
                        [$pKey, $pVal] = array_pad(GeneralUtility::trimExplode(':', $spV), 2, null);
                        $subpartParams[$pKey] = $pVal;
                    }

                    // Table exists:
                    if (isset($GLOBALS['TCA'][$subpartParams['_TABLE']])) {
                        $lookUpPid = isset($subpartParams['_PID']) ? intval($subpartParams['_PID']) : intval($pid);
                        $recursiveDepth = isset($subpartParams['_RECURSIVE']) ? intval($subpartParams['_RECURSIVE']) : 0;
                        $pidField = isset($subpartParams['_PIDFIELD']) ? trim($subpartParams['_PIDFIELD']) : 'pid';
                        $where = $subpartParams['_WHERE'] ?? '';
                        $addTable = $subpartParams['_ADDTABLE'] ?? '';

                        // Same truthiness rule as before (empty or missing => "uid"), but without reading a missing key.
                        $fieldName = ! empty($subpartParams['_FIELD']) ? $subpartParams['_FIELD'] : 'uid';
                        if ($fieldName === 'uid' || ! empty($GLOBALS['TCA'][$subpartParams['_TABLE']]['columns'][$fieldName])) {
                            $queryBuilder = $this->getQueryBuilder($subpartParams['_TABLE']);

                            if ($recursiveDepth > 0) {
                                /** @var \TYPO3\CMS\Core\Database\QueryGenerator $queryGenerator */
                                $queryGenerator = GeneralUtility::makeInstance(\TYPO3\CMS\Core\Database\QueryGenerator::class);
                                $pidList = $queryGenerator->getTreeList($lookUpPid, $recursiveDepth, 0, 1);
                                $pidArray = GeneralUtility::intExplode(',', $pidList);
                            } else {
                                $pidArray = [(string) $lookUpPid];
                            }

                            // Only deleted records are filtered out; all other default restrictions are dropped.
                            $queryBuilder->getRestrictions()
                                ->removeAll()
                                ->add(GeneralUtility::makeInstance(DeletedRestriction::class));

                            $queryBuilder
                                ->select($fieldName)
                                ->from($subpartParams['_TABLE'])
                                ->where(
                                    $queryBuilder->expr()->in($pidField, $queryBuilder->createNamedParameter($pidArray, Connection::PARAM_INT_ARRAY)),
                                    $where
                                );

                            if (! empty($addTable)) {
                                // TODO: Check if this works as intended!
                                $queryBuilder->add('from', $addTable);
                            }

                            // Not every table is translatable, so the ctrl setting may be absent.
                            $transOrigPointerField = $GLOBALS['TCA'][$subpartParams['_TABLE']]['ctrl']['transOrigPointerField'] ?? null;

                            if (! empty($subpartParams['_ENABLELANG']) && $transOrigPointerField) {
                                // Records in the original language have no translation parent (pointer <= 0).
                                $queryBuilder->andWhere(
                                    $queryBuilder->expr()->lte(
                                        $transOrigPointerField,
                                        0
                                    )
                                );
                            }

                            $statement = $queryBuilder->execute();

                            // Index by field value so that duplicate values collapse into one entry.
                            $rows = [];
                            while ($row = $statement->fetch()) {
                                $rows[$row[$fieldName]] = $row;
                            }

                            $paramArray[$p] = array_merge($paramArray[$p], array_keys($rows));
                        }
                    }
                } else {
                    // Just add value:
                    $paramArray[$p][] = $pV;
                }

                // Hook for processing own expandParameters place holder
                $expandParametersHooks = $GLOBALS['TYPO3_CONF_VARS']['SC_OPTIONS']['crawler/class.tx_crawler_lib.php']['expandParameters'] ?? null;
                if (is_array($expandParametersHooks)) {
                    $_params = [
                        'pObj' => &$this,
                        'paramArray' => &$paramArray,
                        'currentKey' => $p,
                        'currentValue' => $pV,
                        'pid' => $pid,
                    ];
                    foreach ($expandParametersHooks as $_funcRef) {
                        GeneralUtility::callUserFunction($_funcRef, $_params, $this);
                    }
                }
            }

            // Make unique set of values.
            $paramArray[$p] = array_unique($paramArray[$p]);
            // NOTE: this sorts the whole parameter array by parameter name (not the value list of $p); kept as is
            // because the order of the returned parameters depends on it.
            ksort($paramArray);
        }

        return $paramArray;
    }
849
850 3
    /**
     * Compiling URLs from parameter array (output of expandParameters())
     * The number of URLs will be the multiplication of the number of parameter values for each key, but every
     * parameter pass is capped at $this->maximumUrlsToCompile URLs.
     *
     * Parameters are processed in array order. A value that is an empty string leaves the URL unchanged, any other
     * value is appended as "&name=value" (both raw-URL-encoded).
     *
     * The parameters are iterated instead of being shifted off the array recursively: array_shift() renumbers
     * numeric keys, which changed numeric GET variable names, and integer keys were passed uncast to
     * rawurlencode(), which fails under strict types.
     *
     * @param array $paramArray Output of expandParameters(): Array with keys (GET var names) and for each an array of values
     * @param array $urls URLs accumulated in this array (the base URLs the parameters are appended to)
     * @return array
     */
    public function compileUrls($paramArray, array $urls)
    {
        if (empty($paramArray)) {
            return $urls;
        }

        foreach ($paramArray as $varName => $valueSet) {
            // PHP turns numeric string keys into integers; cast back before encoding.
            $encodedVarName = rawurlencode((string) $varName);

            // Combine every URL compiled so far with every value of the current parameter:
            $newUrls = [];
            foreach ($urls as $url) {
                foreach ($valueSet as $val) {
                    if (count($newUrls) >= $this->maximumUrlsToCompile) {
                        // Limit reached: nothing more can be added in this pass.
                        break 2;
                    }

                    $value = (string) $val;
                    $newUrls[] = $value !== ''
                        ? $url . '&' . $encodedVarName . '=' . rawurlencode($value)
                        : $url;
                }
            }

            $urls = $newUrls;
        }

        return $urls;
    }
879
880
    /************************************
881
     *
882
     * Crawler log
883
     *
884
     ************************************/
885
886
    /**
     * Return array of records from crawler queue for input page ID
     *
     * The records are ordered by their scheduled time, newest first.
     *
     * @param int $id Page ID for which to look up log entries.
     * @param string $filter Filter: "pending" => all that is not yet run (exec_time = 0), "finished" => all complete ones (exec_time > 0), anything else => all entries
     * @param bool $doFlush If TRUE, the queue is flushed through the queue repository using $filter before the
     *                      entries are selected. NOTE(review): the flush call visible here is not limited to the
     *                      page ID - confirm against QueueRepository::flushQueue().
     * @param bool $doFullFlush If TRUE (together with $doFlush), the queue is flushed with the filter "all"
     * @param int $itemsPerPage Limit the amount of entries per page default is 10; 0 or less means no limit
     * @return array
     */
    public function getLogEntriesForPageId($id, $filter = '', $doFlush = false, $doFullFlush = false, $itemsPerPage = 10)
    {
        $queryBuilder = GeneralUtility::makeInstance(ConnectionPool::class)->getQueryBuilderForTable($this->tableName);
        $queryBuilder
            ->select('*')
            ->from($this->tableName)
            ->where(
                $queryBuilder->expr()->eq('page_id', $queryBuilder->createNamedParameter($id, \PDO::PARAM_INT))
            )
            ->orderBy('scheduled', 'DESC');

        // Narrow the selection down by execution state; any other filter value selects all entries.
        switch ($filter) {
            case 'pending':
                $queryBuilder->andWhere($queryBuilder->expr()->eq('exec_time', 0));
                break;
            case 'finished':
                $queryBuilder->andWhere($queryBuilder->expr()->gt('exec_time', 0));
                break;
        }

        if ($doFlush) {
            $this->queueRepository->flushQueue($doFullFlush ? 'all' : $filter);
        }

        if ($itemsPerPage > 0) {
            $queryBuilder
                ->setMaxResults((int) $itemsPerPage);
        }

        return $queryBuilder->execute()->fetchAll();
    }
937
938
    /**
     * Return array of records from crawler queue for input set ID
     *
     * Without $doFlush the matching entries are returned, newest scheduled entry first. With $doFlush nothing is
     * selected: the queue is flushed instead and an empty array is returned.
     *
     * @param int $set_id Set ID for which to look up log entries.
     * @param string $filter Filter: "pending" => all that is not yet run, "finished" => all complete ones, anything else => all entries
     * @param bool $doFlush If TRUE, the entries are DELETED(!) instead of selected
     * @param bool $doFullFlush If TRUE (together with $doFlush), the flush is done without any condition
     * @param int $itemsPerPage Limit the amount of entries per page default is 10; 0 or less means no limit
     * @return array
     *
     * @deprecated
     */
    public function getLogEntriesForSetId(int $set_id, string $filter = '', bool $doFlush = false, bool $doFullFlush = false, int $itemsPerPage = 10)
    {
        $queryBuilder = GeneralUtility::makeInstance(ConnectionPool::class)->getQueryBuilderForTable($this->tableName);
        $setIdConstraint = $queryBuilder->expr()->eq('set_id', $queryBuilder->createNamedParameter($set_id, \PDO::PARAM_INT));
        $queryBuilder
            ->select('*')
            ->from($this->tableName)
            ->where($setIdConstraint)
            ->orderBy('scheduled', 'DESC');

        // The flush condition is collected in an AND-composite, parallel to the constraints of the select query.
        $expressionBuilder = GeneralUtility::makeInstance(ConnectionPool::class)
            ->getConnectionForTable($this->tableName)
            ->getExpressionBuilder();
        $flushCondition = $expressionBuilder->andX();

        if ($filter === 'pending') {
            $queryBuilder->andWhere($queryBuilder->expr()->eq('exec_time', 0));
            $flushCondition->add($expressionBuilder->eq('exec_time', 0));
        } elseif ($filter === 'finished') {
            $queryBuilder->andWhere($queryBuilder->expr()->gt('exec_time', 0));
            $flushCondition->add($expressionBuilder->gt('exec_time', 0));
        }

        if ($doFlush) {
            $addWhere = $flushCondition->add($expressionBuilder->eq('set_id', (int) $set_id));
            // A full flush ignores the collected condition.
            $this->flushQueue($doFullFlush ? '' : $addWhere);

            return [];
        }

        if ($itemsPerPage > 0) {
            $queryBuilder->setMaxResults((int) $itemsPerPage);
        }

        return $queryBuilder->execute()->fetchAll();
    }
990
991
    /**
     * Adding call back entries to log (called from hooks typically, see indexed search class "class.crawler.php")
     *
     * The call back reference is stored in the parameters under the key "_CALLBACKOBJ" and the entry is inserted
     * into the crawler queue as not yet executed.
     *
     * @param int $setId Set ID
     * @param array $params Parameters to pass to call back function (anything that is not an array is replaced by an empty array)
     * @param string $callBack Call back object reference, eg. 'EXT:indexed_search/class.crawler.php:&tx_indexedsearch_crawler'
     * @param int $page_id Page ID to attach it to
     * @param int $schedule Time at which to activate; 0 means "now"
     */
    public function addQueueEntry_callBack($setId, $params, $callBack, $page_id = 0, $schedule = 0): void
    {
        $callBackParameters = is_array($params) ? $params : [];
        $callBackParameters['_CALLBACKOBJ'] = $callBack;

        // Fall back to the current time if no schedule time was given.
        $scheduledTime = (int) $schedule;
        if ($scheduledTime === 0) {
            $scheduledTime = $this->getCurrentTime();
        }

        $queueRecord = [
            'page_id' => (int) $page_id,
            'parameters' => json_encode($callBackParameters),
            'scheduled' => $scheduledTime,
            'exec_time' => 0,
            'set_id' => (int) $setId,
            'result_data' => '',
        ];

        $connection = GeneralUtility::makeInstance(ConnectionPool::class)->getConnectionForTable('tx_crawler_queue');
        $connection->insert('tx_crawler_queue', $queueRecord);
    }
1020
1021
    /************************************
1022
     *
1023
     * URL setting
1024
     *
1025
     ************************************/
1026
1027
    /**
     * Setting a URL for crawling:
     *
     * Builds the queue record for the URL and either registers it internally only
     * ($this->registerQueueEntriesInternallyOnly) or inserts it into the crawler queue, unless an equal entry is
     * found by the duplication check. A signal is emitted for an added URL as well as for a detected duplicate.
     *
     * The sub configuration keys "userGroups", "procInstrFilter", "procInstrParams." and "key" are treated as
     * optional; a missing key no longer raises an "undefined index/array key" notice or warning.
     *
     * @param int $id Page ID
     * @param string $url Complete URL
     * @param array $subCfg Sub configuration array (from TS config)
     * @param int $tstamp Scheduled-time
     * @param string $configurationHash (optional) configuration hash
     * @param bool $skipInnerDuplicationCheck (optional) skip inner duplication check
     * @return bool TRUE if the URL was inserted into the queue table, FALSE if it was only registered internally or is a duplicate
     */
    public function addUrl(
        $id,
        $url,
        array $subCfg,
        $tstamp,
        $configurationHash = '',
        $skipInnerDuplicationCheck = false
    ) {
        $urlAdded = false;
        $rows = [];

        // Creating parameters:
        $parameters = [
            'url' => $url,
        ];

        // fe user group simulation:
        $uGs = implode(',', array_unique(GeneralUtility::intExplode(',', $subCfg['userGroups'] ?? '', true)));
        if ($uGs) {
            $parameters['feUserGroupList'] = $uGs;
        }

        // Setting processing instructions
        $parameters['procInstructions'] = GeneralUtility::trimExplode(',', $subCfg['procInstrFilter'] ?? '');
        if (is_array($subCfg['procInstrParams.'] ?? null)) {
            $parameters['procInstrParams'] = $subCfg['procInstrParams.'];
        }

        // Compile value array:
        $parameters_serialized = json_encode($parameters);
        $fieldArray = [
            'page_id' => (int) $id,
            'parameters' => $parameters_serialized,
            'parameters_hash' => GeneralUtility::shortMD5($parameters_serialized),
            'configuration_hash' => $configurationHash,
            'scheduled' => $tstamp,
            'exec_time' => 0,
            'set_id' => (int) $this->setID,
            'result_data' => '',
            'configuration' => $subCfg['key'] ?? null,
        ];

        if ($this->registerQueueEntriesInternallyOnly) {
            //the entries will only be registered and not stored to the database
            $this->queueEntries[] = $fieldArray;

            return $urlAdded;
        }

        if (! $skipInnerDuplicationCheck) {
            // check if there is already an equal entry
            $rows = $this->getDuplicateRowsIfExist($tstamp, $fieldArray);
        }

        if (empty($rows)) {
            $connectionForCrawlerQueue = GeneralUtility::makeInstance(ConnectionPool::class)->getConnectionForTable('tx_crawler_queue');
            $connectionForCrawlerQueue->insert(
                'tx_crawler_queue',
                $fieldArray
            );
            $uid = $connectionForCrawlerQueue->lastInsertId('tx_crawler_queue', 'qid');
            $urlAdded = true;

            $signalPayload = ['uid' => $uid, 'fieldArray' => $fieldArray];
            SignalSlotUtility::emitSignal(
                self::class,
                SignalSlotUtility::SIGNAL_URL_ADDED_TO_QUEUE,
                $signalPayload
            );
        } else {
            $signalPayload = ['rows' => $rows, 'fieldArray' => $fieldArray];
            SignalSlotUtility::emitSignal(
                self::class,
                SignalSlotUtility::SIGNAL_DUPLICATE_URL_IN_QUEUE,
                $signalPayload
            );
        }

        return $urlAdded;
    }
1117
1118
    /**
     * Returns the current system time as Unix timestamp.
     *
     * Other methods of this class obtain "now" through this method instead of calling time() directly.
     *
     * @return int
     */
    public function getCurrentTime()
    {
        $currentTimestamp = time();

        return $currentTimestamp;
    }
1127
1128
    /************************************
1129
     *
1130
     * URL reading
1131
     *
1132
     ************************************/
1133
1134
    /**
     * Read URL for single queue entry
     *
     * Loads the queue entry, sets its exec_time (which locks the record), lets the queue executor process it and
     * stores the JSON encoded result in the entry. Signals are emitted before processing and before the result is
     * written.
     *
     * @param int $queueId
     * @param bool $force If set, will process even if exec_time has been set!
     * @return int|null Bit mask, self::CLI_STATUS_POLLABLE_PROCESSED is set if a registered "pollSuccess"
     *                  instruction reported success, otherwise 0. NULL (kept for backwards compatibility) if no
     *                  matching queue entry was found.
     */
    public function readUrl($queueId, $force = false)
    {
        $queryBuilder = GeneralUtility::makeInstance(ConnectionPool::class)->getQueryBuilderForTable($this->tableName);
        $ret = 0;
        $this->logger->debug('crawler-readurl start ' . microtime(true));
        // Get entry:
        $queryBuilder
            ->select('*')
            ->from('tx_crawler_queue')
            ->where(
                $queryBuilder->expr()->eq('qid', $queryBuilder->createNamedParameter($queueId, \PDO::PARAM_INT))
            );
        if (! $force) {
            // Only entries that were not executed yet and are assigned to a process.
            $queryBuilder
                ->andWhere('exec_time = 0')
                ->andWhere('process_scheduled > 0');
        }
        $queueRec = $queryBuilder->execute()->fetch();

        if (! is_array($queueRec)) {
            return;
        }

        SignalSlotUtility::emitSignal(
            self::class,
            SignalSlotUtility::SIGNAL_QUEUEITEM_PREPROCESS,
            [$queueId, &$queueRec]
        );

        // Set exec_time to lock record:
        $field_array = ['exec_time' => $this->getCurrentTime()];

        if (isset($this->processID)) {
            //if mulitprocessing is used we need to store the id of the process which has handled this entry
            $field_array['process_id_completed'] = $this->processID;
        }

        GeneralUtility::makeInstance(ConnectionPool::class)->getConnectionForTable('tx_crawler_queue')
            ->update(
                'tx_crawler_queue',
                $field_array,
                ['qid' => (int) $queueId]
            );

        $result = $this->queueExecutor->executeQueueItem($queueRec, $this);
        if ($result['content'] === null) {
            $resultData = 'An errors happened';
        } else {
            /** @var JsonCompatibilityConverter $jsonCompatibilityConverter */
            $jsonCompatibilityConverter = GeneralUtility::makeInstance(JsonCompatibilityConverter::class);
            $resultData = $jsonCompatibilityConverter->convert($result['content']);
        }

        //atm there's no need to point to specific pollable extensions
        $pollSuccessInstructions = $GLOBALS['TYPO3_CONF_VARS']['EXTCONF']['crawler']['pollSuccess'] ?? null;
        // $resultData is only inspected if it is an array: in the error case it is a plain string, and indexing a
        // string with string offsets raises a warning (PHP 7) or an error (PHP 8).
        if (is_array($pollSuccessInstructions) && is_array($resultData)) {
            $procInstructions = $resultData['parameters']['procInstructions'] ?? null;
            foreach ($pollSuccessInstructions as $pollable) {
                // only check the success value if the instruction is runnig
                // it is important to name the pollSuccess key same as the procInstructions key
                if (is_array($procInstructions)
                    && in_array($pollable, $procInstructions, true)
                    && ! empty($resultData['success'][$pollable])
                ) {
                    $ret |= self::CLI_STATUS_POLLABLE_PROCESSED;
                }
            }
        }

        // Set result in log which also denotes the end of the processing of this entry.
        $field_array = ['result_data' => json_encode($result)];

        SignalSlotUtility::emitSignal(
            self::class,
            SignalSlotUtility::SIGNAL_QUEUEITEM_POSTPROCESS,
            [$queueId, &$field_array]
        );

        GeneralUtility::makeInstance(ConnectionPool::class)->getConnectionForTable('tx_crawler_queue')
            ->update(
                'tx_crawler_queue',
                $field_array,
                ['qid' => (int) $queueId]
            );

        $this->logger->debug('crawler-readurl stop ' . microtime(true));
        return $ret;
    }
1231
1232
    /**
     * Read URL for not-yet-inserted log-entry
     *
     * The entry is inserted into the queue table with exec_time set (which locks the record), processed by the
     * queue executor and finally updated with the JSON encoded result.
     *
     * @param array $field_array Queue field array,
     *
     * @return array|bool|string The unmodified result of the queue executor. NOTE(review): previously documented
     *                           as string, static analysis reports array|bool - confirm against the executor.
     */
    public function readUrlFromArray($field_array)
    {
        $queueConnection = GeneralUtility::makeInstance(ConnectionPool::class)->getConnectionForTable($this->tableName);

        // Set exec_time to lock record:
        $field_array['exec_time'] = $this->getCurrentTime();
        $queueConnection->insert($this->tableName, $field_array);

        // The executor gets the record including its new queue ID.
        $queueId = $queueConnection->lastInsertId($this->tableName, 'qid');
        $field_array['qid'] = $queueId;

        $result = $this->queueExecutor->executeQueueItem($field_array, $this);

        // Set result in log which also denotes the end of the processing of this entry.
        $resultFields = ['result_data' => json_encode($result)];

        // Slots may adjust the fields that are written back.
        SignalSlotUtility::emitSignal(
            self::class,
            SignalSlotUtility::SIGNAL_QUEUEITEM_POSTPROCESS,
            [$queueId, &$resultFields]
        );

        $queueConnection->update($this->tableName, $resultFields, ['qid' => $queueId]);

        return $result;
    }
1269
1270
    /*****************************
1271
     *
1272
     * Compiling URLs to crawl - tools
1273
     *
1274
     *****************************/
1275
1276
    /**
     * Traverses the page tree below (and including) the given page and collects the output of
     * drawURLs_addRowsForPage() for every page. Pages of type "mount point" additionally contribute the pages of
     * the mounted tree.
     *
     * The crawl settings passed in are stored in the object before the traversal, and $this->duplicateTrack and
     * $this->downloadUrls are reset.
     *
     * The doktype of a page row is cast to integer before it is compared with the mount point constant, because a
     * database driver may deliver numeric columns as strings, in which case the strict comparison never matched.
     *
     * @param int $id Root page id to start from.
     * @param int $depth Depth of tree, 0=only id-page, 1= on sublevel, 99 = infinite
     * @param int $scheduledTime Unix Time when the URL is timed to be visited when put in queue
     * @param int $reqMinute Number of requests per minute (creates the interleave between requests)
     * @param bool $submitCrawlUrls If set, submits the URLs to queue in database (real crawling)
     * @param bool $downloadCrawlUrls If set (and submitcrawlUrls is false) will fill $downloadUrls with entries)
     * @param array $incomingProcInstructions Array of processing instructions
     * @param array $configurationSelection Array of configuration keys
     * @return string Concatenated output of drawURLs_addRowsForPage() for all traversed pages
     */
    public function getPageTreeAndUrls(
        $id,
        $depth,
        $scheduledTime,
        $reqMinute,
        $submitCrawlUrls,
        $downloadCrawlUrls,
        array $incomingProcInstructions,
        array $configurationSelection
    ) {
        $this->scheduledTime = $scheduledTime;
        $this->reqMinute = $reqMinute;
        $this->submitCrawlUrls = $submitCrawlUrls;
        $this->downloadCrawlUrls = $downloadCrawlUrls;
        $this->incomingProcInstructions = $incomingProcInstructions;
        $this->incomingConfigurationSelection = $configurationSelection;

        $this->duplicateTrack = [];
        $this->downloadUrls = [];

        // Drawing tree:
        /* @var PageTreeView $tree */
        $tree = GeneralUtility::makeInstance(PageTreeView::class);
        $perms_clause = $this->getBackendUser()->getPagePermsClause(Permission::PAGE_SHOW);
        $tree->init('AND ' . $perms_clause);

        $pageInfo = BackendUtility::readPageAccess($id, $perms_clause);
        if (is_array($pageInfo)) {
            // Set root row:
            $tree->tree[] = [
                'row' => $pageInfo,
                'HTML' => $this->iconFactory->getIconForRecord('pages', $pageInfo, Icon::SIZE_SMALL),
            ];
        }

        // Get branch beneath:
        if ($depth) {
            $tree->getTree($id, $depth, '');
        }

        // Traverse page tree:
        $code = '';

        foreach ($tree->tree as $data) {
            $this->MP = false;

            // recognize mount points
            if ((int) $data['row']['doktype'] === PageRepository::DOKTYPE_MOUNTPOINT) {
                $queryBuilder = GeneralUtility::makeInstance(ConnectionPool::class)->getQueryBuilderForTable('pages');
                $queryBuilder->getRestrictions()->removeAll()->add(GeneralUtility::makeInstance(DeletedRestriction::class));
                $mountpage = $queryBuilder
                    ->select('*')
                    ->from('pages')
                    ->where(
                        $queryBuilder->expr()->eq('uid', $queryBuilder->createNamedParameter($data['row']['uid'], \PDO::PARAM_INT))
                    )
                    ->execute()
                    ->fetchAll();
                $queryBuilder->resetRestrictions();

                // fetch mounted pages
                // MP has the form "<mounted page uid>-<mount point page uid>"
                $this->MP = $mountpage[0]['mount_pid'] . '-' . $data['row']['uid'];

                $mountTree = GeneralUtility::makeInstance(PageTreeView::class);
                $mountTree->init('AND ' . $perms_clause);
                $mountTree->getTree($mountpage[0]['mount_pid'], $depth);

                foreach ($mountTree->tree as $mountData) {
                    $code .= $this->drawURLs_addRowsForPage(
                        $mountData['row'],
                        $mountData['HTML'] . BackendUtility::getRecordTitle('pages', $mountData['row'], true)
                    );
                }

                // replace page when mount_pid_ol is enabled
                if ($mountpage[0]['mount_pid_ol']) {
                    $data['row']['uid'] = $mountpage[0]['mount_pid'];
                } else {
                    // if the mount_pid_ol is not set the MP must not be used for the mountpoint page
                    $this->MP = false;
                }
            }

            $code .= $this->drawURLs_addRowsForPage(
                $data['row'],
                $data['HTML'] . BackendUtility::getRecordTitle('pages', $data['row'], true)
            );
        }

        return $code;
    }
1378
1379
    /**
     * Expands an exclude string into the list of page uids it covers.
     *
     * The exclude string is a comma separated list. Each entry is either
     * "<pid>" (that page only), "<pid>+<depth>" (the page plus its subpages
     * down to <depth> levels) or "<pid>+" (the page plus subpages, depth 99).
     *
     * Results are memoised per exclude string and the resolved trees per
     * pid/depth in function-static caches. An empty result is not treated as
     * cached (the check uses empty()) and is recomputed on the next call.
     *
     * @param string $excludeString Exclude string
     * @return array Unique page uids, normalised to integers (keys are not re-indexed)
     */
    public function expandExcludeString($excludeString)
    {
        // internal static caches
        static $expandedExcludeStringCache;
        static $treeCache;

        if (empty($expandedExcludeStringCache[$excludeString])) {
            $pidList = [];

            if (! empty($excludeString)) {
                /** @var PageTreeView $tree */
                $tree = GeneralUtility::makeInstance(PageTreeView::class);
                $tree->init('AND ' . $this->getBackendUser()->getPagePermsClause(Permission::PAGE_SHOW));

                $excludeParts = GeneralUtility::trimExplode(',', $excludeString);

                foreach ($excludeParts as $excludePart) {
                    // Pad to two elements: an entry without "+" yields only the pid,
                    // which would otherwise raise an undefined offset for $depth.
                    [$pid, $depth] = array_pad(GeneralUtility::trimExplode('+', $excludePart), 2, '');

                    // Normalise to int so that strict comparisons against integer
                    // page uids match and getTree() receives an integer uid.
                    $pid = (int) $pid;

                    // default is "page only" = "depth=0"; a trailing "+" without a number means depth 99
                    if (empty($depth)) {
                        $depth = stristr($excludePart, '+') ? 99 : 0;
                    }
                    $depth = (int) $depth;

                    $pidList[] = $pid;

                    if ($depth > 0) {
                        if (empty($treeCache[$pid][$depth])) {
                            $tree->reset();
                            $tree->getTree($pid, $depth);
                            $treeCache[$pid][$depth] = $tree->tree;
                        }

                        foreach ($treeCache[$pid][$depth] as $data) {
                            $pidList[] = (int) $data['row']['uid'];
                        }
                    }
                }
            }

            $expandedExcludeStringCache[$excludeString] = array_unique($pidList);
        }

        return $expandedExcludeStringCache[$excludeString];
    }
1430
1431
    /**
     * Create the rows for display of the page tree.
     * For each page a number of rows are shown displaying GET variable configuration.
     *
     * One table row is produced per crawler configuration that applies to the
     * page. Pages listed in a configuration's "exclude" setting get a short
     * "excluded" row instead. When no configuration applies, a single
     * "No entries" row is returned, optionally followed by the skip message
     * (presumably filled by getUrlsForPageRow() via reference - confirm there).
     *
     * @param array $pageRow Page record; 'uid' is read here and the row is passed on to
     *                       getUrlsForPageRow() and urlListFromUrlArray()
     * @param string $pageTitle Markup for the title cell; emitted as-is, so it must already be safe HTML
     * @return string HTML table rows
     */
    public function drawURLs_addRowsForPage(array $pageRow, string $pageTitle): string
    {
        $skipMessage = '';

        // Get list of configurations
        $configurations = $this->getUrlsForPageRow($pageRow, $skipMessage);

        if (! empty($this->incomingConfigurationSelection)) {
            // remove configuration that does not match the current selection
            foreach (array_keys($configurations) as $confKey) {
                if (! in_array($confKey, $this->incomingConfigurationSelection, true)) {
                    unset($configurations[$confKey]);
                }
            }
        }

        // Traverse parameter combinations:
        $c = 0;
        $content = '';
        if (! empty($configurations)) {
            $configurationCount = count($configurations);

            foreach ($configurations as $confKey => $confArray) {
                // Optional sub configuration keys may be absent; read them through this guarded copy.
                $subCfg = $confArray['subCfg'] ?? [];

                // PHP turns numeric-string array keys into integers; htmlspecialchars()
                // needs a string because this file runs with strict_types.
                $confKeyHtml = htmlspecialchars((string) $confKey);

                // Title column (only on the first row, spanning all configuration rows):
                if (! $c) {
                    $titleClm = '<td rowspan="' . $configurationCount . '">' . $pageTitle . '</td>';
                } else {
                    $titleClm = '';
                }

                // Compare as integers on both sides so the strict check does not depend on
                // whether uids arrive as "5" or 5.
                $excludedPageIds = array_map('intval', $this->expandExcludeString($subCfg['exclude'] ?? ''));

                if (! in_array((int) $pageRow['uid'], $excludedPageIds, true)) {
                    // URL list:
                    $urlList = $this->urlListFromUrlArray(
                        $confArray,
                        $pageRow,
                        $this->scheduledTime,
                        $this->reqMinute,
                        $this->submitCrawlUrls,
                        $this->downloadCrawlUrls,
                        $this->duplicateTrack,
                        $this->downloadUrls,
                        $this->incomingProcInstructions // if empty the urls won't be filtered by processing instructions
                    );

                    // Expanded parameters:
                    $paramExpanded = '';
                    $calcAccu = [];
                    $calcRes = 1;
                    foreach ($confArray['paramExpanded'] ?? [] as $gVar => $gVal) {
                        $valueCount = count($gVal);
                        $paramExpanded .= '
                            <tr>
                                <td>' . htmlspecialchars('&' . $gVar . '=') . '<br/>' .
                            '(' . $valueCount . ')' .
                            '</td>
                                <td nowrap="nowrap">' . nl2br(htmlspecialchars(implode(chr(10), $gVal))) . '</td>
                            </tr>
                        ';
                        $calcRes *= $valueCount;
                        $calcAccu[] = $valueCount;
                    }
                    $paramExpanded = '<table>' . $paramExpanded . '</table>';
                    $paramExpanded .= 'Comb: ' . implode('*', $calcAccu) . '=' . $calcRes;

                    // Options (configuration values are plain text, so escape them for HTML output)
                    $optionValues = '';
                    if (! empty($subCfg['userGroups'])) {
                        $optionValues .= 'User Groups: ' . htmlspecialchars((string) $subCfg['userGroups']) . '<br/>';
                    }
                    if (! empty($subCfg['procInstrFilter'])) {
                        $optionValues .= 'ProcInstr: ' . htmlspecialchars((string) $subCfg['procInstrFilter']) . '<br/>';
                    }

                    // Parsed parameters, one "&name=value" pair per line
                    $parsedParameters = trim(str_replace(
                        '&',
                        chr(10) . '&',
                        GeneralUtility::implodeArrayForUrl('', $confArray['paramParsed'] ?? [])
                    ));

                    // Compile row:
                    $content .= '
                        <tr>
                            ' . $titleClm . '
                            <td>' . $confKeyHtml . '</td>
                            <td>' . nl2br(htmlspecialchars(rawurldecode($parsedParameters))) . '</td>
                            <td>' . $paramExpanded . '</td>
                            <td nowrap="nowrap">' . $urlList . '</td>
                            <td nowrap="nowrap">' . $optionValues . '</td>
                            <td nowrap="nowrap">' . DebugUtility::viewArray($subCfg['procInstrParams.'] ?? null) . '</td>
                        </tr>';
                } else {
                    $content .= '<tr>
                            ' . $titleClm . '
                            <td>' . $confKeyHtml . '</td>
                            <td colspan="5"><em>No entries</em> (Page is excluded in this configuration)</td>
                        </tr>';
                }

                $c++;
            }
        } else {
            $message = ! empty($skipMessage) ? ' (' . $skipMessage . ')' : '';

            // Compile row:
            $content .= '
                <tr>
                    <td>' . $pageTitle . '</td>
                    <td colspan="6"><em>No entries</em>' . $message . '</td>
                </tr>';
        }

        return $content;
    }
1541
1542
    /*****************************
1543
     *
1544
     * CLI functions
1545
     *
1546
     *****************************/
1547
1548
    /**
     * Running the functionality of the CLI (crawling URLs from queue)
     *
     * Runs the CLI hooks, cleans up the queue, fetches up to $countInARun queue
     * entries, assigns them to the current process id and reads them one by one.
     *
     * @param int $countInARun Maximum number of queue entries fetched for this run
     * @param int $sleepTime Pause after each processed entry, passed to usleep()
     * @param int $sleepAfterFinish Pause after the batch, passed to sleep()
     * @return int Bit mask built from the readUrl() results and the CLI_STATUS_* flags
     */
    public function CLI_run(int $countInARun, int $sleepTime, int $sleepAfterFinish): int
    {
        $status = 0;
        $processedEntries = 0;

        // Hooks run before anything else
        $this->CLI_runHooks();

        // Clean up the queue
        $this->queueRepository->cleanupQueue();

        // Select entries
        $queueRows = $this->queueRepository->fetchRecordsToBeCrawled($countInARun);

        if (empty($queueRows)) {
            $this->CLI_debug('Nothing within queue which needs to be processed (' . $this->CLI_buildProcessId() . ')');

            return $status;
        }

        $queueIds = [];
        foreach ($queueRows as $queueRow) {
            $queueIds[] = $queueRow['qid'];
        }

        $processId = $this->CLI_buildProcessId();

        // Remember how many entries were assigned so the processed amount can be determined later
        $assignedEntries = $this->queueRepository->updateProcessIdAndSchedulerForQueueIds($queueIds, $processId);
        $this->processRepository->updateProcessAssignItemsCount($assignedEntries, $processId);

        // Not every selected entry could be claimed: give up on this batch
        if (count($queueIds) !== $assignedEntries) {
            $this->CLI_debug('Nothing processed due to multi-process collision (' . $this->CLI_buildProcessId() . ')');

            return $status | self::CLI_STATUS_ABORTED;
        }

        foreach ($queueRows as $queueRow) {
            $status |= $this->readUrl($queueRow['qid']);

            ++$processedEntries;
            usleep($sleepTime); // Just to relax the system

            // If the CLI got disabled while this batch was running, return right away;
            // the process is NOT marked as ended.
            if ($this->getDisabled()) {
                return $status | self::CLI_STATUS_ABORTED;
            }

            if (! $this->processRepository->isProcessActive($this->CLI_buildProcessId())) {
                $this->CLI_debug('conflict / timeout (' . $this->CLI_buildProcessId() . ')');
                $status |= self::CLI_STATUS_ABORTED;
                // possible timeout
                break;
            }
        }

        sleep($sleepAfterFinish);

        $this->CLI_debug('Rows: ' . $processedEntries . ' (' . $this->CLI_buildProcessId() . ')');

        if ($processedEntries > 0) {
            $status |= self::CLI_STATUS_PROCESSED;
        }

        return $status;
    }
1616
1617
    /**
     * Activate hooks
     *
     * Instantiates every class registered under
     * $GLOBALS['TYPO3_CONF_VARS']['EXTCONF']['crawler']['cli_hooks'] and calls
     * its crawler_init() method with this controller.
     */
    public function CLI_runHooks(): void
    {
        $registeredHooks = $GLOBALS['TYPO3_CONF_VARS']['EXTCONF']['crawler']['cli_hooks'] ?? [];

        foreach ($registeredHooks as $hookReference) {
            $hook = GeneralUtility::makeInstance($hookReference);
            if (! is_object($hook)) {
                continue;
            }

            $hook->crawler_init($this);
        }
    }
1629
1630
    /**
     * Try to acquire a new process with the given id
     * also performs some auto-cleanup for orphan processes
     *
     * Active processes whose ttl lies in the past count as orphans and are
     * released at the end; the remaining ones count against the configured
     * "processLimit".
     *
     * @param string $id identification string for the process
     * @return boolean true if a process record was inserted, false if the limit is reached
     *                 or the system process id could not be determined
     * @todo preemption might not be the most elegant way to clean up
     */
    public function CLI_checkAndAcquireNewProcess($id)
    {
        $queryBuilder = GeneralUtility::makeInstance(ConnectionPool::class)->getQueryBuilderForTable($this->tableName);

        $systemProcessId = getmypid();
        if ($systemProcessId < 1) {
            return false;
        }

        $runningProcesses = 0;
        $expiredProcessIds = [];

        $statement = $queryBuilder
            ->select('process_id', 'ttl')
            ->from('tx_crawler_process')
            ->where(
                'active = 1 AND deleted = 0'
            )
            ->execute();

        $now = $this->getCurrentTime();

        while ($processRow = $statement->fetch()) {
            if ($processRow['ttl'] < $now) {
                // ttl exceeded: treat as orphan
                $expiredProcessIds[] = $processRow['process_id'];
                continue;
            }

            ++$runningProcesses;
        }

        $processLimit = (int) $this->extensionSettings['processLimit'];

        // only add a new process while the number of active ones is below the limit
        $acquired = $runningProcesses < $processLimit;

        if ($acquired) {
            $this->CLI_debug('add process ' . $this->CLI_buildProcessId() . ' (' . ($runningProcesses + 1) . '/' . $processLimit . ')');

            GeneralUtility::makeInstance(ConnectionPool::class)
                ->getConnectionForTable('tx_crawler_process')
                ->insert(
                    'tx_crawler_process',
                    [
                        'process_id' => $id,
                        'active' => 1,
                        'ttl' => $now + (int) $this->extensionSettings['processMaxRunTime'],
                        'system_process_id' => $systemProcessId,
                    ]
                );
        } else {
            $this->CLI_debug('Processlimit reached (' . $runningProcesses . '/' . $processLimit . ')');
        }

        $this->processRepository->deleteProcessesMarkedAsDeleted();
        $this->CLI_releaseProcesses($expiredProcessIds);

        return $acquired;
    }
1691
1692
    /**
     * Release a process and the required resources
     *
     * @param mixed $releaseIds string with a single process-id or array with multiple process-ids
     * @return boolean false if there was nothing to release, true otherwise
     */
    public function CLI_releaseProcesses($releaseIds)
    {
        $queryBuilder = GeneralUtility::makeInstance(ConnectionPool::class)->getQueryBuilderForTable($this->tableName);

        // accept a single id as well as a list of ids
        $releaseIds = is_array($releaseIds) ? $releaseIds : [$releaseIds];

        if ($releaseIds === []) {
            // nothing to release
            return false;
        }

        // some kind of 2nd chance algo - this way you need at least 2 processes to have a real cleanup
        // this ensures that a single process can't mess up the entire process table

        // detach queue entries from processes that are no longer active
        $queryBuilder
            ->update($this->tableName, 'q')
            ->set('q.process_scheduled', 0)
            ->set('q.process_id', '')
            ->where(
                'q.process_id IN(SELECT p.process_id FROM tx_crawler_process as p WHERE p.active = 0)'
            )
            ->execute();

        // FIXME: Not entirely sure that this is equivalent to the previous version
        // The same builder is used for the next statement, so drop the SET parts collected above.
        $queryBuilder->resetQueryPart('set');

        // reset the system process id of inactive processes that still have unprocessed queue entries
        $queryBuilder
            ->update('tx_crawler_process')
            ->set('system_process_id', 0)
            ->where(
                $queryBuilder->expr()->eq('active', 0),
                'process_id IN(SELECT q.process_id FROM tx_crawler_queue as q WHERE q.exec_time = 0)'
            )
            ->execute();

        $this->processRepository->markRequestedProcessesAsNotActive($releaseIds);
        $this->queueRepository->unsetProcessScheduledAndProcessIdForQueueEntries($releaseIds);

        return true;
    }
1741 1
1742
    /**
     * Returns the id of the current process, creating it on first use.
     *
     * The id is a short md5 hash of the current microtime and is kept in
     * $this->processID, so repeated calls on this instance return the same value.
     *
     * @return string  the ID
     */
    public function CLI_buildProcessId()
    {
        if (! $this->processID) {
            // microtime(true) returns a float; cast explicitly because this file
            // declares strict_types and the value is hashed as a string.
            $this->processID = GeneralUtility::shortMD5((string) microtime(true));
        }

        return $this->processID;
    }
1754
1755
    /**
     * Prints a message to the stdout (only if debug-mode is enabled)
     *
     * Debug mode is the "processDebug" extension setting; the output is
     * flushed right after the message has been written.
     *
     * @param string $msg the message
     */
    public function CLI_debug($msg): void
    {
        $debugEnabled = (int) $this->extensionSettings['processDebug'];
        if (! $debugEnabled) {
            return;
        }

        echo $msg . "\n";
        flush();
    }
1767 1
1768
    /**
     * Cleans up entries that stayed for too long in the queue. These are:
     * - processed entries older than the "cleanUpProcessedAge" setting (in days)
     * - scheduled entries older than the "cleanUpScheduledAge" setting (in days)
     *
     * @deprecated
     */
    public function cleanUpOldQueueEntries(): void
    {
        $now = time();

        // the settings are given in days; 86400 = 24*60*60 seconds
        $processedBefore = $now - $this->extensionSettings['cleanUpProcessedAge'] * 86400;
        $scheduledBefore = $now - $this->extensionSettings['cleanUpScheduledAge'] * 86400;

        $this->flushQueue(
            '(exec_time<>0 AND exec_time<' . $processedBefore . ') OR scheduled<=' . $scheduledBefore
        );
    }
1784 5
1785
    /**
     * Removes queue entries
     *
     * Before deleting, the matching entries are collected per set_id and a
     * SIGNAL_QUEUE_ENTRY_FLUSH signal is emitted for every set with the
     * affected rows (qid, set_id) as payload.
     *
     * Each statement uses its own query builder so that no state (FROM parts,
     * parameters) leaks from one statement into the next.
     *
     * @param string $where SQL related filter for the entries which should be removed;
     *                      used verbatim as WHERE clause, an empty value matches all entries
     *
     * @deprecated
     */
    protected function flushQueue($where = ''): void
    {
        $realWhere = strlen((string) $where) > 0 ? $where : '1=1';

        $groups = $this->getQueryBuilder($this->tableName)
            ->selectLiteral('DISTINCT set_id')
            ->from($this->tableName)
            ->where($realWhere)
            ->execute()
            ->fetchAll();

        foreach ($groups as $group) {
            $subSetQueryBuilder = $this->getQueryBuilder($this->tableName);
            $subSet = $subSetQueryBuilder
                ->select('qid', 'set_id')
                ->from($this->tableName)
                ->where(
                    $realWhere,
                    $subSetQueryBuilder->expr()->eq(
                        'set_id',
                        // bind the value instead of inlining it into the SQL
                        $subSetQueryBuilder->createNamedParameter($group['set_id'], \PDO::PARAM_INT)
                    )
                )
                ->execute()
                ->fetchAll();

            $payLoad = ['subSet' => $subSet];
            SignalSlotUtility::emitSignal(
                self::class,
                SignalSlotUtility::SIGNAL_QUEUE_ENTRY_FLUSH,
                $payLoad
            );
        }

        $this->getQueryBuilder($this->tableName)
            ->delete($this->tableName)
            ->where($realWhere)
            ->execute();
    }
1830
1831
    /**
     * This method determines duplicates for a queue entry with the same parameters and this timestamp.
     * If the timestamp is not in the future, it looks for unprocessed queue entries scheduled up to now
     * (with the "enableTimeslot" setting, entries scheduled within 100 seconds around now match as well).
     * If the timestamp is in the future, the queued entry must have exactly the same timestamp.
     *
     * Only entries without exec_time and without process_id are considered; they must match
     * the given page_id and parameters_hash.
     *
     * @param int $tstamp
     * @param array $fieldArray Read keys: 'page_id', 'parameters_hash'
     *
     * @return array List of qid values of the duplicates found
     */
    protected function getDuplicateRowsIfExist($tstamp, $fieldArray)
    {
        $now = $this->getCurrentTime();

        $queryBuilder = GeneralUtility::makeInstance(ConnectionPool::class)->getQueryBuilderForTable($this->tableName);
        $queryBuilder
            ->select('qid')
            ->from('tx_crawler_queue');

        if ($tstamp <= $now) {
            // entry is scheduled with "now"
            if ($this->extensionSettings['enableTimeslot']) {
                $slotStart = $now - 100;
                $slotEnd = $now + 100;
                $queryBuilder
                    ->where('scheduled BETWEEN ' . $slotStart . ' AND ' . $slotEnd)
                    ->orWhere($queryBuilder->expr()->lte('scheduled', $now));
            } else {
                $queryBuilder->where($queryBuilder->expr()->lte('scheduled', $now));
            }
        } elseif ($tstamp > $now) {
            // entries with a timestamp in the future need to have the same schedule time
            $queryBuilder->where($queryBuilder->expr()->eq('scheduled', $tstamp));
        }

        $pageIdParameter = $queryBuilder->createNamedParameter($fieldArray['page_id'], \PDO::PARAM_INT);
        $hashParameter = $queryBuilder->createNamedParameter($fieldArray['parameters_hash'], \PDO::PARAM_STR);

        $queryBuilder
            ->andWhere('NOT exec_time')
            ->andWhere('NOT process_id')
            ->andWhere($queryBuilder->expr()->eq('page_id', $pageIdParameter))
            ->andWhere($queryBuilder->expr()->eq('parameters_hash', $hashParameter));

        $statement = $queryBuilder->execute();

        $duplicateQueueIds = [];
        while ($queueRow = $statement->fetch()) {
            $duplicateQueueIds[] = $queueRow['qid'];
        }

        return $duplicateQueueIds;
    }
1891 6
1892 6
    /**
     * Returns a md5 hash generated from a serialized configuration array.
     *
     * The 'paramExpanded' and 'URLs' entries are left out of the hash.
     *
     * @return string
     */
    protected function getConfigurationHash(array $configuration)
    {
        $ignoredKeys = ['paramExpanded' => true, 'URLs' => true];

        // array_diff_key() keeps the order and keys of the remaining entries
        $hashRelevantConfiguration = array_diff_key($configuration, $ignoredKeys);

        return md5(serialize($hashRelevantConfiguration));
    }
1903
1904 8
    /**
     * Build a URL from a Page and the Query String. If the page has a Site configuration, it can be built by using
     * the Site instance.
     *
     * With a Site, the "id" parameter is dropped, "L" is handed to the router as "_language", and host, scheme,
     * port and (if present) user info of $alternativeBaseUrl replace those of the generated URI.
     * Without a Site, the URL is assembled as <base>/index.php<queryString> with an appended cHash.
     *
     * @param int $pageId Page uid the URL is built for
     * @param string $queryString Query string; leading "?" / "&" are stripped in the Site branch
     * @param string|null $alternativeBaseUrl Optional base URL overriding the generated one
     * @param int $httpsOrHttp see tx_crawler_configuration.force_ssl (-1 forces http, 1 forces https,
     *                         any other value keeps the scheme)
     * @throws \TYPO3\CMS\Core\Exception\SiteNotFoundException
     * @throws \TYPO3\CMS\Core\Routing\InvalidRouteArgumentsException
     */
    protected function getUrlFromPageAndQueryParameters(int $pageId, string $queryString, ?string $alternativeBaseUrl, int $httpsOrHttp): UriInterface
    {
        $site = GeneralUtility::makeInstance(SiteMatcher::class)->matchByPageId($pageId);
        if ($site instanceof Site) {
            $queryString = ltrim($queryString, '?&');
            $queryParts = [];
            parse_str($queryString, $queryParts);
            unset($queryParts['id']);
            // workaround as long as we don't have native language support in crawler configurations
            if (isset($queryParts['L'])) {
                $queryParts['_language'] = $queryParts['L'];
                unset($queryParts['L']);
                // The result is not needed; the lookup is kept so that any error it raises
                // (presumably for unknown language ids - confirm in Site) still surfaces here.
                $site->getLanguageById((int) $queryParts['_language']);
            }
            $url = $site->getRouter()->generateUri($pageId, $queryParts);
            if (! empty($alternativeBaseUrl)) {
                $alternativeBaseUrl = new Uri($alternativeBaseUrl);
                $url = $url->withHost($alternativeBaseUrl->getHost());
                $url = $url->withScheme($alternativeBaseUrl->getScheme());
                $url = $url->withPort($alternativeBaseUrl->getPort());
                if ($userInfo = $alternativeBaseUrl->getUserInfo()) {
                    $url = $url->withUserInfo($userInfo);
                }
            }
        } else {
            // Technically this is not possible with site handling, but kept for backwards-compatibility reasons
            // Once EXT:crawler is v10-only compatible, this should be removed completely
            $baseUrl = ($alternativeBaseUrl ?: GeneralUtility::getIndpEnv('TYPO3_SITE_URL'));
            $cacheHashCalculator = GeneralUtility::makeInstance(CacheHashCalculator::class);
            $queryString .= '&cHash=' . $cacheHashCalculator->generateForParameters($queryString);
            $url = rtrim($baseUrl, '/') . '/index.php' . $queryString;
            $url = new Uri($url);
        }

        if ($httpsOrHttp === -1) {
            $url = $url->withScheme('http');
        } elseif ($httpsOrHttp === 1) {
            $url = $url->withScheme('https');
        }

        return $url;
    }
1956
1957
    /**
     * Makes sure the value at index 1 is not larger than the value at index 2.
     *
     * @param array $reg Array with the two values to order at index 1 and 2; other entries are left untouched
     * @return array The array with index 1 and 2 exchanged if the first was larger
     */
    protected function swapIfFirstIsLargerThanSecond(array $reg): array
    {
        // already in order: nothing to do
        if (! ($reg[1] > $reg[2])) {
            return $reg;
        }

        [$reg[1], $reg[2]] = [$reg[2], $reg[1]];

        return $reg;
    }
1968 1
1969 1
    /**
     * Returns the backend user, taking it from $GLOBALS['BE_USER'] on first use.
     *
     * @return BackendUserAuthentication
     */
    private function getBackendUser()
    {
        // Make sure the _cli_ user is loaded
        Bootstrap::initializeBackendAuthentication();

        if ($this->backendUser !== null) {
            return $this->backendUser;
        }

        $this->backendUser = $GLOBALS['BE_USER'];

        return $this->backendUser;
    }
1981 12
1982
    /**
     * Get querybuilder for given table
     *
     * @param string $table Table name handed to the connection pool
     * @return \TYPO3\CMS\Core\Database\Query\QueryBuilder
     */
    private function getQueryBuilder(string $table)
    {
        $connectionPool = GeneralUtility::makeInstance(ConnectionPool::class);

        return $connectionPool->getQueryBuilderForTable($table);
    }
1991
}
1992