Passed
Push — remove-deprecations ( 992d8c...03dd24 )
by Tomas Norre
20:37 queued 17:40
created

CrawlerController::getUrlsForPageId()   C

Complexity

Conditions 16
Paths 96

Size

Total Lines 93
Code Lines 51

Duplication

Lines 0
Ratio 0 %

Code Coverage

Tests 46
CRAP Score 16.0585

Importance

Changes 3
Bugs 0 Features 0
Metric Value
cc 16
eloc 51
c 3
b 0
f 0
nc 96
nop 1
dl 0
loc 93
ccs 46
cts 49
cp 0.9388
crap 16.0585
rs 5.5666

How to fix   Long Method    Complexity   

Long Method

Small methods make your code easier to understand, in particular if combined with a good name. Besides, if your method is small, finding a good name is usually much easier.

For example, if you find yourself adding comments to a method's body, this is usually a good sign to extract the commented part to a new method, and use the comment as a starting point when coming up with a good name for this new method.

Commonly applied refactorings include:

1
<?php
2
3
declare(strict_types=1);
4
5
namespace AOE\Crawler\Controller;
6
7
/***************************************************************
8
 *  Copyright notice
9
 *
10
 *  (c) 2020 AOE GmbH <[email protected]>
11
 *
12
 *  All rights reserved
13
 *
14
 *  This script is part of the TYPO3 project. The TYPO3 project is
15
 *  free software; you can redistribute it and/or modify
16
 *  it under the terms of the GNU General Public License as published by
17
 *  the Free Software Foundation; either version 3 of the License, or
18
 *  (at your option) any later version.
19
 *
20
 *  The GNU General Public License can be found at
21
 *  http://www.gnu.org/copyleft/gpl.html.
22
 *
23
 *  This script is distributed in the hope that it will be useful,
24
 *  but WITHOUT ANY WARRANTY; without even the implied warranty of
25
 *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
26
 *  GNU General Public License for more details.
27
 *
28
 *  This copyright notice MUST APPEAR in all copies of the script!
29
 ***************************************************************/
30
31
use AOE\Crawler\Configuration\ExtensionConfigurationProvider;
32
use AOE\Crawler\Domain\Repository\ConfigurationRepository;
33
use AOE\Crawler\Domain\Repository\ProcessRepository;
34
use AOE\Crawler\Domain\Repository\QueueRepository;
35
use AOE\Crawler\QueueExecutor;
36
use AOE\Crawler\Utility\SignalSlotUtility;
37
use Psr\Http\Message\UriInterface;
38
use Psr\Log\LoggerAwareInterface;
39
use Psr\Log\LoggerAwareTrait;
40
use TYPO3\CMS\Backend\Tree\View\PageTreeView;
41
use TYPO3\CMS\Backend\Utility\BackendUtility;
42
use TYPO3\CMS\Core\Authentication\BackendUserAuthentication;
43
use TYPO3\CMS\Core\Compatibility\PublicMethodDeprecationTrait;
44
use TYPO3\CMS\Core\Core\Bootstrap;
45
use TYPO3\CMS\Core\Core\Environment;
46
use TYPO3\CMS\Core\Database\Connection;
47
use TYPO3\CMS\Core\Database\ConnectionPool;
48
use TYPO3\CMS\Core\Database\Query\Restriction\DeletedRestriction;
49
use TYPO3\CMS\Core\Http\Uri;
50
use TYPO3\CMS\Core\Imaging\Icon;
51
use TYPO3\CMS\Core\Imaging\IconFactory;
52
use TYPO3\CMS\Core\Routing\SiteMatcher;
53
use TYPO3\CMS\Core\Site\Entity\Site;
54
use TYPO3\CMS\Core\Type\Bitmask\Permission;
55
use TYPO3\CMS\Core\TypoScript\Parser\TypoScriptParser;
56
use TYPO3\CMS\Core\Utility\DebugUtility;
57
use TYPO3\CMS\Core\Utility\GeneralUtility;
58
use TYPO3\CMS\Core\Utility\MathUtility;
59
use TYPO3\CMS\Extbase\Object\ObjectManager;
60
use TYPO3\CMS\Frontend\Page\CacheHashCalculator;
61
use TYPO3\CMS\Frontend\Page\PageRepository;
62
63
/**
64
 * Class CrawlerController
65
 *
66
 * @package AOE\Crawler\Controller
67
 */
68
class CrawlerController implements LoggerAwareInterface
69
{
70
    use LoggerAwareTrait;
71
    use PublicMethodDeprecationTrait;
72
73
    public const CLI_STATUS_NOTHING_PROCCESSED = 0;
74
75
    public const CLI_STATUS_REMAIN = 1; //queue not empty
76
77
    public const CLI_STATUS_PROCESSED = 2; //(some) queue items where processed
78
79
    public const CLI_STATUS_ABORTED = 4; //instance didn't finish
80
81
    public const CLI_STATUS_POLLABLE_PROCESSED = 8;
82
83
    /**
84
     * @var integer
85
     */
86
    public $setID = 0;
87
88
    /**
89
     * @var string
90
     */
91
    public $processID = '';
92
93
    /**
94
     * @var array
95
     */
96
    public $duplicateTrack = [];
97
98
    /**
99
     * @var array
100
     */
101
    public $downloadUrls = [];
102
103
    /**
104
     * @var array
105
     */
106
    public $incomingProcInstructions = [];
107
108
    /**
109
     * @var array
110
     */
111
    public $incomingConfigurationSelection = [];
112
113
    /**
114
     * @var bool
115
     */
116
    public $registerQueueEntriesInternallyOnly = false;
117
118
    /**
119
     * @var array
120
     */
121
    public $queueEntries = [];
122
123
    /**
124
     * @var array
125
     */
126
    public $urlList = [];
127
128
    /**
129
     * @var array
130
     */
131
    public $extensionSettings = [];
132
133
    /**
134
     * Mount Point
135
     *
136
     * @var bool
137
     * Todo: Check what this is used for and adjust the type hint or code, as bool doesn't match the current code.
138
     */
139
    public $MP = false;
140
141
    /**
142
     * @var string
143
     */
144
    protected $processFilename;
145
146
    /**
147
     * Holds the internal access mode can be 'gui','cli' or 'cli_im'
148
     *
149
     * @var string
150
     */
151
    protected $accessMode;
152
153
    /**
154
     * @var QueueRepository
155
     */
156
    protected $queueRepository;
157
158
    /**
159
     * @var ProcessRepository
160
     */
161
    protected $processRepository;
162
163
    /**
164
     * @var ConfigurationRepository
165
     */
166
    protected $configurationRepository;
167
168
    /**
169
     * @var string
170
     */
171
    protected $tableName = 'tx_crawler_queue';
172
173
    /**
174
     * @var QueueExecutor
175
     */
176
    protected $queueExecutor;
177
178
    /**
179
     * @var int
180
     */
181
    protected $maximumUrlsToCompile = 10000;
182
183
    /**
184
     * @var IconFactory
185
     */
186
    protected $iconFactory;
187
188
    /**
189
     * @var string[]
190
     */
191
    private $deprecatedPublicMethods = [
0 ignored issues
show
introduced by
The private property $deprecatedPublicMethods is not used, and could be removed.
Loading history...
192
        'getLogEntriesForSetId' => 'Using crawlerController::getLogEntriesForSetId() is deprecated since 9.0.1 and will be removed in v11.x',
193
        'flushQueue' => 'Using CrawlerController::flushQueue() is deprecated since 9.0.1 and will be removed in v11.x, please use QueueRepository->flushQueue() instead.',
194
        'cleanUpOldQueueEntries' => 'Using CrawlerController::cleanUpOldQueueEntries() is deprecated since 9.0.1 and will be removed in v11.x, please use QueueRepository->cleanUpOldQueueEntries() instead.',
195
    ];
196
197
    /**
198
     * @var BackendUserAuthentication|null
199
     */
200
    private $backendUser;
201
202
    /**
203
     * @var integer
204
     */
205
    private $scheduledTime = 0;
206
207
    /**
208
     * @var integer
209
     */
210
    private $reqMinute = 0;
211
212
    /**
213
     * @var bool
214
     */
215
    private $submitCrawlUrls = false;
216
217
    /**
218
     * @var bool
219
     */
220
    private $downloadCrawlUrls = false;
221
222
    /************************************
223
     *
224
     * Getting URLs based on Page TSconfig
225
     *
226
     ************************************/
227
228 43
    public function __construct()
    {
        // Repositories and the queue executor are resolved through the Extbase ObjectManager.
        $objectManager = GeneralUtility::makeInstance(ObjectManager::class);
        $this->queueRepository = $objectManager->get(QueueRepository::class);
        $this->processRepository = $objectManager->get(ProcessRepository::class);
        $this->configurationRepository = $objectManager->get(ConfigurationRepository::class);
        $this->queueExecutor = $objectManager->get(QueueExecutor::class);
        $this->iconFactory = GeneralUtility::makeInstance(IconFactory::class);

        // The existence of this file is what getDisabled() checks.
        $this->processFilename = Environment::getVarPath() . '/lock/tx_crawler.proc';

        /** @var ExtensionConfigurationProvider $configurationProvider */
        $configurationProvider = GeneralUtility::makeInstance(ExtensionConfigurationProvider::class);
        $settings = $configurationProvider->getExtensionConfiguration();
        $this->extensionSettings = is_array($settings) ? $settings : [];

        // set defaults: a missing key is read as 0 so the fallbacks below apply without an undefined-index notice
        if (MathUtility::convertToPositiveInteger($this->extensionSettings['countInARun'] ?? 0) === 0) {
            $this->extensionSettings['countInARun'] = 100;
        }

        $this->extensionSettings['processLimit'] = MathUtility::forceIntegerInRange(
            $this->extensionSettings['processLimit'] ?? 0,
            1,
            99,
            1
        );
        $this->maximumUrlsToCompile = MathUtility::forceIntegerInRange(
            $this->extensionSettings['maxCompileUrls'] ?? 0,
            1,
            1000000000,
            10000
        );
    }
252
253
    /**
254
     * Method to get the accessMode, which can be gui, cli or cli_im
255
     *
256
     * @return string
257
     */
258 1
    public function getAccessMode()
    {
        // The property has no default value, so this is null until setAccessMode() was called.
        return $this->accessMode;
    }
262
263
    /**
264
     * @param string $accessMode
265
     */
266 1
    public function setAccessMode($accessMode): void
    {
        // Stored as given; the value is not validated here.
        $this->accessMode = $accessMode;
    }
270
271
    /**
272
     * Set disabled status to prevent processes from being processed
273
     *
274
     * @param bool $disabled (optional, defaults to true)
275
     */
276 2
    public function setDisabled($disabled = true): void
    {
        // Disabling creates an empty marker file at the process filename ...
        if ($disabled) {
            GeneralUtility::writeFile($this->processFilename, '');
            return;
        }

        // ... enabling removes that marker file again, if it is present.
        if (is_file($this->processFilename)) {
            unlink($this->processFilename);
        }
    }
286
287
    /**
288
     * Get disable status
289
     *
290
     * @return bool true if disabled
291
     */
292 2
    public function getDisabled()
    {
        // Disabled means: the marker file written by setDisabled(true) exists.
        return is_file($this->processFilename);
    }
296
297
    /**
298
     * @param string $filenameWithPath
299
     */
300 3
    public function setProcessFilename($filenameWithPath): void
    {
        // Overrides the marker file path that the constructor derived from Environment::getVarPath().
        $this->processFilename = $filenameWithPath;
    }
304
305
    /**
306
     * @return string
307
     */
308 1
    public function getProcessFilename()
    {
        // Path of the marker file used by setDisabled() / getDisabled().
        return $this->processFilename;
    }
312
313
    /**
314
     * Sets the extension settings (unserialized counterpart of $TYPO3_CONF_VARS['EXT']['extConf']['crawler']).
315
     */
316 12
    public function setExtensionSettings(array $extensionSettings): void
    {
        // Replaces the settings wholesale; the defaults applied in the constructor are not re-applied here.
        $this->extensionSettings = $extensionSettings;
    }
320
321
    /**
322
     * Check if the given page should be crawled
323
     *
324
     * @return false|string false if the page should be crawled (not excluded), otherwise the skip message explaining why it is skipped
325
     */
326 10
    public function checkIfPageShouldBeSkipped(array $pageRow)
    {
        // The checks run in a fixed order and the first one that matches decides the skip message.

        // if page is hidden
        if (! $this->extensionSettings['crawlHiddenPages'] && $pageRow['hidden']) {
            return 'Because page is hidden';
        }

        // doktypes 3 and 4 as well as everything from 199 upwards are never crawled
        if (GeneralUtility::inList('3,4', $pageRow['doktype']) || $pageRow['doktype'] >= 199) {
            return 'Because doktype is not allowed';
        }

        // doktypes excluded through registered exclude lists
        $excludeLists = $GLOBALS['TYPO3_CONF_VARS']['EXTCONF']['crawler']['excludeDoktype'] ?? [];
        foreach ($excludeLists as $listName => $doktypeList) {
            if (GeneralUtility::inList($doktypeList, $pageRow['doktype'])) {
                return 'Doktype was excluded by "' . $listName . '"';
            }
        }

        // veto hook
        $vetoHooks = $GLOBALS['TYPO3_CONF_VARS']['EXTCONF']['crawler']['pageVeto'] ?? [];
        foreach ($vetoHooks as $hookName => $hookFunction) {
            $hookParameters = [
                'pageRow' => $pageRow,
            ];
            // expects "false" if page is ok and "true" or a skipMessage if this page should _not_ be crawled
            $hookResult = GeneralUtility::callUserFunction($hookFunction, $hookParameters, $this);
            if ($hookResult === false) {
                continue;
            }

            // no need to execute other hooks if a previous one returned a veto
            return is_string($hookResult)
                ? $hookResult
                : 'Veto from hook "' . htmlspecialchars($hookName) . '"';
        }

        return false;
    }
379
380
    /**
381
     * Wrapper method for getUrlsForPageId()
382
     * It returns an array of configurations and no urls!
383
     *
384
     * @param array $pageRow Page record with at least dok-type and uid columns.
385
     * @param string $skipMessage
386
     * @return array
387
     * @see getUrlsForPageId()
388
     */
389 6
    public function getUrlsForPageRow(array $pageRow, &$skipMessage = '')
    {
        // Returns the configurations for the page, or [] with the reason written to $skipMessage.
        $message = $this->checkIfPageShouldBeSkipped($pageRow);

        if ($message !== false) {
            $skipMessage = $message;
            return [];
        }

        // getUrlsForPageId() declares an int parameter and this file uses strict_types, so a uid
        // that arrives as a string would raise a TypeError. urlListFromUrlArray() applies the same cast.
        $res = $this->getUrlsForPageId((int) $pageRow['uid']);
        $skipMessage = '';

        return $res;
    }
403
404
    /**
405
     * Creates a list of URLs from input array (and submits them to queue if asked for)
406
     * See Web > Info module script + "indexed_search"'s crawler hook-client using this!
407
     *
408
     * @param array $vv Information about URLs from pageRow to crawl.
409
     * @param array $pageRow Page row
410
     * @param int $scheduledTime Unix time to schedule indexing to, typically time()
411
     * @param int $reqMinute Number of requests per minute (creates the interleave between requests)
412
     * @param bool $submitCrawlUrls If set, submits the URLs to queue
413
     * @param bool $downloadCrawlUrls If set (and $submitCrawlUrls is false), fills $downloadUrls with entries
414
     * @param array $duplicateTrack Array which is passed by reference and contains an id per URL to ensure we do not crawl duplicates
415
     * @param array $downloadUrls Array which will be filled with URLS for download if flag is set.
416
     * @param array $incomingProcInstructions Array of processing instructions
417
     * @return string List of URLs (meant for display in backend module)
418
     */
419 4
    public function urlListFromUrlArray(
        array $vv,
        array $pageRow,
        $scheduledTime,
        $reqMinute,
        $submitCrawlUrls,
        $downloadCrawlUrls,
        array &$duplicateTrack,
        array &$downloadUrls,
        array $incomingProcInstructions
    ) {
        // Builds the HTML list of URLs for one configuration and, depending on the flags,
        // adds them to the queue or collects them in $downloadUrls.
        if (! is_array($vv['URLs'] ?? null)) {
            return 'ERROR - no URL generated';
        }
        $urlLog = [];
        $pageId = (int) $pageRow['uid'];
        $configurationHash = $this->getConfigurationHash($vv);
        $skipInnerCheck = $this->queueRepository->noUnprocessedQueueEntriesForPageWithConfigurationHashExist($pageId, $configurationHash);

        // Both values are part of the uniqueness key; a missing entry counts as an empty string.
        $procInstrFilter = $vv['subCfg']['procInstrFilter'] ?? '';
        $userGroups = $vv['subCfg']['userGroups'] ?? '';

        // The filter depends only on the configuration and the incoming instructions, not on the
        // single URL, so it is evaluated once. A failing filter yields an empty list.
        if (! $this->drawURLs_PIfilter($procInstrFilter, $incomingProcInstructions)) {
            return '';
        }

        // Seconds between two scheduled requests. A non-positive rate switches the interleave off
        // instead of dividing by zero.
        $secondsPerRequest = $reqMinute > 0 ? 60 / $reqMinute : 0;

        foreach ($vv['URLs'] as $urlQuery) {
            $url = (string) $this->getUrlFromPageAndQueryParameters(
                $pageId,
                $urlQuery,
                $vv['subCfg']['baseUrl'] ?? null,
                $vv['subCfg']['force_ssl'] ?? 0
            );

            // Create key by which to determine unique-ness:
            $uKey = $url . '|' . $userGroups . '|' . $procInstrFilter;

            if (isset($duplicateTrack[$uKey])) {
                // if the url key is registered just display it and do not resubmit it
                $urlLog[] = '<em><span class="text-muted">' . htmlspecialchars($url) . '</span></em>';
            } else {
                // Scheduled time, rounded down to the full minute:
                $schTime = $scheduledTime + round(count($duplicateTrack) * $secondsPerRequest);
                $schTime = intval($schTime / 60) * 60;
                $formattedDate = BackendUtility::datetime($schTime);
                $this->urlList[] = '[' . $formattedDate . '] ' . $url;
                $urlList = '[' . $formattedDate . '] ' . htmlspecialchars($url);

                // Submit for crawling!
                if ($submitCrawlUrls) {
                    // NOTE(review): the un-interleaved $scheduledTime is passed on, while the
                    // displayed time is $schTime — confirm this is intended.
                    $added = $this->addUrl(
                        $pageId,
                        $url,
                        $vv['subCfg'],
                        $scheduledTime,
                        $configurationHash,
                        $skipInnerCheck
                    );
                    if ($added === false) {
                        $urlList .= ' (URL already existed)';
                    }
                } elseif ($downloadCrawlUrls) {
                    $downloadUrls[$url] = $url;
                }
                $urlLog[] = $urlList;
            }
            $duplicateTrack[$uKey] = true;
        }

        return implode('<br>', $urlLog);
    }
486
487
    /**
488
     * Returns true if input processing instruction is among registered ones.
489
     *
490
     * @param string $piString PI to test
491
     * @param array $incomingProcInstructions Processing instructions
492
     * @return boolean
493
     */
494 5
    public function drawURLs_PIfilter($piString, array $incomingProcInstructions)
    {
        // Without incoming instructions nothing is filtered out.
        if ($incomingProcInstructions === []) {
            return true;
        }

        // Otherwise at least one incoming instruction has to be contained in the comma list $piString.
        $matched = false;
        foreach ($incomingProcInstructions as $instruction) {
            if (GeneralUtility::inList($piString, $instruction)) {
                $matched = true;
                break;
            }
        }

        return $matched;
    }
507
508 5
    public function getPageTSconfigForId($id): array
    {
        // Reads the page TSconfig for $id, or for the page id taken from $this->MP when it is set,
        // and lets registered hooks modify the result.
        if (! $this->MP) {
            $pageTSconfig = BackendUtility::getPagesTSconfig($id);
        } else {
            // TODO: Please check, the property is documented as bool but is split like a string here.
            // NOTE(review): presumably $this->MP has the shape "<id>-<id>" — confirm against callers.
            // The explicit casts avoid a TypeError in explode() under strict_types, and the fallback
            // avoids an undefined offset when there is no "-" in the value.
            $mountPointId = (int) (explode('-', (string) $this->MP)[1] ?? 0);
            $pageTSconfig = BackendUtility::getPagesTSconfig($mountPointId);
        }

        // Call a hook to alter configuration
        $hooks = $GLOBALS['TYPO3_CONF_VARS']['EXTCONF']['crawler']['getPageTSconfigForId'] ?? null;
        if (is_array($hooks)) {
            // pageTSConfig is handed over by reference so that hooks can change it in place
            $params = [
                'pageId' => $id,
                'pageTSConfig' => &$pageTSconfig,
            ];
            foreach ($hooks as $userFunc) {
                GeneralUtility::callUserFunction($userFunc, $params, $this);
            }
        }

        return $pageTSconfig;
    }
530
531
    /**
532
     * This method returns an array of configurations.
533
     * Adds no urls!
534
     */
535 4
    public function getUrlsForPageId(int $pageId): array
    {
        // Collects the crawler configurations that apply to $pageId, keyed by configuration name.
        // Sources, in order: page TSconfig paramSets, then tx_crawler_configuration records from
        // the rootline (these never overwrite a paramSet of the same name), then the processUrls hooks.

        // Get page TSconfig for page ID
        $pageTSconfig = $this->getPageTSconfigForId($pageId);

        $res = [];

        // Fetch Crawler Configuration from pageTSconfig
        $crawlerCfg = $pageTSconfig['tx_crawler.']['crawlerCfg.']['paramSets.'] ?? [];
        foreach ($crawlerCfg as $key => $values) {
            if (! is_array($values)) {
                continue;
            }
            $key = str_replace('.', '', $key);
            // Sub configuration for a single configuration string:
            $subCfg = (array) ($crawlerCfg[$key . '.'] ?? []);
            $subCfg['key'] = $key;

            // Both options are optional in TSconfig; read them as strings so that a missing value
            // does not reach a string function as null (a TypeError under strict_types).
            $procInstrFilter = (string) ($subCfg['procInstrFilter'] ?? '');
            if ($procInstrFilter !== '') {
                $subCfg['procInstrFilter'] = implode(',', GeneralUtility::trimExplode(',', $procInstrFilter));
            }
            $pidsOnly = (string) ($subCfg['pidsOnly'] ?? '');
            $pidOnlyList = implode(',', GeneralUtility::trimExplode(',', $pidsOnly, true));

            // process configuration if it is not page-specific or if the specific page is the current page:
            // TODO: Check if $pidOnlyList can be kept as Array instead of imploded
            if ($pidsOnly !== '' && ! GeneralUtility::inList($pidOnlyList, strval($pageId))) {
                continue;
            }

            // Explode, process etc.:
            $res[$key] = [];
            $res[$key]['subCfg'] = $subCfg;
            $res[$key]['paramParsed'] = GeneralUtility::explodeUrl2Array((string) ($crawlerCfg[$key] ?? ''));
            $res[$key]['paramExpanded'] = $this->expandParameters($res[$key]['paramParsed'], $pageId);
            $res[$key]['origin'] = 'pagets';

            // recognize MP value
            $baseQuery = '?id=' . $pageId;
            if ($this->MP) {
                $baseQuery .= '&MP=' . $this->MP;
            }
            $res[$key]['URLs'] = $this->compileUrls($res[$key]['paramExpanded'], [$baseQuery]);
        }

        // Get configuration from tx_crawler_configuration records up the rootline
        $crawlerConfigurations = $this->configurationRepository->getCrawlerConfigurationRecordsFromRootLine($pageId);
        foreach ($crawlerConfigurations as $configurationRecord) {
            // check access to the configuration record
            $hasAccess = empty($configurationRecord['begroups'])
                || $this->getBackendUser()->isAdmin()
                || $this->hasGroupAccess($this->getBackendUser()->user['usergroup_cached_list'], $configurationRecord['begroups']);
            if (! $hasAccess) {
                continue;
            }

            // process configuration if it is not page-specific or if the specific page is the current page:
            // TODO: Check if $pidOnlyList can be kept as Array instead of imploded
            $pidsOnly = (string) $configurationRecord['pidsonly'];
            $pidOnlyList = implode(',', GeneralUtility::trimExplode(',', $pidsOnly, true));
            if ($pidsOnly !== '' && ! GeneralUtility::inList($pidOnlyList, strval($pageId))) {
                continue;
            }

            $key = $configurationRecord['name'];

            // don't overwrite previously defined paramSets
            if (isset($res[$key])) {
                continue;
            }

            /* @var $TSparserObject \TYPO3\CMS\Core\TypoScript\Parser\TypoScriptParser */
            $TSparserObject = GeneralUtility::makeInstance(TypoScriptParser::class);
            $TSparserObject->parse($configurationRecord['processing_instruction_parameters_ts']);

            $subCfg = [
                'procInstrFilter' => $configurationRecord['processing_instruction_filter'],
                'procInstrParams.' => $TSparserObject->setup,
                'baseUrl' => $configurationRecord['base_url'],
                'force_ssl' => (int) $configurationRecord['force_ssl'],
                'userGroups' => $configurationRecord['fegroups'],
                'exclude' => $configurationRecord['exclude'],
                'key' => $key,
            ];

            // pages listed in the record's exclude string get no entry for this configuration
            if (in_array($pageId, $this->expandExcludeString($subCfg['exclude']), true)) {
                continue;
            }

            $res[$key] = [];
            $res[$key]['subCfg'] = $subCfg;
            $res[$key]['paramParsed'] = GeneralUtility::explodeUrl2Array($configurationRecord['configuration']);
            $res[$key]['paramExpanded'] = $this->expandParameters($res[$key]['paramParsed'], $pageId);
            $res[$key]['URLs'] = $this->compileUrls($res[$key]['paramExpanded'], ['?id=' . $pageId]);
            $res[$key]['origin'] = 'tx_crawler_configuration_' . $configurationRecord['uid'];
        }

        // Hooks receive the result by reference and may change it in place.
        foreach ($GLOBALS['TYPO3_CONF_VARS']['EXTCONF']['crawler']['processUrls'] ?? [] as $func) {
            $params = [
                'res' => &$res,
            ];
            GeneralUtility::callUserFunction($func, $params, $this);
        }

        return $res;
    }
629
630
    /**
631
     * Find all configurations of subpages of a page
632
     * TODO: Write Functional Tests
633
     */
634 1
    public function getConfigurationsForBranch(int $rootid, int $depth): array
    {
        // Names of the paramSets declared in the page TSconfig of the root page.
        $names = [];
        $tsConfig = $this->getPageTSconfigForId($rootid);
        $paramSets = $tsConfig['tx_crawler.']['crawlerCfg.']['paramSets.'] ?? [];
        foreach ($paramSets as $setKey => $setValue) {
            if (is_array($setValue)) {
                // a single trailing dot is stripped from the key
                if (substr($setKey, -1) === '.') {
                    $names[] = substr($setKey, 0, -1);
                } else {
                    $names[] = $setKey;
                }
            }
        }

        // Page ids to look at: the rootline of the root page ...
        $pageIds = [];
        foreach (BackendUtility::BEgetRootLine($rootid) as $rootLinePage) {
            $pageIds[] = $rootLinePage['uid'];
        }

        // ... plus the pages of the tree below it, limited by the user's show permission.
        /* @var PageTreeView $pageTree */
        $pageTree = GeneralUtility::makeInstance(PageTreeView::class);
        $permsClause = $this->getBackendUser()->getPagePermsClause(Permission::PAGE_SHOW);
        $pageTree->init(empty($permsClause) ? '' : ('AND ' . $permsClause));
        $pageTree->getTree($rootid, $depth, '');
        foreach ($pageTree->tree as $treeNode) {
            $pageIds[] = $treeNode['row']['uid'];
        }

        // Names of all configuration records stored on one of these pages.
        $queryBuilder = $this->getQueryBuilder('tx_crawler_configuration');
        $pidConstraint = $queryBuilder->expr()->in(
            'pid',
            $queryBuilder->createNamedParameter($pageIds, Connection::PARAM_INT_ARRAY)
        );
        $statement = $queryBuilder
            ->select('name')
            ->from('tx_crawler_configuration')
            ->where($pidConstraint)
            ->execute();

        while ($record = $statement->fetch()) {
            $names[] = $record['name'];
        }

        return $names;
    }
673
674
    /**
675
     * Check if a user has access to an item
676
     * (e.g. get the group list of the current logged in user from $GLOBALS['TSFE']->gr_list)
677
     *
678
     * @param string $groupList Comma-separated list of (fe_)group UIDs from a user
679
     * @param string $accessList Comma-separated list of (fe_)group UIDs of the item to access
680
     * @return bool                 TRUE if at least one of the users group UIDs is in the access list or the access list is empty
681
     * @see \TYPO3\CMS\Frontend\Page\PageRepository::getMultipleGroupsWhereClause()
682
     */
683 3
    public function hasGroupAccess($groupList, $accessList)
    {
        // An empty access list means the item is not restricted.
        if (empty($accessList)) {
            return true;
        }

        // Otherwise one of the user's group uids has to appear in the access list.
        $granted = false;
        $userGroupUids = GeneralUtility::intExplode(',', $groupList);
        foreach ($userGroupUids as $userGroupUid) {
            if (GeneralUtility::inList($accessList, $userGroupUid)) {
                $granted = true;
                break;
            }
        }

        return $granted;
    }
695
696
    /**
697
     * Will expand the parameters configuration to individual values. This follows a certain syntax of the value of each parameter.
698
     * Syntax of values:
699
     * - Basically: If the value is wrapped in [...] it will be expanded according to the following syntax, otherwise the value is taken literally
700
     * - Configuration is split by "|" and the parts are processed individually and finally added together
701
     * - For each configuration part:
702
     *         - "[int]-[int]" = Integer range, will be expanded to all values in between, values included, starting from low to high (max. 1000). Example "1-34" or "-40--30"
703
     *         - "_TABLE:[TCA table name];[_PID:[optional page id, default is current page]];[_ENABLELANG:1]" = Look up of table records from PID, filtering out deleted records. Example "_TABLE:tt_content; _PID:123"
704
     *        _ENABLELANG:1 picks only original records without their language overlays
705
     *         - Default: Literal value
706
     *
707
     * @param array $paramArray Array with key (GET var name) and values (value of GET var which is configuration for expansion)
708
     * @param integer $pid Current page ID
709
     * @return array
710
     *
711
     * TODO: Write Functional Tests
712
     */
713 11
    public function expandParameters($paramArray, $pid)
714
    {
715
        // Traverse parameter names:
716 11
        foreach ($paramArray as $p => $v) {
717 11
            $v = trim($v);
718
719
            // If value is encapsulated in square brackets it means there are some ranges of values to find, otherwise the value is literal
720 11
            if (strpos($v, '[') === 0 && substr($v, -1) === ']') {
721
                // So, find the value inside brackets and reset the paramArray value as an array.
722 11
                $v = substr($v, 1, -1);
723 11
                $paramArray[$p] = [];
724
725
                // Explode parts and traverse them:
726 11
                $parts = explode('|', $v);
727 11
                foreach ($parts as $pV) {
728
729
                    // Look for integer range: (fx. 1-34 or -40--30 // reads minus 40 to minus 30)
730 11
                    if (preg_match('/^(-?[0-9]+)\s*-\s*(-?[0-9]+)$/', trim($pV), $reg)) {
731 1
                        $reg = $this->swapIfFirstIsLargerThanSecond($reg);
732
733
                        // Traverse range, add values:
734 1
                        $runAwayBrake = 1000; // Limit to size of range!
735 1
                        for ($a = $reg[1]; $a <= $reg[2]; $a++) {
736 1
                            $paramArray[$p][] = $a;
737 1
                            $runAwayBrake--;
738 1
                            if ($runAwayBrake <= 0) {
739
                                break;
740
                            }
741
                        }
742 10
                    } elseif (strpos(trim($pV), '_TABLE:') === 0) {
743
744
                        // Parse parameters:
745 6
                        $subparts = GeneralUtility::trimExplode(';', $pV);
746 6
                        $subpartParams = [];
747 6
                        foreach ($subparts as $spV) {
748 6
                            [$pKey, $pVal] = GeneralUtility::trimExplode(':', $spV);
749 6
                            $subpartParams[$pKey] = $pVal;
750
                        }
751
752
                        // Table exists:
753 6
                        if (isset($GLOBALS['TCA'][$subpartParams['_TABLE']])) {
754 6
                            $lookUpPid = isset($subpartParams['_PID']) ? intval($subpartParams['_PID']) : intval($pid);
755 6
                            $recursiveDepth = isset($subpartParams['_RECURSIVE']) ? intval($subpartParams['_RECURSIVE']) : 0;
756 6
                            $pidField = isset($subpartParams['_PIDFIELD']) ? trim($subpartParams['_PIDFIELD']) : 'pid';
757 6
                            $where = $subpartParams['_WHERE'] ?? '';
758 6
                            $addTable = $subpartParams['_ADDTABLE'] ?? '';
759
760 6
                            $fieldName = $subpartParams['_FIELD'] ? $subpartParams['_FIELD'] : 'uid';
761 6
                            if ($fieldName === 'uid' || $GLOBALS['TCA'][$subpartParams['_TABLE']]['columns'][$fieldName]) {
762 6
                                $queryBuilder = $this->getQueryBuilder($subpartParams['_TABLE']);
763
764 6
                                if ($recursiveDepth > 0) {
765
                                    /** @var \TYPO3\CMS\Core\Database\QueryGenerator $queryGenerator */
766 2
                                    $queryGenerator = GeneralUtility::makeInstance(\TYPO3\CMS\Core\Database\QueryGenerator::class);
767 2
                                    $pidList = $queryGenerator->getTreeList($lookUpPid, $recursiveDepth, 0, 1);
768 2
                                    $pidArray = GeneralUtility::intExplode(',', $pidList);
769
                                } else {
770 4
                                    $pidArray = [(string) $lookUpPid];
771
                                }
772
773 6
                                $queryBuilder->getRestrictions()
774 6
                                    ->removeAll()
775 6
                                    ->add(GeneralUtility::makeInstance(DeletedRestriction::class));
776
777
                                $queryBuilder
778 6
                                    ->select($fieldName)
779 6
                                    ->from($subpartParams['_TABLE'])
780 6
                                    ->where(
781 6
                                        $queryBuilder->expr()->in($pidField, $queryBuilder->createNamedParameter($pidArray, Connection::PARAM_INT_ARRAY)),
782 6
                                        $where
783
                                    );
784
785 6
                                if (! empty($addTable)) {
786
                                    // TODO: Check if this works as intended!
787
                                    $queryBuilder->add('from', $addTable);
788
                                }
789 6
                                $transOrigPointerField = $GLOBALS['TCA'][$subpartParams['_TABLE']]['ctrl']['transOrigPointerField'];
790
791 6
                                if ($subpartParams['_ENABLELANG'] && $transOrigPointerField) {
792
                                    $queryBuilder->andWhere(
793
                                        $queryBuilder->expr()->lte(
794
                                            $transOrigPointerField,
795
                                            0
796
                                        )
797
                                    );
798
                                }
799
800 6
                                $statement = $queryBuilder->execute();
801
802 6
                                $rows = [];
803 6
                                while ($row = $statement->fetch()) {
804 6
                                    $rows[$row[$fieldName]] = $row;
805
                                }
806
807 6
                                if (is_array($rows)) {
808 6
                                    $paramArray[$p] = array_merge($paramArray[$p], array_keys($rows));
809
                                }
810
                            }
811
                        }
812
                    } else { // Just add value:
813 4
                        $paramArray[$p][] = $pV;
814
                    }
815
                    // Hook for processing own expandParameters place holder
816 11
                    if (is_array($GLOBALS['TYPO3_CONF_VARS']['SC_OPTIONS']['crawler/class.tx_crawler_lib.php']['expandParameters'])) {
817
                        $_params = [
818
                            'pObj' => &$this,
819
                            'paramArray' => &$paramArray,
820
                            'currentKey' => $p,
821
                            'currentValue' => $pV,
822
                            'pid' => $pid,
823
                        ];
824
                        foreach ($GLOBALS['TYPO3_CONF_VARS']['SC_OPTIONS']['crawler/class.tx_crawler_lib.php']['expandParameters'] as $_funcRef) {
825
                            GeneralUtility::callUserFunction($_funcRef, $_params, $this);
826
                        }
827
                    }
828
                }
829
830
                // Make unique set of values and sort array by key:
831 11
                $paramArray[$p] = array_unique($paramArray[$p]);
832 11
                ksort($paramArray);
833
            } else {
834
                // Set the literal value as only value in array:
835 4
                $paramArray[$p] = [$v];
836
            }
837
        }
838
839 11
        return $paramArray;
840
    }
841
842
    /**
     * Compiling URLs from parameter array (output of expandParameters())
     * The number of URLs will be the multiplication of the number of parameter values for each key,
     * capped at $this->maximumUrlsToCompile per recursion level.
     *
     * @param array $paramArray Output of expandParameters(): Array with keys (GET var names) and for each an array of values
     * @param array $urls URLs accumulated in this array (for recursion)
     * @return array The URLs with one "&name=value" pair appended for every remaining parameter (empty values append nothing)
     */
    public function compileUrls($paramArray, array $urls)
    {
        if (empty($paramArray)) {
            return $urls;
        }

        // Shift first off stack. PHP stores numeric-looking names as integer keys, so the name is
        // cast back to string; rawurlencode() rejects integers under strict_types.
        reset($paramArray);
        $varName = (string) key($paramArray);
        $valueSet = array_shift($paramArray);

        // Traverse value set:
        $newUrls = [];
        foreach ($urls as $url) {
            foreach ($valueSet as $val) {
                // Leave BOTH loops once the limit is reached; breaking only the inner loop
                // would keep adding one more URL for every remaining base URL.
                if (count($newUrls) >= $this->maximumUrlsToCompile) {
                    break 2;
                }

                $value = (string) $val;
                $newUrls[] = $url . ($value !== '' ? '&' . rawurlencode($varName) . '=' . rawurlencode($value) : '');
            }
        }

        return $this->compileUrls($paramArray, $newUrls);
    }
873
874
    /************************************
875
     *
876
     * Crawler log
877
     *
878
     ************************************/
879
880
    /**
     * Return array of records from crawler queue for input page ID
     *
     * @param integer $id Page ID for which to look up log entries.
     * @param string $filter Filter: "all" => all entries, "pending" => all that is not yet run, "finished" => all complete ones
     * @param boolean $doFlush If TRUE, the queue repository is asked to flush the queue (using $filter) before the entries are read
     * @param boolean $doFullFlush If TRUE together with $doFlush, the flush is requested for "all" instead of $filter
     * @param integer $itemsPerPage Limit the amount of entries per page default is 10; values <= 0 disable the limit
     * @return array
     */
    public function getLogEntriesForPageId($id, $filter = '', $doFlush = false, $doFullFlush = false, $itemsPerPage = 10)
    {
        $queryBuilder = GeneralUtility::makeInstance(ConnectionPool::class)->getQueryBuilderForTable($this->tableName);
        $queryBuilder
            ->select('*')
            ->from($this->tableName)
            ->where(
                $queryBuilder->expr()->eq('page_id', $queryBuilder->createNamedParameter($id, \PDO::PARAM_INT))
            )
            ->orderBy('scheduled', 'DESC');

        // Narrow the selection by execution state; any other filter value selects all entries.
        switch ($filter) {
            case 'pending':
                $queryBuilder->andWhere($queryBuilder->expr()->eq('exec_time', 0));
                break;
            case 'finished':
                $queryBuilder->andWhere($queryBuilder->expr()->gt('exec_time', 0));
                break;
        }

        if ($doFlush) {
            $this->queueRepository->flushQueue($doFullFlush ? 'all' : $filter);
        }

        if ($itemsPerPage > 0) {
            $queryBuilder
                ->setMaxResults((int) $itemsPerPage);
        }

        return $queryBuilder->execute()->fetchAll();
    }
931
932
    /**
     * Fetches the crawler queue records that belong to one set ID, newest schedule first.
     *
     * @param int $set_id Set ID for which to look up log entries.
     * @param string $filter "pending" restricts to entries with exec_time = 0, "finished" to entries with exec_time > 0, anything else applies no restriction
     * @param bool $doFlush If TRUE, flushQueue() is called instead of reading and an empty array is returned
     * @param bool $doFullFlush If TRUE together with $doFlush, flushQueue() receives an empty condition
     * @param int $itemsPerPage Maximum number of entries to return, default is 10; values <= 0 disable the limit
     * @return array
     *
     * @deprecated
     */
    public function getLogEntriesForSetId(int $set_id, string $filter = '', bool $doFlush = false, bool $doFullFlush = false, int $itemsPerPage = 10)
    {
        $logQuery = GeneralUtility::makeInstance(ConnectionPool::class)->getQueryBuilderForTable($this->tableName);
        $logQuery->select('*');
        $logQuery->from($this->tableName);
        $logQuery->where(
            $logQuery->expr()->eq('set_id', $logQuery->createNamedParameter($set_id, \PDO::PARAM_INT))
        );
        $logQuery->orderBy('scheduled', 'DESC');

        // A separate AND-composite collects the same constraints for the flush call below.
        $flushExpr = GeneralUtility::makeInstance(ConnectionPool::class)
            ->getConnectionForTable($this->tableName)
            ->getExpressionBuilder();
        $flushConstraints = $flushExpr->andX();
        $flushWhere = '';

        if ($filter === 'pending') {
            $logQuery->andWhere($logQuery->expr()->eq('exec_time', 0));
            $flushWhere = $flushConstraints->add($flushExpr->eq('exec_time', 0));
        } elseif ($filter === 'finished') {
            $logQuery->andWhere($logQuery->expr()->gt('exec_time', 0));
            $flushWhere = $flushConstraints->add($flushExpr->gt('exec_time', 0));
        }

        if ($doFlush) {
            $flushWhere = $flushConstraints->add($flushExpr->eq('set_id', (int) $set_id));
            $this->flushQueue($doFullFlush ? '' : $flushWhere);

            return [];
        }

        if ($itemsPerPage > 0) {
            $logQuery->setMaxResults((int) $itemsPerPage);
        }

        return $logQuery->execute()->fetchAll();
    }
984
985
    /**
     * Inserts a call back entry into the tx_crawler_queue table (called from hooks typically, see indexed search class "class.crawler.php")
     *
     * @param integer $setId Set ID
     * @param array $params Parameters to pass to call back function; a non-array value is replaced by an empty array
     * @param string $callBack Call back object reference, eg. 'EXT:indexed_search/class.crawler.php:&tx_indexedsearch_crawler'
     * @param integer $page_id Page ID to attach it to
     * @param integer $schedule Time at which to activate; 0 means the current time is used
     */
    public function addQueueEntry_callBack($setId, $params, $callBack, $page_id = 0, $schedule = 0): void
    {
        $callBackParameters = is_array($params) ? $params : [];
        $callBackParameters['_CALLBACKOBJ'] = $callBack;

        // Fall back to "now" when no explicit schedule time was given.
        $scheduledTime = (int) $schedule;
        if (! $scheduledTime) {
            $scheduledTime = $this->getCurrentTime();
        }

        $queueRecord = [
            'page_id' => (int) $page_id,
            'parameters' => serialize($callBackParameters),
            'scheduled' => $scheduledTime,
            'exec_time' => 0,
            'set_id' => (int) $setId,
            'result_data' => '',
        ];

        $queueConnection = GeneralUtility::makeInstance(ConnectionPool::class)->getConnectionForTable('tx_crawler_queue');
        $queueConnection->insert('tx_crawler_queue', $queueRecord);
    }
1014
1015
    /************************************
1016
     *
1017
     * URL setting
1018
     *
1019
     ************************************/
1020
1021
    /**
     * Registers a URL for crawling.
     *
     * When $this->registerQueueEntriesInternallyOnly is set, the entry is only appended to
     * $this->queueEntries. Otherwise it is inserted into tx_crawler_queue unless the duplication
     * check finds an existing row; a signal is emitted for both outcomes.
     *
     * @param integer $id Page ID
     * @param string $url Complete URL
     * @param array $subCfg Sub configuration array (from TS config); the keys "userGroups", "procInstrFilter", "procInstrParams." and "key" are read and may be absent
     * @param integer $tstamp Scheduled-time
     * @param string $configurationHash (optional) configuration hash
     * @param bool $skipInnerDuplicationCheck (optional) skip inner duplication check
     * @return bool TRUE if a new row was inserted into the queue table, FALSE otherwise
     */
    public function addUrl(
        $id,
        $url,
        array $subCfg,
        $tstamp,
        $configurationHash = '',
        $skipInnerDuplicationCheck = false
    ) {
        $urlAdded = false;
        $rows = [];

        // Creating parameters:
        $parameters = [
            'url' => $url,
        ];

        // fe user group simulation (optional configuration key):
        $uGs = implode(',', array_unique(GeneralUtility::intExplode(',', $subCfg['userGroups'] ?? '', true)));
        if ($uGs) {
            $parameters['feUserGroupList'] = $uGs;
        }

        // Setting processing instructions (both keys are optional):
        $parameters['procInstructions'] = GeneralUtility::trimExplode(',', $subCfg['procInstrFilter'] ?? '');
        $procInstrParams = $subCfg['procInstrParams.'] ?? null;
        if (is_array($procInstrParams)) {
            $parameters['procInstrParams'] = $procInstrParams;
        }

        // Compile value array:
        $parameters_serialized = serialize($parameters);
        $fieldArray = [
            'page_id' => (int) $id,
            'parameters' => $parameters_serialized,
            'parameters_hash' => GeneralUtility::shortMD5($parameters_serialized),
            'configuration_hash' => $configurationHash,
            'scheduled' => $tstamp,
            'exec_time' => 0,
            'set_id' => (int) $this->setID,
            'result_data' => '',
            'configuration' => $subCfg['key'] ?? null,
        ];

        if ($this->registerQueueEntriesInternallyOnly) {
            // The entries will only be registered and not stored to the database
            $this->queueEntries[] = $fieldArray;

            return $urlAdded;
        }

        if (! $skipInnerDuplicationCheck) {
            // Check if there is already an equal entry
            $rows = $this->getDuplicateRowsIfExist($tstamp, $fieldArray);
        }

        if (empty($rows)) {
            $connectionForCrawlerQueue = GeneralUtility::makeInstance(ConnectionPool::class)->getConnectionForTable('tx_crawler_queue');
            $connectionForCrawlerQueue->insert(
                'tx_crawler_queue',
                $fieldArray
            );
            $uid = $connectionForCrawlerQueue->lastInsertId('tx_crawler_queue', 'qid');
            $urlAdded = true;

            $signalPayload = ['uid' => $uid, 'fieldArray' => $fieldArray];
            SignalSlotUtility::emitSignal(
                self::class,
                SignalSlotUtility::SIGNAL_URL_ADDED_TO_QUEUE,
                $signalPayload
            );
        } else {
            $signalPayload = ['rows' => $rows, 'fieldArray' => $fieldArray];
            SignalSlotUtility::emitSignal(
                self::class,
                SignalSlotUtility::SIGNAL_DUPLICATE_URL_IN_QUEUE,
                $signalPayload
            );
        }

        return $urlAdded;
    }
1111
1112
    /**
     * Provides the current Unix timestamp as reported by time().
     *
     * @return int
     */
    public function getCurrentTime()
    {
        $currentTimestamp = time();

        return $currentTimestamp;
    }
1121
1122
    /************************************
1123
     *
1124
     * URL reading
1125
     *
1126
     ************************************/
1127
1128
    /**
     * Read URL for single queue entry
     *
     * Loads the queue record, marks it as executed (exec_time), lets the queue executor process it
     * and stores the serialized result in result_data. Signals are emitted before and after processing.
     *
     * @param integer $queueId
     * @param boolean $force If set, will process even if exec_time has been set!
     * @return integer 0, or self::CLI_STATUS_POLLABLE_PROCESSED when a registered pollable instruction reported success; 0 is also returned when no matching queue record exists
     */
    public function readUrl($queueId, $force = false)
    {
        $queryBuilder = GeneralUtility::makeInstance(ConnectionPool::class)->getQueryBuilderForTable($this->tableName);
        $ret = 0;
        $this->logger->debug('crawler-readurl start ' . microtime(true));

        // Get entry:
        $queryBuilder
            ->select('*')
            ->from('tx_crawler_queue')
            ->where(
                $queryBuilder->expr()->eq('qid', $queryBuilder->createNamedParameter($queueId, \PDO::PARAM_INT))
            );
        if (! $force) {
            $queryBuilder
                ->andWhere('exec_time = 0')
                ->andWhere('process_scheduled > 0');
        }
        $queueRec = $queryBuilder->execute()->fetch();

        if (! is_array($queueRec)) {
            // Nothing to process; return the documented integer instead of NULL.
            return $ret;
        }

        SignalSlotUtility::emitSignal(
            self::class,
            SignalSlotUtility::SIGNAL_QUEUEITEM_PREPROCESS,
            [$queueId, &$queueRec]
        );

        // Set exec_time to lock record:
        $field_array = ['exec_time' => $this->getCurrentTime()];

        if (isset($this->processID)) {
            // If multiprocessing is used we need to store the id of the process which has handled this entry
            $field_array['process_id_completed'] = $this->processID;
        }

        GeneralUtility::makeInstance(ConnectionPool::class)->getConnectionForTable('tx_crawler_queue')
            ->update(
                'tx_crawler_queue',
                $field_array,
                ['qid' => (int) $queueId]
            );

        $result = $this->queueExecutor->executeQueueItem($queueRec, $this);

        // NOTE(review): unserialize() is applied to content handed back by the queue executor.
        // If that content can originate from an untrusted response this is unsafe - confirm.
        $content = $result['content'] ?? null;
        $resultData = $content === null ? null : unserialize($content);

        // Only a successfully unserialized array can carry processing instructions.
        $procInstructions = is_array($resultData) ? ($resultData['parameters']['procInstructions'] ?? null) : null;

        // Atm there's no need to point to specific pollable extensions
        $pollSuccess = $GLOBALS['TYPO3_CONF_VARS']['EXTCONF']['crawler']['pollSuccess'] ?? null;
        if (is_array($pollSuccess) && is_array($procInstructions)) {
            foreach ($pollSuccess as $pollable) {
                // Only check the success value if the instruction is running;
                // it is important to name the pollSuccess key same as the procInstructions key
                if (in_array($pollable, $procInstructions, true) && ! empty($resultData['success'][$pollable])) {
                    $ret |= self::CLI_STATUS_POLLABLE_PROCESSED;
                }
            }
        }

        // Set result in log which also denotes the end of the processing of this entry.
        $field_array = ['result_data' => serialize($result)];

        SignalSlotUtility::emitSignal(
            self::class,
            SignalSlotUtility::SIGNAL_QUEUEITEM_POSTPROCESS,
            [$queueId, &$field_array]
        );

        GeneralUtility::makeInstance(ConnectionPool::class)->getConnectionForTable('tx_crawler_queue')
            ->update(
                'tx_crawler_queue',
                $field_array,
                ['qid' => (int) $queueId]
            );

        $this->logger->debug('crawler-readurl stop ' . microtime(true));

        return $ret;
    }
1223
1224
    /**
     * Inserts a not-yet-stored queue entry, executes it right away and stores its result.
     *
     * The entry is written with exec_time set to the current time, processed by the queue
     * executor, and then updated with the serialized result in result_data.
     *
     * @param array $field_array Queue field array,
     *
     * @return mixed Whatever the queue executor returned for the entry
     */
    public function readUrlFromArray($field_array)
    {
        $queueConnection = GeneralUtility::makeInstance(ConnectionPool::class)->getConnectionForTable($this->tableName);

        // Setting exec_time marks the record as taken before it is even inserted.
        $field_array['exec_time'] = $this->getCurrentTime();
        $queueConnection->insert($this->tableName, $field_array);

        $queueId = $queueConnection->lastInsertId($this->tableName, 'qid');
        $field_array['qid'] = $queueId;

        $result = $this->queueExecutor->executeQueueItem($field_array, $this);

        // Storing the result also denotes the end of the processing of this entry.
        // Slots receive the update fields by reference and may adjust them.
        $field_array = ['result_data' => serialize($result)];
        SignalSlotUtility::emitSignal(
            self::class,
            SignalSlotUtility::SIGNAL_QUEUEITEM_POSTPROCESS,
            [$queueId, &$field_array]
        );

        $queueConnection->update($this->tableName, $field_array, ['qid' => $queueId]);

        return $result;
    }
1261
1262
    /*****************************
1263
     *
1264
     * Compiling URLs to crawl - tools
1265
     *
1266
     *****************************/
1267
1268
    /**
     * Walks the page tree below a root page and collects the output of drawURLs_addRowsForPage() for every page.
     *
     * The crawl settings passed in are stored on the controller first, and the duplicate tracking
     * and download URL lists are reset. Mount point pages additionally contribute the pages of
     * their mounted tree.
     *
     * @param integer $id Root page id to start from.
     * @param integer $depth Depth of tree, 0=only id-page, 1= on sublevel, 99 = infinite
     * @param integer $scheduledTime Unix Time when the URL is timed to be visited when put in queue
     * @param integer $reqMinute Number of requests per minute (creates the interleave between requests)
     * @param boolean $submitCrawlUrls If set, submits the URLs to queue in database (real crawling)
     * @param boolean $downloadCrawlUrls If set (and submitcrawlUrls is false) will fill $downloadUrls with entries)
     * @param array $incomingProcInstructions Array of processing instructions
     * @param array $configurationSelection Array of configuration keys
     * @return string Concatenated results of drawURLs_addRowsForPage() for all visited pages
     */
    public function getPageTreeAndUrls(
        $id,
        $depth,
        $scheduledTime,
        $reqMinute,
        $submitCrawlUrls,
        $downloadCrawlUrls,
        array $incomingProcInstructions,
        array $configurationSelection
    ) {
        // Remember the crawl settings and start with clean tracking state.
        $this->scheduledTime = $scheduledTime;
        $this->reqMinute = $reqMinute;
        $this->submitCrawlUrls = $submitCrawlUrls;
        $this->downloadCrawlUrls = $downloadCrawlUrls;
        $this->incomingProcInstructions = $incomingProcInstructions;
        $this->incomingConfigurationSelection = $configurationSelection;

        $this->duplicateTrack = [];
        $this->downloadUrls = [];

        // Build the tree restricted to pages the backend user may see.
        /* @var PageTreeView $pageTree */
        $pageTree = GeneralUtility::makeInstance(PageTreeView::class);
        $permsClause = $this->getBackendUser()->getPagePermsClause(Permission::PAGE_SHOW);
        $pageTree->init('AND ' . $permsClause);

        // The root page itself becomes the first tree entry.
        $rootPageInfo = BackendUtility::readPageAccess($id, $permsClause);
        if (is_array($rootPageInfo)) {
            $pageTree->tree[] = [
                'row' => $rootPageInfo,
                'HTML' => $this->iconFactory->getIconForRecord('pages', $rootPageInfo, Icon::SIZE_SMALL),
            ];
        }

        // Add the branch beneath the root page.
        if ($depth) {
            $pageTree->getTree($id, $depth, '');
        }

        $rowsHtml = '';

        foreach ($pageTree->tree as $treeItem) {
            $this->MP = false;

            $isMountPoint = $treeItem['row']['doktype'] === PageRepository::DOKTYPE_MOUNTPOINT;
            if ($isMountPoint) {
                // Re-read the mount point page with only the "deleted" restriction applied.
                $mountQuery = GeneralUtility::makeInstance(ConnectionPool::class)->getQueryBuilderForTable('pages');
                $mountQuery->getRestrictions()->removeAll()->add(GeneralUtility::makeInstance(DeletedRestriction::class));
                $mountQuery->select('*');
                $mountQuery->from('pages');
                $mountQuery->where(
                    $mountQuery->expr()->eq('uid', $mountQuery->createNamedParameter($treeItem['row']['uid'], \PDO::PARAM_INT))
                );
                $mountPages = $mountQuery->execute()->fetchAll();
                $mountQuery->resetRestrictions();

                // MP value is "<mounted pid>-<mount point uid>" while the mounted pages are processed.
                $this->MP = $mountPages[0]['mount_pid'] . '-' . $treeItem['row']['uid'];

                $mountedTree = GeneralUtility::makeInstance(PageTreeView::class);
                $mountedTree->init('AND ' . $permsClause);
                $mountedTree->getTree($mountPages[0]['mount_pid'], $depth);

                foreach ($mountedTree->tree as $mountedItem) {
                    $mountedTitle = $mountedItem['HTML'] . BackendUtility::getRecordTitle('pages', $mountedItem['row'], true);
                    $rowsHtml .= $this->drawURLs_addRowsForPage($mountedItem['row'], $mountedTitle);
                }

                if (! $mountPages[0]['mount_pid_ol']) {
                    // Without mount_pid_ol the MP must not be used for the mount point page itself.
                    $this->MP = false;
                } else {
                    // With mount_pid_ol the mount point page is replaced by the mounted page.
                    $treeItem['row']['uid'] = $mountPages[0]['mount_pid'];
                }
            }

            $pageTitle = $treeItem['HTML'] . BackendUtility::getRecordTitle('pages', $treeItem['row'], true);
            $rowsHtml .= $this->drawURLs_addRowsForPage($treeItem['row'], $pageTitle);
        }

        return $rowsHtml;
    }
1370
1371
    /**
     * Expands exclude string
     *
     * The string is a comma-separated list of parts of the form "pid" or "pid+depth". Every pid is
     * added to the result; when a depth greater than 0 applies, the uids of the page tree below that
     * pid are added as well. A trailing "+" without depth is treated as depth 99.
     * Non-empty results and the page trees are cached in static variables for the rest of the request.
     *
     * @param string $excludeString Exclude string
     * @return array Unique list of page ids
     */
    public function expandExcludeString($excludeString)
    {
        // internal static caches;
        static $expandedExcludeStringCache;
        static $treeCache;

        if (empty($expandedExcludeStringCache[$excludeString])) {
            $pidList = [];

            if (! empty($excludeString)) {
                /** @var PageTreeView $tree */
                $tree = GeneralUtility::makeInstance(PageTreeView::class);
                $tree->init('AND ' . $this->getBackendUser()->getPagePermsClause(Permission::PAGE_SHOW));

                $excludeParts = GeneralUtility::trimExplode(',', $excludeString);

                foreach ($excludeParts as $excludePart) {
                    // A part without "+" has no depth element; pad so destructuring does not
                    // read an undefined offset.
                    [$pid, $depth] = array_pad(GeneralUtility::trimExplode('+', $excludePart), 2, null);

                    // default is "page only" = "depth=0"
                    if (empty($depth)) {
                        $depth = strpos($excludePart, '+') !== false ? 99 : 0;
                    }

                    $pidList[] = $pid;

                    if ($depth > 0) {
                        if (empty($treeCache[$pid][$depth])) {
                            $tree->reset();
                            $tree->getTree($pid, $depth);
                            $treeCache[$pid][$depth] = $tree->tree;
                        }

                        foreach ($treeCache[$pid][$depth] as $data) {
                            $pidList[] = $data['row']['uid'];
                        }
                    }
                }
            }

            $expandedExcludeStringCache[$excludeString] = array_unique($pidList);
        }

        return $expandedExcludeStringCache[$excludeString];
    }
1422
1423
    /**
     * Create the rows for display of the page tree
     * For each page a number of rows are shown displaying GET variable configuration
     *
     * @param array $pageRow Page record; its "uid" is checked against each configuration's exclude list
     * @param string $pageTitle Ready-made HTML for the title cell (inserted without escaping)
     * @return string HTML table rows for the page: one row per configuration, or a single "No entries" row
     */
    public function drawURLs_addRowsForPage(array $pageRow, string $pageTitle): string
    {
        $skipMessage = '';

        // Get list of configurations
        $configurations = $this->getUrlsForPageRow($pageRow, $skipMessage);

        if (! empty($this->incomingConfigurationSelection)) {
            // remove configuration that does not match the current selection
            foreach ($configurations as $confKey => $confArray) {
                if (! in_array($confKey, $this->incomingConfigurationSelection, true)) {
                    unset($configurations[$confKey]);
                }
            }
        }

        if (empty($configurations)) {
            $message = ! empty($skipMessage) ? ' (' . $skipMessage . ')' : '';

            // Compile row:
            return '
                <tr>
                    <td>' . $pageTitle . '</td>
                    <td colspan="6"><em>No entries</em>' . $message . '</td>
                </tr>';
        }

        // Normalised once: the strict in_array() below must compare integers with integers
        $pageUid = (int) $pageRow['uid'];

        // Traverse parameter combinations:
        $c = 0;
        $content = '';
        foreach ($configurations as $confKey => $confArray) {
            $subCfg = $confArray['subCfg'] ?? [];

            // Title column (only on the first row, spanning all configuration rows):
            if (! $c) {
                $titleClm = '<td rowspan="' . count($configurations) . '">' . $pageTitle . '</td>';
            } else {
                $titleClm = '';
            }

            // The expanded list may contain numeric strings as well as integers
            $excludedPageUids = array_map('intval', $this->expandExcludeString($subCfg['exclude'] ?? ''));

            if (! in_array($pageUid, $excludedPageUids, true)) {
                // URL list:
                $urlList = $this->urlListFromUrlArray(
                    $confArray,
                    $pageRow,
                    $this->scheduledTime,
                    $this->reqMinute,
                    $this->submitCrawlUrls,
                    $this->downloadCrawlUrls,
                    $this->duplicateTrack,
                    $this->downloadUrls,
                    $this->incomingProcInstructions // if empty the urls won't be filtered by processing instructions
                );

                // Expanded parameters:
                $paramExpanded = '';
                $calcAccu = [];
                $calcRes = 1;
                foreach ($confArray['paramExpanded'] ?? [] as $gVar => $gVal) {
                    $paramExpanded .= '
                            <tr>
                                <td>' . htmlspecialchars('&' . $gVar . '=') . '<br/>' .
                            '(' . count($gVal) . ')' .
                            '</td>
                                <td nowrap="nowrap">' . nl2br(htmlspecialchars(implode(chr(10), $gVal))) . '</td>
                            </tr>
                        ';
                    $calcRes *= count($gVal);
                    $calcAccu[] = count($gVal);
                }
                $paramExpanded = '<table>' . $paramExpanded . '</table>';
                $paramExpanded .= 'Comb: ' . implode('*', $calcAccu) . '=' . $calcRes;

                // Options (both keys are optional in the sub configuration)
                $optionValues = '';
                if (! empty($subCfg['userGroups'])) {
                    $optionValues .= 'User Groups: ' . $subCfg['userGroups'] . '<br/>';
                }
                if (! empty($subCfg['procInstrFilter'])) {
                    $optionValues .= 'ProcInstr: ' . $subCfg['procInstrFilter'] . '<br/>';
                }

                // Compile row:
                $content .= '
                        <tr>
                            ' . $titleClm . '
                            <td>' . htmlspecialchars($confKey) . '</td>
                            <td>' . nl2br(htmlspecialchars(rawurldecode(trim(str_replace('&', chr(10) . '&', GeneralUtility::implodeArrayForUrl('', $confArray['paramParsed'] ?? [])))))) . '</td>
                            <td>' . $paramExpanded . '</td>
                            <td nowrap="nowrap">' . $urlList . '</td>
                            <td nowrap="nowrap">' . $optionValues . '</td>
                            <td nowrap="nowrap">' . DebugUtility::viewArray($subCfg['procInstrParams.'] ?? null) . '</td>
                        </tr>';
            } else {
                $content .= '<tr>
                            ' . $titleClm . '
                            <td>' . htmlspecialchars($confKey) . '</td>
                            <td colspan="5"><em>No entries</em> (Page is excluded in this configuration)</td>
                        </tr>';
            }

            $c++;
        }

        return $content;
    }
1533
1534
    /*****************************
1535
     *
1536
     * CLI functions
1537
     *
1538
     *****************************/
1539
1540
    /**
     * Running the functionality of the CLI (crawling URLs from queue)
     *
     * Runs the CLI hooks, cleans up the queue, fetches up to $countInARun queue
     * rows, assigns them to the current process id and reads their URLs one by one.
     *
     * @param int $countInARun Maximum number of queue entries to fetch for this run
     * @param int $sleepTime Microseconds to sleep after each processed entry
     * @param int $sleepAfterFinish Seconds to sleep once the batch has been handled
     * @return int Bit mask built from the readUrl() results and the CLI_STATUS_* constants
     */
    public function CLI_run(int $countInARun, int $sleepTime, int $sleepAfterFinish): int
    {
        $status = 0;
        $processedCount = 0;

        // First, run hooks:
        $this->CLI_runHooks();

        // Clean up the queue
        $this->queueRepository->cleanupQueue();

        // Select entries:
        $queueRows = $this->queueRepository->fetchRecordsToBeCrawled($countInARun);

        if (empty($queueRows)) {
            $this->CLI_debug('Nothing within queue which needs to be processed (' . $this->CLI_buildProcessId() . ')');
            return $status;
        }

        $queueIds = [];
        foreach ($queueRows as $queueRow) {
            $queueIds[] = $queueRow['qid'];
        }

        $processId = $this->CLI_buildProcessId();

        // save the number of assigned queue entries to determine how many have been processed later
        $assignedCount = $this->queueRepository->updateProcessIdAndSchedulerForQueueIds($queueIds, $processId);
        $this->processRepository->updateProcessAssignItemsCount($assignedCount, $processId);

        // Fewer (or more) rows assigned than requested: another process got in between
        if ($assignedCount !== count($queueIds)) {
            $this->CLI_debug('Nothing processed due to multi-process collision (' . $this->CLI_buildProcessId() . ')');
            return $status | self::CLI_STATUS_ABORTED;
        }

        foreach ($queueRows as $queueRow) {
            $status |= $this->readUrl($queueRow['qid']);

            $processedCount++;
            usleep($sleepTime); // Just to relax the system

            // if the CLI has been disabled between the start and the current read we need to return
            // from the function and NOT mark the process as ended.
            if ($this->getDisabled()) {
                return $status | self::CLI_STATUS_ABORTED;
            }

            if (! $this->processRepository->isProcessActive($this->CLI_buildProcessId())) {
                $this->CLI_debug('conflict / timeout (' . $this->CLI_buildProcessId() . ')');
                $status |= self::CLI_STATUS_ABORTED;
                break; // possible timeout
            }
        }

        sleep($sleepAfterFinish);

        $this->CLI_debug('Rows: ' . $processedCount . ' (' . $this->CLI_buildProcessId() . ')');

        if ($processedCount > 0) {
            $status |= self::CLI_STATUS_PROCESSED;
        }

        return $status;
    }
1608
1609
    /**
     * Activate hooks
     *
     * Instantiates every class registered under
     * $GLOBALS['TYPO3_CONF_VARS']['EXTCONF']['crawler']['cli_hooks'] and calls
     * crawler_init() on it, passing this controller.
     */
    public function CLI_runHooks(): void
    {
        $registeredHooks = $GLOBALS['TYPO3_CONF_VARS']['EXTCONF']['crawler']['cli_hooks'] ?? [];

        foreach ($registeredHooks as $hookClassName) {
            $hook = GeneralUtility::makeInstance($hookClassName);
            if (! is_object($hook)) {
                continue;
            }
            $hook->crawler_init($this);
        }
    }
1621
1622
    /**
     * Try to acquire a new process with the given id
     * also performs some auto-cleanup for orphan processes
     *
     * Active, non-deleted rows of tx_crawler_process whose ttl has passed are
     * treated as orphans and released; the remaining ones count against the
     * configured processLimit.
     *
     * @param string $id identification string for the process
     * @return boolean false if the system process id cannot be determined or the process limit is reached, true otherwise
     * @todo preemption might not be the most elegant way to clean up
     */
    public function CLI_checkAndAcquireNewProcess($id)
    {
        $queryBuilder = GeneralUtility::makeInstance(ConnectionPool::class)->getQueryBuilderForTable($this->tableName);

        $systemProcessId = getmypid();
        if ($systemProcessId < 1) {
            return false;
        }

        $statement = $queryBuilder
            ->select('process_id', 'ttl')
            ->from('tx_crawler_process')
            ->where(
                'active = 1 AND deleted = 0'
            )
            ->execute();

        $currentTime = $this->getCurrentTime();

        // Split the active processes into expired ones (orphans) and ones still alive
        $activeProcessCount = 0;
        $orphanProcesses = [];
        while ($processRow = $statement->fetch()) {
            if ($processRow['ttl'] < $currentTime) {
                $orphanProcesses[] = $processRow['process_id'];
                continue;
            }
            $activeProcessCount++;
        }

        // if there are less than allowed active processes then add a new one
        $acquired = $activeProcessCount < (int) $this->extensionSettings['processLimit'];
        if ($acquired) {
            $this->CLI_debug('add process ' . $this->CLI_buildProcessId() . ' (' . ($activeProcessCount + 1) . '/' . (int) $this->extensionSettings['processLimit'] . ')');

            GeneralUtility::makeInstance(ConnectionPool::class)->getConnectionForTable('tx_crawler_process')->insert(
                'tx_crawler_process',
                [
                    'process_id' => $id,
                    'active' => 1,
                    'ttl' => $currentTime + (int) $this->extensionSettings['processMaxRunTime'],
                    'system_process_id' => $systemProcessId,
                ]
            );
        } else {
            $this->CLI_debug('Processlimit reached (' . ($activeProcessCount) . '/' . (int) $this->extensionSettings['processLimit'] . ')');
        }

        $this->processRepository->deleteProcessesMarkedAsDeleted();
        $this->CLI_releaseProcesses($orphanProcesses);

        return $acquired;
    }
1683
1684
    /**
     * Release a process and the required resources
     *
     * @param mixed $releaseIds string with a single process-id or array with multiple process-ids
     * @return boolean false if there was nothing to release, true otherwise
     */
    public function CLI_releaseProcesses($releaseIds)
    {
        $processIds = is_array($releaseIds) ? $releaseIds : [$releaseIds];

        if (empty($processIds)) {
            // nothing to release
            return false;
        }

        $queryBuilder = GeneralUtility::makeInstance(ConnectionPool::class)->getQueryBuilderForTable($this->tableName);

        // some kind of 2nd chance algo - this way you need at least 2 processes to have a real cleanup
        // this ensures that a single process can't mess up the entire process table

        // mark all processes as deleted which have no "waiting" queue-entries and which are not active

        // Step 1: detach queue entries from processes that are no longer active
        $queryBuilder
            ->update($this->tableName, 'q')
            ->where(
                'q.process_id IN(SELECT p.process_id FROM tx_crawler_process as p WHERE p.active = 0)'
            )
            ->set('q.process_scheduled', 0)
            ->set('q.process_id', '')
            ->execute();

        // FIXME: Not entirely sure that this is equivalent to the previous version
        // The builder is reused below, so the SET part of step 1 has to be dropped first
        $queryBuilder->resetQueryPart('set');

        // Step 2: reset the system process id of inactive processes that still own unexecuted queue entries
        $inactiveProcess = $queryBuilder->expr()->eq('active', 0);
        $queryBuilder
            ->update('tx_crawler_process')
            ->where(
                $inactiveProcess,
                'process_id IN(SELECT q.process_id FROM tx_crawler_queue as q WHERE q.exec_time = 0)'
            )
            ->set('system_process_id', 0)
            ->execute();

        // Step 3: release the requested processes themselves
        $this->processRepository->markRequestedProcessesAsNotActive($processIds);
        $this->queueRepository->unsetProcessScheduledAndProcessIdForQueueEntries($processIds);

        return true;
    }
1733
1734
    /**
     * Returns the id of the current process, creating it on first use.
     *
     * The id is generated once via GeneralUtility::shortMD5() from the current
     * microtime and then kept in $this->processID for all later calls.
     *
     * @return string  the ID
     */
    public function CLI_buildProcessId()
    {
        if ($this->processID) {
            return $this->processID;
        }

        $this->processID = GeneralUtility::shortMD5(microtime(true));

        return $this->processID;
    }
1746
1747
    /**
     * Prints a message to the stdout (only if debug-mode is enabled)
     *
     * Debug mode is the "processDebug" extension setting; the message is
     * followed by a newline and the output buffer is flushed.
     *
     * @param string $msg the message
     */
    public function CLI_debug($msg): void
    {
        $debugEnabled = (int) $this->extensionSettings['processDebug'];
        if (! $debugEnabled) {
            return;
        }

        echo $msg . "\n";
        flush();
    }
1759
1760
    /**
     * Cleans up entries that stayed for too long in the queue. These are:
     * - processed entries older than the "cleanUpProcessedAge" extension setting (in days)
     * - scheduled entries older than the "cleanUpScheduledAge" extension setting (in days)
     *
     * @deprecated
     */
    public function cleanUpOldQueueEntries(): void
    {
        $secondsPerDay = 86400; // 24*60*60 Seconds in 24 hours
        $processedAge = $this->extensionSettings['cleanUpProcessedAge'] * $secondsPerDay;
        $scheduledAge = $this->extensionSettings['cleanUpScheduledAge'] * $secondsPerDay;

        $now = time();
        $processedBefore = $now - $processedAge;
        $scheduledBefore = $now - $scheduledAge;

        $condition = '(exec_time<>0 AND exec_time<' . $processedBefore . ') OR scheduled<=' . $scheduledBefore;

        /** @scrutinizer ignore-deprecated */
        $this->flushQueue($condition);
    }
1776
1777
    /**
     * Removes queue entries
     *
     * Before deleting, the matching entries are grouped by set_id and one
     * SIGNAL_QUEUE_ENTRY_FLUSH signal is emitted per group, carrying the
     * group's rows (qid, set_id) as "subSet".
     *
     * @param string $where SQL related filter for the entries which should be removed;
     *                      inserted verbatim into the queries, an empty value matches all entries
     *
     * @deprecated
     */
    protected function flushQueue($where = ''): void
    {
        $realWhere = strlen((string) $where) > 0 ? $where : '1=1';

        // A fresh builder per statement: query parts and parameters of one
        // statement must not leak into the next one
        $groups = $this->getQueryBuilder($this->tableName)
            ->selectLiteral('DISTINCT set_id')
            ->from($this->tableName)
            ->where($realWhere)
            ->execute()
            ->fetchAll();

        foreach ($groups as $group) {
            $subSetQueryBuilder = $this->getQueryBuilder($this->tableName);
            $subSet = $subSetQueryBuilder
                ->select('qid', 'set_id')
                ->from($this->tableName)
                ->where(
                    $realWhere,
                    $subSetQueryBuilder->expr()->eq(
                        'set_id',
                        // bound as parameter instead of being concatenated into the SQL
                        $subSetQueryBuilder->createNamedParameter((int) $group['set_id'], \PDO::PARAM_INT)
                    )
                )
                ->execute()
                ->fetchAll();

            $payLoad = ['subSet' => $subSet];
            SignalSlotUtility::emitSignal(
                self::class,
                SignalSlotUtility::SIGNAL_QUEUE_ENTRY_FLUSH,
                $payLoad
            );
        }

        $this->getQueryBuilder($this->tableName)
            ->delete($this->tableName)
            ->where($realWhere)
            ->execute();
    }
1822
1823
    /**
     * This method determines duplicates for a queue entry with the same parameters and this timestamp.
     * If the timestamp is not in the future, it matches queue entries scheduled up to now
     * (plus a +/- 100 second window around now when the "enableTimeslot" setting is on).
     * If the timestamp is in the future, the queued entry must have exactly the same timestamp.
     * Only entries matching "NOT exec_time" and "NOT process_id" with the same page_id and
     * parameters_hash are considered.
     *
     * @param int $tstamp
     * @param array $fieldArray Must provide "page_id" and "parameters_hash"
     *
     * @return array List of qid values of the duplicate queue entries
     */
    protected function getDuplicateRowsIfExist($tstamp, $fieldArray)
    {
        $now = $this->getCurrentTime();

        $queryBuilder = GeneralUtility::makeInstance(ConnectionPool::class)->getQueryBuilderForTable($this->tableName);
        $expr = $queryBuilder->expr();

        $queryBuilder
            ->select('qid')
            ->from('tx_crawler_queue');

        if ($tstamp <= $now) {
            // if this entry is scheduled with "now"
            if ($this->extensionSettings['enableTimeslot']) {
                $windowStart = $now - 100;
                $windowEnd = $now + 100;
                $queryBuilder
                    ->where('scheduled BETWEEN ' . $windowStart . ' AND ' . $windowEnd)
                    ->orWhere($expr->lte('scheduled', $now));
            } else {
                $queryBuilder->where($expr->lte('scheduled', $now));
            }
        } elseif ($tstamp > $now) {
            // entry with a timestamp in the future need to have the same schedule time
            $queryBuilder->where($expr->eq('scheduled', $tstamp));
        }

        $samePage = $expr->eq(
            'page_id',
            $queryBuilder->createNamedParameter($fieldArray['page_id'], \PDO::PARAM_INT)
        );
        $sameParameters = $expr->eq(
            'parameters_hash',
            $queryBuilder->createNamedParameter($fieldArray['parameters_hash'], \PDO::PARAM_STR)
        );

        $queryBuilder
            ->andWhere('NOT exec_time')
            ->andWhere('NOT process_id')
            ->andWhere($samePage)
            ->andWhere($sameParameters);

        return array_column($queryBuilder->execute()->fetchAll(), 'qid');
    }
1883
1884
    /**
     * Returns a md5 hash generated from a serialized configuration array.
     *
     * The "paramExpanded" and "URLs" entries are removed before hashing, so
     * they do not influence the result.
     *
     * @return string
     */
    protected function getConfigurationHash(array $configuration)
    {
        unset($configuration['paramExpanded'], $configuration['URLs']);

        $serializedConfiguration = serialize($configuration);

        return md5($serializedConfiguration);
    }
1895
1896
    /**
     * Build a URL from a Page and the Query String. If the page has a Site configuration, it can be built by using
     * the Site instance.
     *
     * With a Site, the "id" query parameter is dropped, "L" is passed on as "_language" and the
     * URI is generated by the site router; host, scheme, port and (if present) user info are then
     * taken from $alternativeBaseUrl when it is given. Without a Site, the URL is assembled as
     * "<base>/index.php<queryString>&cHash=...".
     *
     * @param int $pageId Page to build the URL for
     * @param string $queryString Query string; appended as-is in the fallback branch
     * @param string|null $alternativeBaseUrl Optional base URL overriding the generated one
     * @param int $httpsOrHttp see tx_crawler_configuration.force_ssl; -1 forces "http", 1 forces "https",
     *                         any other value leaves the scheme untouched
     * @throws \TYPO3\CMS\Core\Exception\SiteNotFoundException
     * @throws \TYPO3\CMS\Core\Routing\InvalidRouteArgumentsException
     */
    protected function getUrlFromPageAndQueryParameters(int $pageId, string $queryString, ?string $alternativeBaseUrl, int $httpsOrHttp): UriInterface
    {
        $site = GeneralUtility::makeInstance(SiteMatcher::class)->matchByPageId($pageId);
        if ($site instanceof Site) {
            $queryString = ltrim($queryString, '?&');
            $queryParts = [];
            parse_str($queryString, $queryParts);
            unset($queryParts['id']);
            // workaround as long as we don't have native language support in crawler configurations
            if (isset($queryParts['L'])) {
                $queryParts['_language'] = $queryParts['L'];
                unset($queryParts['L']);
                // NOTE(review): the returned language was never used; the lookup itself is kept
                // because it presumably rejects unknown language ids - confirm against Site::getLanguageById()
                $site->getLanguageById((int) $queryParts['_language']);
            }
            $url = $site->getRouter()->generateUri($pageId, $queryParts);
            if (! empty($alternativeBaseUrl)) {
                // Separate variable: the parameter is declared ?string and must not turn into an object
                $alternativeUri = new Uri($alternativeBaseUrl);
                $url = $url->withHost($alternativeUri->getHost());
                $url = $url->withScheme($alternativeUri->getScheme());
                $url = $url->withPort($alternativeUri->getPort());
                if ($userInfo = $alternativeUri->getUserInfo()) {
                    $url = $url->withUserInfo($userInfo);
                }
            }
        } else {
            // Technically this is not possible with site handling, but kept for backwards-compatibility reasons
            // Once EXT:crawler is v10-only compatible, this should be removed completely
            $baseUrl = ($alternativeBaseUrl ?: GeneralUtility::getIndpEnv('TYPO3_SITE_URL'));
            $cacheHashCalculator = GeneralUtility::makeInstance(CacheHashCalculator::class);
            $queryString .= '&cHash=' . $cacheHashCalculator->generateForParameters($queryString);
            $url = rtrim($baseUrl, '/') . '/index.php' . $queryString;
            $url = new Uri($url);
        }

        if ($httpsOrHttp === -1) {
            $url = $url->withScheme('http');
        } elseif ($httpsOrHttp === 1) {
            $url = $url->withScheme('https');
        }

        return $url;
    }
1948
1949 1
    /**
     * Makes sure that element 1 of the given array is not larger than element 2.
     *
     * @param array $reg Array whose entries at index 1 and 2 are compared
     * @return array The array with entries 1 and 2 exchanged if the first was larger, otherwise unchanged
     */
    protected function swapIfFirstIsLargerThanSecond(array $reg): array
    {
        // Already in order: nothing to do
        if (! ($reg[1] > $reg[2])) {
            return $reg;
        }

        // Swap if first is larger than last:
        [$reg[1], $reg[2]] = [$reg[2], $reg[1]];

        return $reg;
    }
1960
1961
    /**
     * Returns the backend user, taking it from $GLOBALS['BE_USER'] on first use.
     *
     * Backend authentication is initialised on every call before the user is returned.
     *
     * @return BackendUserAuthentication
     */
    private function getBackendUser()
    {
        // Make sure the _cli_ user is loaded
        Bootstrap::initializeBackendAuthentication();

        if ($this->backendUser !== null) {
            return $this->backendUser;
        }

        $this->backendUser = $GLOBALS['BE_USER'];

        return $this->backendUser;
    }
1973
1974
    /**
     * Get querybuilder for given table
     *
     * @param string $table Name of the table the query builder is requested for
     * @return \TYPO3\CMS\Core\Database\Query\QueryBuilder
     */
    private function getQueryBuilder(string $table)
    {
        $connectionPool = GeneralUtility::makeInstance(ConnectionPool::class);

        return $connectionPool->getQueryBuilderForTable($table);
    }
1983
}
1984