Passed
Push — refactor/crawlerController ( 30d084...b87fbb )
by Tomas Norre
07:12
created

CrawlerController::urlListFromUrlArray()   B

Complexity

Conditions 8
Paths 8

Size

Total Lines 68
Code Lines 39

Duplication

Lines 0
Ratio 0 %

Code Coverage

Tests 34
CRAP Score 8.1348

Importance

Changes 1
Bugs 0 Features 0
Metric Value
cc 8
eloc 39
c 1
b 0
f 0
nc 8
nop 9
dl 0
loc 68
ccs 34
cts 39
cp 0.8718
crap 8.1348
rs 8.0515

How to fix   Long Method    Many Parameters   

Long Method

Small methods make your code easier to understand, in particular if combined with a good name. Besides, if your method is small, finding a good name is usually much easier.

For example, if you find yourself adding comments to a method's body, this is usually a good sign to extract the commented part to a new method, and use the comment as a starting point when coming up with a good name for this new method.

Commonly applied refactorings include: Extract Method.

Many Parameters

Methods with many parameters are not only hard to understand, but their parameters also often become inconsistent when you need more, or different data.

There are several approaches to avoid long parameter lists: Replace Parameter with Method Call, Preserve Whole Object, and Introduce Parameter Object.

1
<?php
2
3
declare(strict_types=1);
4
5
namespace AOE\Crawler\Controller;
6
7
/***************************************************************
8
 *  Copyright notice
9
 *
10
 *  (c) 2020 AOE GmbH <[email protected]>
11
 *
12
 *  All rights reserved
13
 *
14
 *  This script is part of the TYPO3 project. The TYPO3 project is
15
 *  free software; you can redistribute it and/or modify
16
 *  it under the terms of the GNU General Public License as published by
17
 *  the Free Software Foundation; either version 3 of the License, or
18
 *  (at your option) any later version.
19
 *
20
 *  The GNU General Public License can be found at
21
 *  http://www.gnu.org/copyleft/gpl.html.
22
 *
23
 *  This script is distributed in the hope that it will be useful,
24
 *  but WITHOUT ANY WARRANTY; without even the implied warranty of
25
 *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
26
 *  GNU General Public License for more details.
27
 *
28
 *  This copyright notice MUST APPEAR in all copies of the script!
29
 ***************************************************************/
30
31
use AOE\Crawler\Configuration\ExtensionConfigurationProvider;
32
use AOE\Crawler\Converter\JsonCompatibilityConverter;
33
use AOE\Crawler\CrawlStrategy\CrawlStrategyFactory;
34
use AOE\Crawler\Domain\Repository\ConfigurationRepository;
35
use AOE\Crawler\Domain\Repository\ProcessRepository;
36
use AOE\Crawler\Domain\Repository\QueueRepository;
37
use AOE\Crawler\QueueExecutor;
38
use AOE\Crawler\Service\UrlService;
39
use AOE\Crawler\Utility\SignalSlotUtility;
40
use Psr\Http\Message\UriInterface;
41
use Psr\Log\LoggerAwareInterface;
42
use Psr\Log\LoggerAwareTrait;
43
use TYPO3\CMS\Backend\Tree\View\PageTreeView;
44
use TYPO3\CMS\Backend\Utility\BackendUtility;
45
use TYPO3\CMS\Core\Authentication\BackendUserAuthentication;
46
use TYPO3\CMS\Core\Compatibility\PublicMethodDeprecationTrait;
47
use TYPO3\CMS\Core\Compatibility\PublicPropertyDeprecationTrait;
48
use TYPO3\CMS\Core\Core\Bootstrap;
49
use TYPO3\CMS\Core\Core\Environment;
50
use TYPO3\CMS\Core\Database\Connection;
51
use TYPO3\CMS\Core\Database\ConnectionPool;
52
use TYPO3\CMS\Core\Database\Query\Restriction\DeletedRestriction;
53
use TYPO3\CMS\Core\Imaging\Icon;
54
use TYPO3\CMS\Core\Imaging\IconFactory;
55
use TYPO3\CMS\Core\Site\Entity\Site;
56
use TYPO3\CMS\Core\Type\Bitmask\Permission;
57
use TYPO3\CMS\Core\TypoScript\Parser\TypoScriptParser;
58
use TYPO3\CMS\Core\Utility\DebugUtility;
59
use TYPO3\CMS\Core\Utility\GeneralUtility;
60
use TYPO3\CMS\Core\Utility\MathUtility;
61
use TYPO3\CMS\Extbase\Object\ObjectManager;
62
use TYPO3\CMS\Frontend\Page\PageRepository;
63
64
/**
65
 * Class CrawlerController
66
 *
67
 * @package AOE\Crawler\Controller
68
 */
69
class CrawlerController implements LoggerAwareInterface
70
{
71
    use LoggerAwareTrait;
72
    use PublicMethodDeprecationTrait;
73
    use PublicPropertyDeprecationTrait;
74
75
    public const CLI_STATUS_NOTHING_PROCCESSED = 0;
76
77
    //queue not empty
78
    public const CLI_STATUS_REMAIN = 1;
79
80
    //(some) queue items were processed
81
    public const CLI_STATUS_PROCESSED = 2;
82
83
    //instance didn't finish
84
    public const CLI_STATUS_ABORTED = 4;
85
86
    public const CLI_STATUS_POLLABLE_PROCESSED = 8;
87
88
    /**
89
     * @var integer
90
     */
91
    public $setID = 0;
92
93
    /**
94
     * @var string
95
     */
96
    public $processID = '';
97
98
    /**
99
     * @var array
100
     */
101
    public $duplicateTrack = [];
102
103
    /**
104
     * @var array
105
     */
106
    public $downloadUrls = [];
107
108
    /**
109
     * @var array
110
     */
111
    public $incomingProcInstructions = [];
112
113
    /**
114
     * @var array
115
     */
116
    public $incomingConfigurationSelection = [];
117
118
    /**
119
     * @var bool
120
     */
121
    public $registerQueueEntriesInternallyOnly = false;
122
123
    /**
124
     * @var array
125
     */
126
    public $queueEntries = [];
127
128
    /**
129
     * @var array
130
     */
131
    public $urlList = [];
132
133
    /**
134
     * @var array
135
     */
136
    public $extensionSettings = [];
137
138
    /**
139
     * Mount Point
140
     *
141
     * @var bool
142
     * Todo: Check what this is used for and adjust the type hint or code, as bool doesn't match the current code.
143
     */
144
    public $MP = false;
145
146
    /**
147
     * @var string
148
     */
149
    protected $processFilename;
150
151
    /**
152
     * Holds the internal access mode can be 'gui','cli' or 'cli_im'
153
     *
154
     * @var string
155
     * @deprecated
156
     */
157
    protected $accessMode;
158
159
    /**
160
     * @var QueueRepository
161
     */
162
    protected $queueRepository;
163
164
    /**
165
     * @var ProcessRepository
166
     */
167
    protected $processRepository;
168
169
    /**
170
     * @var ConfigurationRepository
171
     */
172
    protected $configurationRepository;
173
174
    /**
175
     * @var string
176
     */
177
    protected $tableName = 'tx_crawler_queue';
178
179
    /**
180
     * @var QueueExecutor
181
     */
182
    protected $queueExecutor;
183
184
    /**
185
     * @var int
186
     */
187
    protected $maximumUrlsToCompile = 10000;
188
189
    /**
190
     * @var IconFactory
191
     */
192
    protected $iconFactory;
193
194
    /**
195
     * @var string[]
196
     */
197
    private $deprecatedPublicMethods = [
0 ignored issues
show
introduced by
The private property $deprecatedPublicMethods is not used, and could be removed.
Loading history...
198
        'cleanUpOldQueueEntries' => 'Using CrawlerController::cleanUpOldQueueEntries() is deprecated since 9.0.1 and will be removed in v11.x, please use QueueRepository->cleanUpOldQueueEntries() instead.',
199
        'CLI_debug' => 'Using CrawlerController->CLI_debug() is deprecated since 9.1.3 and will be removed in v11.x',
200
        'getAccessMode' => 'Using CrawlerController->getAccessMode() is deprecated since 9.1.3 and will be removed in v11.x',
201
        'getLogEntriesForSetId' => 'Using crawlerController::getLogEntriesForSetId() is deprecated since 9.0.1 and will be removed in v11.x',
202
        'flushQueue' => 'Using CrawlerController::flushQueue() is deprecated since 9.0.1 and will be removed in v11.x, please use QueueRepository->flushQueue() instead.',
203
        'setAccessMode' => 'Using CrawlerController->setAccessMode() is deprecated since 9.1.3 and will be removed in v11.x'
204
    ];
205
206
    /**
207
     * @var string[]
208
     */
209
    private $deprecatedPublicProperties = [
1 ignored issue
show
introduced by
The private property $deprecatedPublicProperties is not used, and could be removed.
Loading history...
210
        'accessMode' => 'Using CrawlerController->accessMode is deprecated since 9.1.3 and will be removed in v11.x'
211
    ];
212
213
    /**
214
     * @var BackendUserAuthentication|null
215
     */
216
    private $backendUser;
217
218
    /**
219
     * @var integer
220
     */
221
    private $scheduledTime = 0;
222
223
    /**
224
     * @var integer
225
     */
226
    private $reqMinute = 0;
227
228
    /**
229
     * @var bool
230
     */
231
    private $submitCrawlUrls = false;
232
233
    /**
234
     * @var bool
235
     */
236
    private $downloadCrawlUrls = false;
237
238
    /**
239
     * @var PageRepository
240
     */
241
    private $pageRepository;
242
243
    /************************************
244
     *
245
     * Getting URLs based on Page TSconfig
246
     *
247
     ************************************/
248
249 36
    /**
     * Wires up the repositories and helper services and normalises the extension settings.
     *
     * The numeric settings are read with a null-coalescing fallback so that an
     * incomplete extension configuration does not trigger "undefined index" notices:
     *  - "countInARun" becomes 100 when it does not convert to a positive integer
     *  - "processLimit" is forced into the range 1-99 (default 1)
     *  - "maxCompileUrls" is forced into the range 1-1000000000 (default 10000)
     */
    public function __construct()
    {
        $objectManager = GeneralUtility::makeInstance(ObjectManager::class);
        $crawlStrategyFactory = GeneralUtility::makeInstance(CrawlStrategyFactory::class);
        $this->queueRepository = $objectManager->get(QueueRepository::class);
        $this->processRepository = $objectManager->get(ProcessRepository::class);
        $this->configurationRepository = $objectManager->get(ConfigurationRepository::class);
        $this->pageRepository = $objectManager->get(PageRepository::class);
        $this->queueExecutor = GeneralUtility::makeInstance(QueueExecutor::class, $crawlStrategyFactory);
        $this->iconFactory = GeneralUtility::makeInstance(IconFactory::class);

        // Existence of this file is what getDisabled() checks for.
        $this->processFilename = Environment::getVarPath() . '/lock/tx_crawler.proc';

        /** @var ExtensionConfigurationProvider $configurationProvider */
        $configurationProvider = GeneralUtility::makeInstance(ExtensionConfigurationProvider::class);
        $settings = $configurationProvider->getExtensionConfiguration();
        $this->extensionSettings = is_array($settings) ? $settings : [];

        // set defaults:
        if (MathUtility::convertToPositiveInteger($this->extensionSettings['countInARun'] ?? 0) === 0) {
            $this->extensionSettings['countInARun'] = 100;
        }

        $this->extensionSettings['processLimit'] = MathUtility::forceIntegerInRange(
            $this->extensionSettings['processLimit'] ?? 0,
            1,
            99,
            1
        );
        $this->maximumUrlsToCompile = MathUtility::forceIntegerInRange(
            $this->extensionSettings['maxCompileUrls'] ?? 0,
            1,
            1000000000,
            10000
        );
    }
275
276
    /**
     * Returns the access mode that was stored via setAccessMode().
     *
     * The property is documented as holding 'gui', 'cli' or 'cli_im'. It has no
     * default value, so null is returned if it was never assigned.
     *
     * @return string|null
     * @deprecated
     */
    public function getAccessMode()
    {
        return $this->accessMode;
    }
286
287
    /**
     * Stores the access mode; the property is documented as holding 'gui', 'cli' or 'cli_im'.
     *
     * @param string $accessMode Value to store; it is not validated here
     * @deprecated
     */
    public function setAccessMode($accessMode): void
    {
        $this->accessMode = $accessMode;
    }
295
296
    /**
     * Switches the "disabled" marker on or off.
     *
     * The marker is the file at $this->processFilename: it is written (empty)
     * to disable processing and removed again to enable it.
     *
     * @param bool $disabled (optional, defaults to true)
     */
    public function setDisabled($disabled = true): void
    {
        if ($disabled) {
            GeneralUtility::writeFile($this->processFilename, '');
            return;
        }

        // Enabling: only remove the marker when it is actually there.
        if (is_file($this->processFilename)) {
            unlink($this->processFilename);
        }
    }
311
312
    /**
     * Reports whether the "disabled" marker file currently exists.
     *
     * @return bool true if the file at $this->processFilename exists, i.e. processing is disabled
     */
    public function getDisabled()
    {
        return is_file($this->processFilename);
    }
321
322
    /**
     * Overrides the location of the marker file used by setDisabled() / getDisabled().
     *
     * @param string $filenameWithPath Path including the file name
     */
    public function setProcessFilename($filenameWithPath): void
    {
        $this->processFilename = $filenameWithPath;
    }
329
330
    /**
     * Returns the path of the marker file used by setDisabled() / getDisabled().
     *
     * @return string
     */
    public function getProcessFilename()
    {
        return $this->processFilename;
    }
337
338
    /**
     * Replaces the extension settings
     * (the unserialized counterpart of $TYPO3_CONF_VARS['EXT']['extConf']['crawler']).
     *
     * The array is stored as given; no defaults are applied here.
     */
    public function setExtensionSettings(array $extensionSettings): void
    {
        $this->extensionSettings = $extensionSettings;
    }
345
346
    /**
     * Decides whether a page must be left out of the crawl.
     *
     * The checks run in this order and the first match wins:
     *  1. the page is hidden and the "crawlHiddenPages" setting is not enabled
     *  2. the doktype is one of 3, 4, 199, 254, 255
     *  3. the doktype is in a list registered under
     *     $GLOBALS['TYPO3_CONF_VARS']['EXTCONF']['crawler']['excludeDoktype']
     *  4. a hook registered under
     *     $GLOBALS['TYPO3_CONF_VARS']['EXTCONF']['crawler']['pageVeto'] returns anything but false
     *
     * @param array $pageRow Page record; the "hidden" and "doktype" columns are read
     * @return false|string false if the page should be crawled, otherwise the reason why it is skipped
     */
    public function checkIfPageShouldBeSkipped(array $pageRow)
    {
        // empty() keeps the original truthiness test but tolerates a missing setting / column.
        if (empty($this->extensionSettings['crawlHiddenPages']) && ! empty($pageRow['hidden'])) {
            return 'Because page is hidden';
        }

        if (GeneralUtility::inList('3,4,199,254,255', $pageRow['doktype'])) {
            return 'Because doktype is not allowed';
        }

        foreach ($GLOBALS['TYPO3_CONF_VARS']['EXTCONF']['crawler']['excludeDoktype'] ?? [] as $key => $doktypeList) {
            if (GeneralUtility::inList($doktypeList, $pageRow['doktype'])) {
                return 'Doktype was excluded by "' . $key . '"';
            }
        }

        // veto hook
        foreach ($GLOBALS['TYPO3_CONF_VARS']['EXTCONF']['crawler']['pageVeto'] ?? [] as $key => $func) {
            $params = [
                'pageRow' => $pageRow,
            ];
            // expects "false" if page is ok and "true" or a skipMessage if this page should _not_ be crawled
            $veto = GeneralUtility::callUserFunction($func, $params, $this);
            if ($veto === false) {
                continue;
            }

            // The first veto ends the check; later hooks are not executed.
            if (is_string($veto)) {
                return $veto;
            }

            // Hooks registered without an explicit name have integer keys; cast because
            // htmlspecialchars() only accepts strings under strict_types.
            return 'Veto from hook "' . htmlspecialchars((string) $key) . '"';
        }

        return false;
    }
405
406
    /**
     * Wrapper around getUrlsForPageId() that first asks checkIfPageShouldBeSkipped().
     * It returns an array of configurations and no urls!
     *
     * @param array $pageRow Page record with at least dok-type and uid columns.
     * @param string $skipMessage Set by reference: the skip reason, or '' when the page is not skipped
     * @return array Configurations for the page, or an empty array when the page is skipped
     * @see getUrlsForPageId()
     */
    public function getUrlsForPageRow(array $pageRow, &$skipMessage = '')
    {
        $message = $this->checkIfPageShouldBeSkipped($pageRow);
        if ($message !== false) {
            $skipMessage = $message;
            return [];
        }

        // getUrlsForPageId() declares an int parameter and this file uses strict types,
        // so the uid is cast in the same way urlListFromUrlArray() does it.
        $res = $this->getUrlsForPageId((int) $pageRow['uid']);
        $skipMessage = '';

        return $res;
    }
428
429
    /**
     * Creates a list of URLs from input array (and submits them to queue if asked for)
     * See Web > Info module script + "indexed_search"'s crawler hook-client using this!
     *
     * Every URL that passes the processing-instruction filter is appended to
     * $this->urlList and to the returned HTML. URLs whose uniqueness key is already
     * present in $duplicateTrack are only displayed (muted), not submitted again.
     *
     * @param array $vv Information about URLs from pageRow to crawl ("URLs" and "subCfg" are read).
     * @param array $pageRow Page row ("uid" is read)
     * @param int $scheduledTime Unix time to schedule indexing to, typically time()
     * @param int $reqMinute Number of requests per minute (creates the interleave between requests);
     *                       values that are not greater than zero disable the interleave
     * @param bool $submitCrawlUrls If set, submits the URLs to queue
     * @param bool $downloadCrawlUrls If set (and $submitCrawlUrls is false) will fill $downloadUrls with entries
     * @param array $duplicateTrack Passed by reference; holds one key per URL so duplicates are not crawled twice
     * @param array $downloadUrls Passed by reference; filled with URLs for download if the flag is set.
     * @param array $incomingProcInstructions Array of processing instructions
     * @return string List of URLs joined with "<br>" (meant for display in backend module)
     */
    public function urlListFromUrlArray(
        array $vv,
        array $pageRow,
        $scheduledTime,
        $reqMinute,
        $submitCrawlUrls,
        $downloadCrawlUrls,
        array &$duplicateTrack,
        array &$downloadUrls,
        array $incomingProcInstructions
    ) {
        if (! is_array($vv['URLs'] ?? null)) {
            return 'ERROR - no URL generated';
        }

        // Optional sub-configuration values; a missing key behaves like an empty string.
        $procInstrFilter = $vv['subCfg']['procInstrFilter'] ?? '';
        $userGroups = $vv['subCfg']['userGroups'] ?? '';

        $urlLog = [];
        $pageId = (int) $pageRow['uid'];
        $configurationHash = $this->getConfigurationHash($vv);
        $skipInnerCheck = $this->queueRepository->noUnprocessedQueueEntriesForPageWithConfigurationHashExist($pageId, $configurationHash);

        // The filter does not depend on the individual URL, so evaluate it once:
        // if it rejects the configuration, no URL would be listed at all.
        if (! $this->drawURLs_PIfilter($procInstrFilter, $incomingProcInstructions)) {
            return '';
        }

        // Seconds between two requests; guarded so that 0 requests per minute cannot divide by zero.
        $requestsPerMinute = (float) $reqMinute;
        $secondsPerRequest = $requestsPerMinute > 0 ? 60 / $requestsPerMinute : 0;

        $urlService = new UrlService();

        foreach ($vv['URLs'] as $urlQuery) {
            $url = (string) $urlService->getUrlFromPageAndQueryParameters(
                $pageId,
                $urlQuery,
                $vv['subCfg']['baseUrl'] ?? null,
                $vv['subCfg']['force_ssl'] ?? 0
            );

            // Create key by which to determine unique-ness:
            $uKey = $url . '|' . $userGroups . '|' . $procInstrFilter;

            if (isset($duplicateTrack[$uKey])) {
                // if the url key is registered just display it and do not resubmit it
                $urlLog[] = '<em><span class="text-muted">' . htmlspecialchars($url) . '</span></em>';
            } else {
                // Scheduled time, rounded down to a full minute (used for display only):
                $schTime = $scheduledTime + round(count($duplicateTrack) * $secondsPerRequest);
                $schTime = intval($schTime / 60) * 60;
                $formattedDate = BackendUtility::datetime($schTime);
                $this->urlList[] = '[' . $formattedDate . '] ' . $url;
                $urlList = '[' . $formattedDate . '] ' . htmlspecialchars($url);

                // Submit for crawling!
                if ($submitCrawlUrls) {
                    // NOTE(review): the unshifted $scheduledTime is handed to addUrl(), not $schTime - confirm this is intended.
                    $added = $this->addUrl(
                        $pageId,
                        $url,
                        $vv['subCfg'],
                        $scheduledTime,
                        $configurationHash,
                        $skipInnerCheck
                    );
                    if ($added === false) {
                        $urlList .= ' (URL already existed)';
                    }
                } elseif ($downloadCrawlUrls) {
                    $downloadUrls[$url] = $url;
                }
                $urlLog[] = $urlList;
            }
            $duplicateTrack[$uKey] = true;
        }

        return implode('<br>', $urlLog);
    }
513
514
    /**
     * Tells whether a processing-instruction list matches the incoming instructions.
     *
     * An empty $incomingProcInstructions array means "no restriction" and always matches.
     * Otherwise at least one incoming instruction has to be an item of $piString.
     *
     * @param string $piString Comma-separated list of processing instructions to test against
     * @param array $incomingProcInstructions Processing instructions
     * @return bool
     */
    public function drawURLs_PIfilter($piString, array $incomingProcInstructions)
    {
        if ($incomingProcInstructions === []) {
            return true;
        }

        foreach ($incomingProcInstructions as $instruction) {
            if (GeneralUtility::inList($piString, $instruction)) {
                return true;
            }
        }

        return false;
    }
534
535 5
    /**
     * Loads the page TSconfig for a page and lets registered hooks alter it.
     *
     * When $this->MP is set, the TSconfig is loaded for the id found after the
     * first "-" of the MP value instead of for $id.
     * NOTE(review): $MP is documented as bool but treated as a string here; the
     * expected format appears to be "<id>-<mountPointId>" - confirm.
     *
     * Hooks registered under
     * $GLOBALS['TYPO3_CONF_VARS']['EXTCONF']['crawler']['getPageTSconfigForId']
     * receive "pageId" and, by reference, "pageTSConfig".
     *
     * @param int $id Page id
     * @return array The (possibly hook-modified) page TSconfig
     */
    public function getPageTSconfigForId($id): array
    {
        if (! $this->MP) {
            $pageTSconfig = BackendUtility::getPagesTSconfig($id);
        } else {
            // Cast before explode(): strict_types rejects a non-string argument. A value
            // without "-" yields no second part, which falls back to 0.
            $mountPointParts = explode('-', (string) $this->MP);
            $mountPointId = (int) ($mountPointParts[1] ?? 0);
            $pageTSconfig = BackendUtility::getPagesTSconfig($mountPointId);
        }

        // Call a hook to alter configuration
        $hooks = $GLOBALS['TYPO3_CONF_VARS']['EXTCONF']['crawler']['getPageTSconfigForId'] ?? null;
        if (is_array($hooks)) {
            $params = [
                'pageId' => $id,
                'pageTSConfig' => &$pageTSconfig,
            ];
            foreach ($hooks as $userFunc) {
                GeneralUtility::callUserFunction($userFunc, $params, $this);
            }
        }

        return $pageTSconfig;
    }
557
558
    /**
     * This method returns an array of configurations.
     * Adds no urls!
     *
     * Configurations are collected from two sources, in this order:
     *  1. page TSconfig (tx_crawler.crawlerCfg.paramSets.*), origin "pagets"
     *  2. tx_crawler_configuration records found up the rootline; a record never
     *     overwrites an already collected configuration of the same name
     *
     * Each entry holds the keys "subCfg", "paramParsed", "paramExpanded", "URLs"
     * and "origin". Finally the hooks registered under
     * $GLOBALS['TYPO3_CONF_VARS']['EXTCONF']['crawler']['processUrls'] may modify
     * the result by reference.
     *
     * @return array Configurations keyed by configuration name
     */
    public function getUrlsForPageId(int $pageId): array
    {
        // Get page TSconfig for page ID
        $pageTSconfig = $this->getPageTSconfigForId($pageId);

        $res = [];

        // Fetch Crawler Configuration from pageTSconfig
        $crawlerCfg = $pageTSconfig['tx_crawler.']['crawlerCfg.']['paramSets.'] ?? [];
        foreach ($crawlerCfg as $key => $values) {
            if (! is_array($values)) {
                continue;
            }
            $key = str_replace('.', '', (string) $key);

            // Sub configuration for a single configuration string:
            $subCfg = (array) ($crawlerCfg[$key . '.'] ?? []);
            $subCfg['key'] = $key;

            if ((string) ($subCfg['procInstrFilter'] ?? '') !== '') {
                $subCfg['procInstrFilter'] = implode(',', GeneralUtility::trimExplode(',', $subCfg['procInstrFilter']));
            }

            // "pidsOnly" is optional; treat a missing value like an empty one.
            $pidsOnly = (string) ($subCfg['pidsOnly'] ?? '');
            $pidOnlyList = implode(',', GeneralUtility::trimExplode(',', $pidsOnly, true));

            // process configuration if it is not page-specific or if the specific page is the current page:
            // TODO: Check if $pidOnlyList can be kept as Array instead of imploded
            if ($pidsOnly !== '' && ! GeneralUtility::inList($pidOnlyList, (string) $pageId)) {
                continue;
            }

            // Explode, process etc.: the parameter string lives under the key without
            // the trailing dot and may be absent.
            $paramParsed = GeneralUtility::explodeUrl2Array((string) ($crawlerCfg[$key] ?? ''));
            $paramExpanded = $this->expandParameters($paramParsed, $pageId);

            // recognize MP value
            $baseQuery = '?id=' . $pageId;
            if ($this->MP) {
                $baseQuery .= '&MP=' . $this->MP;
            }

            $res[$key] = [
                'subCfg' => $subCfg,
                'paramParsed' => $paramParsed,
                'paramExpanded' => $paramExpanded,
                'origin' => 'pagets',
                'URLs' => $this->compileUrls($paramExpanded, [$baseQuery]),
            ];
        }

        // Get configuration from tx_crawler_configuration records up the rootline
        $crawlerConfigurations = $this->configurationRepository->getCrawlerConfigurationRecordsFromRootLine($pageId);
        foreach ($crawlerConfigurations as $configurationRecord) {
            // check access to the configuration record
            $hasAccess = empty($configurationRecord['begroups'])
                || $this->getBackendUser()->isAdmin()
                || $this->hasGroupAccess($this->getBackendUser()->user['usergroup_cached_list'], $configurationRecord['begroups']);
            if (! $hasAccess) {
                continue;
            }

            $pidsOnly = (string) ($configurationRecord['pidsonly'] ?? '');
            $pidOnlyList = implode(',', GeneralUtility::trimExplode(',', $pidsOnly, true));

            // process configuration if it is not page-specific or if the specific page is the current page:
            // TODO: Check if $pidOnlyList can be kept as Array instead of imploded
            if ($pidsOnly !== '' && ! GeneralUtility::inList($pidOnlyList, (string) $pageId)) {
                continue;
            }

            $key = $configurationRecord['name'];

            // don't overwrite previously defined paramSets
            if (isset($res[$key])) {
                continue;
            }

            /* @var $TSparserObject \TYPO3\CMS\Core\TypoScript\Parser\TypoScriptParser */
            $TSparserObject = GeneralUtility::makeInstance(TypoScriptParser::class);
            $TSparserObject->parse($configurationRecord['processing_instruction_parameters_ts']);

            $subCfg = [
                'procInstrFilter' => $configurationRecord['processing_instruction_filter'],
                'procInstrParams.' => $TSparserObject->setup,
                'baseUrl' => $configurationRecord['base_url'],
                'force_ssl' => (int) $configurationRecord['force_ssl'],
                'userGroups' => $configurationRecord['fegroups'],
                'exclude' => $configurationRecord['exclude'],
                'key' => $key,
            ];

            // The record may explicitly exclude the current page.
            if (in_array($pageId, $this->expandExcludeString($subCfg['exclude']), true)) {
                continue;
            }

            $paramParsed = GeneralUtility::explodeUrl2Array($configurationRecord['configuration']);
            $paramExpanded = $this->expandParameters($paramParsed, $pageId);

            $res[$key] = [
                'subCfg' => $subCfg,
                'paramParsed' => $paramParsed,
                'paramExpanded' => $paramExpanded,
                'URLs' => $this->compileUrls($paramExpanded, ['?id=' . $pageId]),
                'origin' => 'tx_crawler_configuration_' . $configurationRecord['uid'],
            ];
        }

        foreach ($GLOBALS['TYPO3_CONF_VARS']['EXTCONF']['crawler']['processUrls'] ?? [] as $func) {
            $params = [
                'res' => &$res,
            ];
            GeneralUtility::callUserFunction($func, $params, $this);
        }

        return $res;
    }
656
657
    /**
     * Collects the names of all configurations available for a page branch.
     *
     * Names come from the page TSconfig paramSets of the root page and from
     * tx_crawler_configuration records stored on any page of the rootline or of
     * the page tree below $rootid (down to $depth levels).
     * TODO: Write Functional Tests
     *
     * @return array List of configuration names (may contain duplicates)
     */
    public function getConfigurationsForBranch(int $rootid, int $depth): array
    {
        $names = [];

        // 1) paramSets defined in page TSconfig; only the array ("name.") entries count.
        $tsConfig = $this->getPageTSconfigForId($rootid);
        $paramSets = $tsConfig['tx_crawler.']['crawlerCfg.']['paramSets.'] ?? [];
        foreach ($paramSets as $setKey => $setValue) {
            if (is_array($setValue)) {
                $names[] = substr($setKey, -1) === '.' ? substr($setKey, 0, -1) : $setKey;
            }
        }

        // 2) page ids of the rootline ...
        $pageIds = [];
        foreach (BackendUtility::BEgetRootLine($rootid) as $rootLinePage) {
            $pageIds[] = $rootLinePage['uid'];
        }

        // ... and of the subtree the backend user is allowed to see.
        /* @var PageTreeView $pageTree */
        $pageTree = GeneralUtility::makeInstance(PageTreeView::class);
        $permissionClause = $this->getBackendUser()->getPagePermsClause(Permission::PAGE_SHOW);
        $treeClause = '';
        if (! empty($permissionClause)) {
            $treeClause = 'AND ' . $permissionClause;
        }
        $pageTree->init($treeClause);
        $pageTree->getTree($rootid, $depth, '');
        foreach ($pageTree->tree as $treeNode) {
            $pageIds[] = $treeNode['row']['uid'];
        }

        // 3) configuration records stored on any of those pages.
        $queryBuilder = $this->getQueryBuilder('tx_crawler_configuration');
        $pidConstraint = $queryBuilder->expr()->in(
            'pid',
            $queryBuilder->createNamedParameter($pageIds, Connection::PARAM_INT_ARRAY)
        );
        $statement = $queryBuilder
            ->select('name')
            ->from('tx_crawler_configuration')
            ->where($pidConstraint)
            ->execute();

        while ($record = $statement->fetch()) {
            $names[] = $record['name'];
        }

        return $names;
    }
700
701
    /**
     * Tells whether a user may access an item, based on group membership
     * (e.g. pass the group list of the current logged in user from $GLOBALS['TSFE']->gr_list).
     *
     * @param string $groupList Comma-separated list of (fe_)group UIDs from a user
     * @param string $accessList Comma-separated list of (fe_)group UIDs of the item to access
     * @return bool TRUE if the access list is empty or at least one of the user's group UIDs is in the access list
     * @see \TYPO3\CMS\Frontend\Page\PageRepository::getMultipleGroupsWhereClause()
     */
    public function hasGroupAccess($groupList, $accessList)
    {
        // An item without access restriction is available to everybody.
        $accessGranted = empty($accessList);

        if (! $accessGranted) {
            $userGroupUids = GeneralUtility::intExplode(',', $groupList);
            foreach ($userGroupUids as $userGroupUid) {
                // The first matching group is enough, no need to look at the rest.
                if (GeneralUtility::inList($accessList, $userGroupUid)) {
                    $accessGranted = true;
                    break;
                }
            }
        }

        return $accessGranted;
    }
722
723
    /**
     * Expands the parameter configuration to individual values. This follows a certain syntax of the value of each parameter.
     * Syntax of values:
     * - Basically: If the value is wrapped in [...] it is expanded according to the following syntax, otherwise the (trimmed) value is taken literally
     * - The configuration is split by "|"; the parts are processed individually and finally added together
     * - For each configuration part:
     *         - "[int]-[int]" = Integer range, expanded to all values in between, values included, from low to high (max. 1000 values). Example "1-34" or "-40--30"
     *         - "_TABLE:[TCA table name];[_PID:[optional page id, default is current page]];[_ENABLELANG:1]" = Look up of table records from PID, filtering out deleted records. Example "_TABLE:tt_content; _PID:123"
     *           Further optional sub-parts read by this method: _RECURSIVE (depth of the page tree below _PID to include),
     *           _PIDFIELD (field compared with the page IDs, default "pid"), _FIELD (field whose values are returned, default "uid";
     *           must be "uid" or a column configured in TCA), _WHERE (additional condition, passed to the query unchanged)
     *           and _ADDTABLE (additional FROM part).
     *           _ENABLELANG:1 adds the condition "[transOrigPointerField] <= 0" if the table has such a field, i.e. picks only original records without their language overlays
     *         - Default: Literal value
     * - After every part the hooks registered in
     *   $GLOBALS['TYPO3_CONF_VARS']['SC_OPTIONS']['crawler/class.tx_crawler_lib.php']['expandParameters'] are called.
     *
     * @param array $paramArray Array with key (GET var name) and values (value of GET var which is configuration for expansion)
     * @param int $pid Current page ID
     * @return array The input array with every value replaced by the list of its expanded values
     *
     * TODO: Write Functional Tests
     */
    public function expandParameters($paramArray, $pid)
    {
        // Traverse parameter names:
        foreach ($paramArray as $p => $v) {
            $v = trim($v);

            // A value that is not encapsulated in square brackets is a literal: it becomes the only value.
            if (strpos($v, '[') !== 0 || substr($v, -1) !== ']') {
                $paramArray[$p] = [$v];
                continue;
            }

            // So, find the value inside brackets and reset the paramArray value as an array.
            $v = substr($v, 1, -1);
            $paramArray[$p] = [];

            // Explode parts and traverse them:
            foreach (explode('|', $v) as $pV) {
                // Look for integer range: (fx. 1-34 or -40--30 // reads minus 40 to minus 30)
                if (preg_match('/^(-?[0-9]+)\s*-\s*(-?[0-9]+)$/', trim($pV), $reg)) {
                    $reg = $this->swapIfFirstIsLargerThanSecond($reg);

                    // Traverse range, add values. The brake limits the size of the range.
                    $runAwayBrake = 1000;
                    for ($a = $reg[1]; $a <= $reg[2]; $a++) {
                        $paramArray[$p][] = $a;
                        $runAwayBrake--;
                        if ($runAwayBrake <= 0) {
                            break;
                        }
                    }
                } elseif (strpos(trim($pV), '_TABLE:') === 0) {
                    // Parse the "key:value" sub-parts. A sub-part without ":" gets the value null
                    // (array_pad avoids reading an undefined offset in that case).
                    $subpartParams = [];
                    foreach (GeneralUtility::trimExplode(';', $pV) as $spV) {
                        [$pKey, $pVal] = array_pad(GeneralUtility::trimExplode(':', $spV), 2, null);
                        $subpartParams[$pKey] = $pVal;
                    }

                    $tableName = $subpartParams['_TABLE'] ?? '';

                    // Table exists:
                    if (isset($GLOBALS['TCA'][$tableName])) {
                        $lookUpPid = isset($subpartParams['_PID']) ? intval($subpartParams['_PID']) : intval($pid);
                        $recursiveDepth = isset($subpartParams['_RECURSIVE']) ? intval($subpartParams['_RECURSIVE']) : 0;
                        $pidField = isset($subpartParams['_PIDFIELD']) ? trim($subpartParams['_PIDFIELD']) : 'pid';
                        // NOTE(review): _WHERE and _ADDTABLE are handed to the query as raw SQL fragments.
                        $where = $subpartParams['_WHERE'] ?? '';
                        $addTable = $subpartParams['_ADDTABLE'] ?? '';

                        // All optional keys are read with empty()/?? so a missing key does not raise
                        // an "undefined array key" notice/warning.
                        $fieldName = ! empty($subpartParams['_FIELD']) ? $subpartParams['_FIELD'] : 'uid';
                        if ($fieldName === 'uid' || ! empty($GLOBALS['TCA'][$tableName]['columns'][$fieldName])) {
                            $queryBuilder = $this->getQueryBuilder($tableName);

                            if ($recursiveDepth > 0) {
                                /** @var \TYPO3\CMS\Core\Database\QueryGenerator $queryGenerator */
                                $queryGenerator = GeneralUtility::makeInstance(\TYPO3\CMS\Core\Database\QueryGenerator::class);
                                $pidList = $queryGenerator->getTreeList($lookUpPid, $recursiveDepth, 0, 1);
                                $pidArray = GeneralUtility::intExplode(',', $pidList);
                            } else {
                                $pidArray = [(string) $lookUpPid];
                            }

                            // Only deleted records are filtered out, all other default restrictions are dropped.
                            $queryBuilder->getRestrictions()
                                ->removeAll()
                                ->add(GeneralUtility::makeInstance(DeletedRestriction::class));

                            $queryBuilder
                                ->select($fieldName)
                                ->from($tableName)
                                ->where(
                                    $queryBuilder->expr()->in($pidField, $queryBuilder->createNamedParameter($pidArray, Connection::PARAM_INT_ARRAY)),
                                    $where
                                );

                            if (! empty($addTable)) {
                                // TODO: Check if this works as intended!
                                $queryBuilder->add('from', $addTable);
                            }

                            $transOrigPointerField = $GLOBALS['TCA'][$tableName]['ctrl']['transOrigPointerField'] ?? null;
                            if (! empty($subpartParams['_ENABLELANG']) && $transOrigPointerField) {
                                $queryBuilder->andWhere(
                                    $queryBuilder->expr()->lte(
                                        $transOrigPointerField,
                                        0
                                    )
                                );
                            }

                            $statement = $queryBuilder->execute();

                            // Index by field value so that every value is added only once per look-up.
                            $rows = [];
                            while ($row = $statement->fetch()) {
                                $rows[$row[$fieldName]] = $row;
                            }

                            $paramArray[$p] = array_merge($paramArray[$p], array_keys($rows));
                        }
                    }
                } else {
                    // Just add value:
                    $paramArray[$p][] = $pV;
                }

                // Hook for processing own expandParameters place holder
                $expandHooks = $GLOBALS['TYPO3_CONF_VARS']['SC_OPTIONS']['crawler/class.tx_crawler_lib.php']['expandParameters'] ?? null;
                if (is_array($expandHooks)) {
                    $_params = [
                        'pObj' => &$this,
                        'paramArray' => &$paramArray,
                        'currentKey' => $p,
                        'currentValue' => $pV,
                        'pid' => $pid,
                    ];
                    foreach ($expandHooks as $_funcRef) {
                        GeneralUtility::callUserFunction($_funcRef, $_params, $this);
                    }
                }
            }

            // Make unique set of values and sort the parameter array by parameter name:
            $paramArray[$p] = array_unique($paramArray[$p]);
            ksort($paramArray);
        }

        return $paramArray;
    }
870
871
    /**
     * Compiles URLs from a parameter array (output of expandParameters()).
     * The number of URLs is the product of the number of values of each parameter,
     * capped by $this->maximumUrlsToCompile: as soon as one recursion level has produced
     * more URLs than that, no further URLs are added on that level.
     *
     * A value that is an empty string adds nothing to the URL; every other value is
     * appended as "&[name]=[value]" with name and value raw-URL-encoded.
     *
     * @param array $paramArray Output of expandParameters(): Array with keys (GET var names) and for each an array of values
     * @param array $urls URLs accumulated in this array (for recursion)
     * @return array
     */
    public function compileUrls($paramArray, array $urls)
    {
        // Recursion ends when every parameter has been consumed.
        if (empty($paramArray)) {
            return $urls;
        }

        // shift first off stack:
        reset($paramArray);
        $varName = key($paramArray);
        $valueSet = array_shift($paramArray);

        // Traverse value set:
        $newUrls = [];
        foreach ($urls as $url) {
            foreach ($valueSet as $val) {
                $value = (string) $val;
                // The name is cast as well: a numeric parameter name is an int array key.
                $newUrls[] = $url . (strcmp($value, '') ? '&' . rawurlencode((string) $varName) . '=' . rawurlencode($value) : '');

                if (count($newUrls) > $this->maximumUrlsToCompile) {
                    // Leave BOTH loops: a plain "break" would only end the inner loop and the
                    // outer loop would keep adding one more URL per remaining base URL.
                    break 2;
                }
            }
        }

        return $this->compileUrls($paramArray, $newUrls);
    }
902
903
    /************************************
904
     *
905
     * Crawler log
906
     *
907
     ************************************/
908
909
    /**
     * Returns the records of the crawler queue for a page ID, newest "scheduled" first.
     *
     * If $doFlush is set, the queue repository is asked to flush the queue (with "all" when
     * $doFullFlush is set, otherwise with $filter) BEFORE the records are selected; the
     * select is still executed afterwards and its result is returned.
     *
     * @param int $id Page ID for which to look up log entries.
     * @param string $filter Filter: "pending" => only entries with exec_time = 0, "finished" => only entries with exec_time > 0, anything else => all entries
     * @param bool $doFlush If TRUE, the queue is flushed before selecting
     * @param bool $doFullFlush If TRUE (and $doFlush is set), the flush is done with "all" instead of $filter
     * @param int $itemsPerPage Maximum number of entries to return; a value <= 0 disables the limit (default is 10)
     * @return array
     */
    public function getLogEntriesForPageId($id, $filter = '', $doFlush = false, $doFullFlush = false, $itemsPerPage = 10)
    {
        $queryBuilder = GeneralUtility::makeInstance(ConnectionPool::class)->getQueryBuilderForTable($this->tableName);
        $queryBuilder
            ->select('*')
            ->from($this->tableName)
            ->where(
                $queryBuilder->expr()->eq('page_id', $queryBuilder->createNamedParameter($id, \PDO::PARAM_INT))
            )
            ->orderBy('scheduled', 'DESC');

        // Narrow the selection by processing state; unknown filters select everything.
        switch ($filter) {
            case 'pending':
                $queryBuilder->andWhere($queryBuilder->expr()->eq('exec_time', 0));
                break;
            case 'finished':
                $queryBuilder->andWhere($queryBuilder->expr()->gt('exec_time', 0));
                break;
        }

        if ($doFlush) {
            // NOTE(review): the flush is not restricted to $id here — confirm that this is intended.
            $this->queueRepository->flushQueue($doFullFlush ? 'all' : $filter);
        }

        if ($itemsPerPage > 0) {
            $queryBuilder->setMaxResults((int) $itemsPerPage);
        }

        return $queryBuilder->execute()->fetchAll();
    }
960
961
    /**
     * Returns the records of the crawler queue for a set ID (newest "scheduled" first), or flushes them.
     *
     * @param int $set_id Set ID for which to look up log entries.
     * @param string $filter Filter: "pending" => only entries with exec_time = 0, "finished" => only entries with exec_time > 0, anything else => all entries
     * @param bool $doFlush If TRUE, nothing is selected: flushQueue() is called and an empty array is returned
     * @param bool $doFullFlush If TRUE (and $doFlush is set), flushQueue() receives an empty condition instead of the set/filter condition
     * @param int $itemsPerPage Maximum number of entries to return; a value <= 0 disables the limit (default is 10)
     * @return array
     *
     * @deprecated
     */
    public function getLogEntriesForSetId(int $set_id, string $filter = '', bool $doFlush = false, bool $doFullFlush = false, int $itemsPerPage = 10)
    {
        $selectQuery = GeneralUtility::makeInstance(ConnectionPool::class)->getQueryBuilderForTable($this->tableName);
        $setConstraint = $selectQuery->expr()->eq('set_id', $selectQuery->createNamedParameter($set_id, \PDO::PARAM_INT));
        $selectQuery
            ->select('*')
            ->from($this->tableName)
            ->where($setConstraint)
            ->orderBy('scheduled', 'DESC');

        // The same conditions are collected a second time in an AND-composite, because the
        // flush below expects a ready-made condition instead of a query builder.
        $expressions = GeneralUtility::makeInstance(ConnectionPool::class)
            ->getConnectionForTable($this->tableName)
            ->getExpressionBuilder();
        $flushCondition = $expressions->andX();

        if ($filter === 'pending') {
            $selectQuery->andWhere($selectQuery->expr()->eq('exec_time', 0));
            $flushCondition->add($expressions->eq('exec_time', 0));
        } elseif ($filter === 'finished') {
            $selectQuery->andWhere($selectQuery->expr()->gt('exec_time', 0));
            $flushCondition->add($expressions->gt('exec_time', 0));
        }

        if ($doFlush) {
            $setAndFilterCondition = $flushCondition->add($expressions->eq('set_id', $set_id));
            $this->flushQueue($doFullFlush ? '' : $setAndFilterCondition);
            return [];
        }

        if ($itemsPerPage > 0) {
            $selectQuery->setMaxResults($itemsPerPage);
        }

        return $selectQuery->execute()->fetchAll();
    }
1013
1014
    /**
     * Adds a call back entry to the queue (called from hooks typically, see indexed search class "class.crawler.php").
     *
     * The call back reference is stored inside the JSON encoded parameters under the key "_CALLBACKOBJ".
     *
     * @param int $setId Set ID
     * @param array $params Parameters to pass to call back function (anything that is not an array is replaced by an empty array)
     * @param string $callBack Call back object reference, eg. 'EXT:indexed_search/class.crawler.php:&tx_indexedsearch_crawler'
     * @param int $page_id Page ID to attach it to
     * @param int $schedule Time at which to activate; 0 means "now" (the current time is stored)
     */
    public function addQueueEntry_callBack($setId, $params, $callBack, $page_id = 0, $schedule = 0): void
    {
        $callBackParameters = is_array($params) ? $params : [];
        $callBackParameters['_CALLBACKOBJ'] = $callBack;

        $queueConnection = GeneralUtility::makeInstance(ConnectionPool::class)
            ->getConnectionForTable('tx_crawler_queue');

        // Without an explicit schedule the entry is due immediately.
        $scheduledAt = (int) $schedule;
        if ($scheduledAt === 0) {
            $scheduledAt = $this->getCurrentTime();
        }

        $queueConnection->insert(
            'tx_crawler_queue',
            [
                'page_id' => (int) $page_id,
                'parameters' => json_encode($callBackParameters),
                'scheduled' => $scheduledAt,
                'exec_time' => 0,
                'set_id' => (int) $setId,
                'result_data' => '',
            ]
        );
    }
1043
1044
    /************************************
1045
     *
1046
     * URL setting
1047
     *
1048
     ************************************/
1049
1050
    /**
     * Sets a URL for crawling.
     *
     * The queue entry is either only registered internally (when
     * $this->registerQueueEntriesInternallyOnly is set) or inserted into tx_crawler_queue,
     * unless the duplication check finds an equal entry. A signal is emitted in both
     * database cases (URL added / duplicate found).
     *
     * Keys read from $subCfg, all optional: "userGroups" (comma list of fe_group UIDs to
     * simulate), "procInstrFilter" (comma list of processing instructions),
     * "procInstrParams." (array) and "key" (stored as "configuration").
     *
     * @param int $id Page ID
     * @param string $url Complete URL
     * @param array $subCfg Sub configuration array (from TS config)
     * @param int $tstamp Scheduled-time
     * @param string $configurationHash (optional) configuration hash
     * @param bool $skipInnerDuplicationCheck (optional) skip inner duplication check
     * @return bool TRUE only if a new entry was inserted into the database queue
     */
    public function addUrl(
        $id,
        $url,
        array $subCfg,
        $tstamp,
        $configurationHash = '',
        $skipInnerDuplicationCheck = false
    ) {
        $urlAdded = false;
        $rows = [];

        // Creating parameters:
        $parameters = [
            'url' => $url,
        ];

        // fe user group simulation (missing key is treated like an empty list):
        $uGs = implode(',', array_unique(GeneralUtility::intExplode(',', $subCfg['userGroups'] ?? '', true)));
        if ($uGs) {
            $parameters['feUserGroupList'] = $uGs;
        }

        // Setting processing instructions
        $parameters['procInstructions'] = GeneralUtility::trimExplode(',', $subCfg['procInstrFilter'] ?? '');
        if (is_array($subCfg['procInstrParams.'] ?? null)) {
            $parameters['procInstrParams'] = $subCfg['procInstrParams.'];
        }

        // Compile value array:
        $parameters_serialized = json_encode($parameters);
        $fieldArray = [
            'page_id' => (int) $id,
            'parameters' => $parameters_serialized,
            'parameters_hash' => GeneralUtility::shortMD5($parameters_serialized),
            'configuration_hash' => $configurationHash,
            'scheduled' => $tstamp,
            'exec_time' => 0,
            'set_id' => (int) $this->setID,
            'result_data' => '',
            'configuration' => $subCfg['key'] ?? null,
        ];

        if ($this->registerQueueEntriesInternallyOnly) {
            // the entries will only be registered and not stored to the database
            $this->queueEntries[] = $fieldArray;
            return $urlAdded;
        }

        if (! $skipInnerDuplicationCheck) {
            // check if there is already an equal entry
            $rows = $this->getDuplicateRowsIfExist($tstamp, $fieldArray);
        }

        if (empty($rows)) {
            $connectionForCrawlerQueue = GeneralUtility::makeInstance(ConnectionPool::class)->getConnectionForTable('tx_crawler_queue');
            $connectionForCrawlerQueue->insert(
                'tx_crawler_queue',
                $fieldArray
            );
            $uid = $connectionForCrawlerQueue->lastInsertId('tx_crawler_queue', 'qid');
            $urlAdded = true;

            $signalPayload = ['uid' => $uid, 'fieldArray' => $fieldArray];
            SignalSlotUtility::emitSignal(
                self::class,
                SignalSlotUtility::SIGNAL_URL_ADDED_TO_QUEUE,
                $signalPayload
            );
        } else {
            $signalPayload = ['rows' => $rows, 'fieldArray' => $fieldArray];
            SignalSlotUtility::emitSignal(
                self::class,
                SignalSlotUtility::SIGNAL_DUPLICATE_URL_IN_QUEUE,
                $signalPayload
            );
        }

        return $urlAdded;
    }
1140
1141
    /**
     * Returns the current system time as Unix timestamp.
     *
     * Kept as a separate method so that every place that needs "now" reads it from here.
     *
     * @return int
     */
    public function getCurrentTime()
    {
        $now = time();

        return $now;
    }
1150
1151
    /************************************
1152
     *
1153
     * URL reading
1154
     *
1155
     ************************************/
1156
1157
    /**
     * Reads the URL of a single queue entry and stores the result in the queue record.
     *
     * Steps: load the entry, emit the pre-process signal, set exec_time (which locks the
     * record), let the queue executor process the entry, evaluate the "pollSuccess"
     * registrations, emit the post-process signal and finally write result_data.
     *
     * @param int $queueId
     * @param bool $force If set, will process even if exec_time has been set!
     * @return int 0, or self::CLI_STATUS_POLLABLE_PROCESSED if a registered pollable instruction reported success.
     *             0 is also returned when no matching queue entry was found.
     */
    public function readUrl($queueId, $force = false)
    {
        $queryBuilder = GeneralUtility::makeInstance(ConnectionPool::class)->getQueryBuilderForTable($this->tableName);
        $ret = 0;
        $this->logger->debug('crawler-readurl start ' . microtime(true));

        // Get entry:
        $queryBuilder
            ->select('*')
            ->from('tx_crawler_queue')
            ->where(
                $queryBuilder->expr()->eq('qid', $queryBuilder->createNamedParameter($queueId, \PDO::PARAM_INT))
            );
        if (! $force) {
            // Only entries that are not executed yet and already assigned to a process.
            $queryBuilder
                ->andWhere('exec_time = 0')
                ->andWhere('process_scheduled > 0');
        }
        $queueRec = $queryBuilder->execute()->fetch();

        if (! is_array($queueRec)) {
            // Nothing to process; return the documented integer instead of null.
            return $ret;
        }

        SignalSlotUtility::emitSignal(
            self::class,
            SignalSlotUtility::SIGNAL_QUEUEITEM_PREPROCESS,
            [$queueId, &$queueRec]
        );

        // Set exec_time to lock record:
        $field_array = ['exec_time' => $this->getCurrentTime()];

        if (isset($this->processID)) {
            // if multiprocessing is used we need to store the id of the process which has handled this entry
            $field_array['process_id_completed'] = $this->processID;
        }

        GeneralUtility::makeInstance(ConnectionPool::class)->getConnectionForTable('tx_crawler_queue')
            ->update(
                'tx_crawler_queue',
                $field_array,
                ['qid' => (int) $queueId]
            );

        $result = $this->queueExecutor->executeQueueItem($queueRec, $this);
        if ($result['content'] === null) {
            $resultData = 'An errors happened';
        } else {
            /** @var JsonCompatibilityConverter $jsonCompatibilityConverter */
            $jsonCompatibilityConverter = GeneralUtility::makeInstance(JsonCompatibilityConverter::class);
            $resultData = $jsonCompatibilityConverter->convert($result['content']);
        }

        // atm there's no need to point to specific pollable extensions
        $pollables = $GLOBALS['TYPO3_CONF_VARS']['EXTCONF']['crawler']['pollSuccess'] ?? null;
        // $resultData is a plain string in the error case above, so it must not be indexed
        // with string offsets; only an array result can carry processing instructions.
        $procInstructions = is_array($resultData) ? ($resultData['parameters']['procInstructions'] ?? null) : null;
        if (is_array($pollables) && is_array($procInstructions)) {
            foreach ($pollables as $pollable) {
                // only check the success value if the instruction is running
                // it is important to name the pollSuccess key same as the procInstructions key
                if (in_array($pollable, $procInstructions, true) && ! empty($resultData['success'][$pollable])) {
                    $ret |= self::CLI_STATUS_POLLABLE_PROCESSED;
                }
            }
        }

        // Set result in log which also denotes the end of the processing of this entry.
        $field_array = ['result_data' => json_encode($result)];

        SignalSlotUtility::emitSignal(
            self::class,
            SignalSlotUtility::SIGNAL_QUEUEITEM_POSTPROCESS,
            [$queueId, &$field_array]
        );

        GeneralUtility::makeInstance(ConnectionPool::class)->getConnectionForTable('tx_crawler_queue')
            ->update(
                'tx_crawler_queue',
                $field_array,
                ['qid' => (int) $queueId]
            );

        $this->logger->debug('crawler-readurl stop ' . microtime(true));
        return $ret;
    }
1254
1255
    /**
     * Reads the URL for a log entry that is not stored in the queue yet.
     *
     * The entry is inserted with exec_time set to the current time, processed by the
     * queue executor, and its result_data is written back to the new record.
     *
     * @param array $field_array Queue field array,
     *
     * @return array|bool|mixed|string The result returned by the queue executor
     */
    public function readUrlFromArray($field_array)
    {
        $queueConnection = GeneralUtility::makeInstance(ConnectionPool::class)->getConnectionForTable($this->tableName);

        // Set exec_time to lock record:
        $field_array['exec_time'] = $this->getCurrentTime();
        $queueConnection->insert($this->tableName, $field_array);

        // The executor receives the record including its freshly created queue ID.
        $queueId = $queueConnection->lastInsertId($this->tableName, 'qid');
        $field_array['qid'] = $queueId;

        $result = $this->queueExecutor->executeQueueItem($field_array, $this);

        // Set result in log which also denotes the end of the processing of this entry.
        // Passed by reference so that signal slots can change what gets stored.
        $resultFields = ['result_data' => json_encode($result)];
        SignalSlotUtility::emitSignal(
            self::class,
            SignalSlotUtility::SIGNAL_QUEUEITEM_POSTPROCESS,
            [$queueId, &$resultFields]
        );

        $queueConnection->update($this->tableName, $resultFields, ['qid' => $queueId]);

        return $result;
    }
1292
1293
    /*****************************
1294
     *
1295
     * Compiling URLs to crawl - tools
1296
     *
1297
     *****************************/
1298
1299
    /**
     * Traverses a page (and optionally the tree below it) and collects the output of
     * drawURLs_addRowsForPage() for every page found.
     *
     * The arguments are stored in the corresponding properties first, and the duplicate
     * tracking and download URL lists are reset. Pages of type "mount point" additionally
     * get the tree of their mounted page processed.
     *
     * @param int $id Root page id to start from.
     * @param int $depth Depth of tree, 0=only id-page, 1= on sublevel, 99 = infinite
     * @param int $scheduledTime Unix Time when the URL is timed to be visited when put in queue
     * @param int $reqMinute Number of requests per minute (creates the interleave between requests)
     * @param bool $submitCrawlUrls If set, submits the URLs to queue in database (real crawling)
     * @param bool $downloadCrawlUrls If set (and submitcrawlUrls is false) will fill $downloadUrls with entries)
     * @param array $incomingProcInstructions Array of processing instructions
     * @param array $configurationSelection Array of configuration keys
     * @return string Concatenated return values of drawURLs_addRowsForPage()
     */
    public function getPageTreeAndUrls(
        $id,
        $depth,
        $scheduledTime,
        $reqMinute,
        $submitCrawlUrls,
        $downloadCrawlUrls,
        array $incomingProcInstructions,
        array $configurationSelection
    ) {
        $this->scheduledTime = $scheduledTime;
        $this->reqMinute = $reqMinute;
        $this->submitCrawlUrls = $submitCrawlUrls;
        $this->downloadCrawlUrls = $downloadCrawlUrls;
        $this->incomingProcInstructions = $incomingProcInstructions;
        $this->incomingConfigurationSelection = $configurationSelection;

        $this->duplicateTrack = [];
        $this->downloadUrls = [];

        // Drawing tree:
        /* @var PageTreeView $tree */
        $tree = GeneralUtility::makeInstance(PageTreeView::class);
        $perms_clause = $this->getBackendUser()->getPagePermsClause(Permission::PAGE_SHOW);
        // Same handling as in getConfigurationsForBranch(): an empty clause must not produce a dangling "AND ".
        $treePermsClause = empty($perms_clause) ? '' : ('AND ' . $perms_clause);
        $tree->init($treePermsClause);

        $pageInfo = BackendUtility::readPageAccess($id, $perms_clause);
        if (is_array($pageInfo)) {
            // Set root row:
            $tree->tree[] = [
                'row' => $pageInfo,
                'HTML' => $this->iconFactory->getIconForRecord('pages', $pageInfo, Icon::SIZE_SMALL),
            ];
        }

        // Get branch beneath:
        if ($depth) {
            $tree->getTree($id, $depth, '');
        }

        // Traverse page tree:
        $code = '';

        foreach ($tree->tree as $data) {
            $this->MP = false;

            // recognize mount points
            // (compared as integers: depending on the database driver the row value may be a numeric string)
            if ((int) $data['row']['doktype'] === (int) PageRepository::DOKTYPE_MOUNTPOINT) {
                // NOTE(review): the result is accessed as a list of rows ([0]) — confirm that
                // $this->pageRepository->getPage() really returns that shape.
                $mountpage = $this->pageRepository->getPage($data['row']['uid']);

                // fetch mounted pages
                $this->MP = $mountpage[0]['mount_pid'] . '-' . $data['row']['uid'];

                $mountTree = GeneralUtility::makeInstance(PageTreeView::class);
                $mountTree->init($treePermsClause);
                $mountTree->getTree($mountpage[0]['mount_pid'], $depth);

                foreach ($mountTree->tree as $mountData) {
                    $code .= $this->drawURLs_addRowsForPage(
                        $mountData['row'],
                        $mountData['HTML'] . BackendUtility::getRecordTitle('pages', $mountData['row'], true)
                    );
                }

                // replace page when mount_pid_ol is enabled
                if ($mountpage[0]['mount_pid_ol']) {
                    $data['row']['uid'] = $mountpage[0]['mount_pid'];
                } else {
                    // if the mount_pid_ol is not set the MP must not be used for the mountpoint page
                    $this->MP = false;
                }
            }

            $code .= $this->drawURLs_addRowsForPage(
                $data['row'],
                $data['HTML'] . BackendUtility::getRecordTitle('pages', $data['row'], true)
            );
        }

        return $code;
    }
1391
1392
    /**
     * Expands an exclude string into the list of page uids it refers to.
     *
     * The string is a comma separated list of parts, each either "<pid>" or
     * "<pid>+<depth>". A part without "+" contributes only the page itself. A part
     * with "+" additionally contributes every page PageTreeView::getTree() returns
     * for that pid and depth; when "+" is present but the depth is empty (or "0"),
     * a depth of 99 is used.
     *
     * Results are memoised in static caches, i.e. for the lifetime of the PHP process.
     *
     * @param string $excludeString Exclude string
     * @return array Integer page uids; duplicates are removed with array_unique(), so keys may have gaps
     */
    public function expandExcludeString($excludeString)
    {
        // internal static caches;
        static $expandedExcludeStringCache = [];
        static $treeCache = [];

        $cacheKey = (string) $excludeString;
        if (isset($expandedExcludeStringCache[$cacheKey])) {
            return $expandedExcludeStringCache[$cacheKey];
        }

        $pidList = [];

        if (! empty($excludeString)) {
            /** @var PageTreeView $tree */
            $tree = GeneralUtility::makeInstance(PageTreeView::class);
            $tree->init('AND ' . $this->getBackendUser()->getPagePermsClause(Permission::PAGE_SHOW));

            foreach (GeneralUtility::trimExplode(',', $cacheKey) as $excludePart) {
                // A part without "+" explodes to a single element; pad it so that the
                // depth slot always exists instead of raising an undefined offset.
                [$pid, $depth] = array_pad(GeneralUtility::trimExplode('+', $excludePart), 2, '');
                $pid = (int) $pid;

                // default is "page only" = "depth=0"; a "+" without a usable depth means depth 99
                if (empty($depth)) {
                    $depth = strpos($excludePart, '+') !== false ? 99 : 0;
                }
                // Cast so that the comparison below and getTree() always receive an integer.
                $depth = (int) $depth;

                $pidList[] = $pid;

                if ($depth > 0) {
                    if (empty($treeCache[$pid][$depth])) {
                        $tree->reset();
                        $tree->getTree($pid, $depth);
                        $treeCache[$pid][$depth] = $tree->tree;
                    }

                    foreach ($treeCache[$pid][$depth] as $data) {
                        $pidList[] = (int) $data['row']['uid'];
                    }
                }
            }
        }

        $expandedExcludeStringCache[$cacheKey] = array_unique($pidList);

        return $expandedExcludeStringCache[$cacheKey];
    }
1443
1444
    /**
     * Create the rows for display of the page tree
     * For each page a number of rows are shown displaying GET variable configuration
     *
     * The configurations come from getUrlsForPageRow(); when
     * $this->incomingConfigurationSelection is not empty, only configurations whose
     * key is contained in it are rendered. A page listed in a configuration's
     * "exclude" setting gets a "No entries" row for that configuration.
     *
     * @param array $pageRow Page record; its "uid" is compared against the expanded exclude list
     * @param string $pageTitle HTML for the title cell, inserted as-is (not escaped here)
     * @return string HTML table rows (<tr>…</tr>) for this page
     */
    public function drawURLs_addRowsForPage(array $pageRow, string $pageTitle): string
    {
        $skipMessage = '';

        // Get list of configurations
        $configurations = $this->getUrlsForPageRow($pageRow, $skipMessage);

        if (! empty($this->incomingConfigurationSelection)) {
            // remove configuration that does not match the current selection
            foreach ($configurations as $confKey => $confArray) {
                // Numeric-looking configuration names arrive as integer array keys;
                // cast so the strict comparison still matches the selected names.
                if (! in_array((string) $confKey, $this->incomingConfigurationSelection, true)) {
                    unset($configurations[$confKey]);
                }
            }
        }

        // Traverse parameter combinations:
        $rowIndex = 0;
        $content = '';
        if (! empty($configurations)) {
            foreach ($configurations as $confKey => $confArray) {
                $subCfg = $confArray['subCfg'] ?? [];

                // Title column: only the first row carries it, spanning all configurations
                $titleClm = $rowIndex === 0
                    ? '<td rowspan="' . count($configurations) . '">' . $pageTitle . '</td>'
                    : '';

                // expandExcludeString() returns integers, so compare against an integer uid
                $excludedPageIds = $this->expandExcludeString($subCfg['exclude'] ?? '');

                if (! in_array((int) $pageRow['uid'], $excludedPageIds, true)) {

                    // URL list:
                    $urlList = $this->urlListFromUrlArray(
                        $confArray,
                        $pageRow,
                        $this->scheduledTime,
                        $this->reqMinute,
                        $this->submitCrawlUrls,
                        $this->downloadCrawlUrls,
                        $this->duplicateTrack,
                        $this->downloadUrls,
                        // if empty the urls won't be filtered by processing instructions
                        $this->incomingProcInstructions
                    );

                    // Expanded parameters:
                    $paramExpanded = '';
                    $calcAccu = [];
                    $calcRes = 1;
                    foreach ($confArray['paramExpanded'] ?? [] as $gVar => $gVal) {
                        $paramExpanded .= '
                            <tr>
                                <td>' . htmlspecialchars('&' . $gVar . '=') . '<br/>' .
                            '(' . count($gVal) . ')' .
                            '</td>
                                <td nowrap="nowrap">' . nl2br(htmlspecialchars(implode(chr(10), $gVal))) . '</td>
                            </tr>
                        ';
                        $calcRes *= count($gVal);
                        $calcAccu[] = count($gVal);
                    }
                    $paramExpanded = '<table>' . $paramExpanded . '</table>';
                    $paramExpanded .= 'Comb: ' . implode('*', $calcAccu) . '=' . $calcRes;

                    // Options (values are configuration text, so escape them for HTML output)
                    $optionValues = '';
                    if (! empty($subCfg['userGroups'])) {
                        $optionValues .= 'User Groups: ' . htmlspecialchars((string) $subCfg['userGroups']) . '<br/>';
                    }
                    if (! empty($subCfg['procInstrFilter'])) {
                        $optionValues .= 'ProcInstr: ' . htmlspecialchars((string) $subCfg['procInstrFilter']) . '<br/>';
                    }

                    // Compile row:
                    $content .= '
                        <tr>
                            ' . $titleClm . '
                            <td>' . htmlspecialchars((string) $confKey) . '</td>
                            <td>' . nl2br(htmlspecialchars(rawurldecode(trim(str_replace('&', chr(10) . '&', GeneralUtility::implodeArrayForUrl('', $confArray['paramParsed'] ?? [])))))) . '</td>
                            <td>' . $paramExpanded . '</td>
                            <td nowrap="nowrap">' . $urlList . '</td>
                            <td nowrap="nowrap">' . $optionValues . '</td>
                            <td nowrap="nowrap">' . DebugUtility::viewArray($subCfg['procInstrParams.'] ?? null) . '</td>
                        </tr>';
                } else {
                    $content .= '<tr>
                            ' . $titleClm . '
                            <td>' . htmlspecialchars((string) $confKey) . '</td>
                            <td colspan="5"><em>No entries</em> (Page is excluded in this configuration)</td>
                        </tr>';
                }

                $rowIndex++;
            }
        } else {
            $message = ! empty($skipMessage) ? ' (' . $skipMessage . ')' : '';

            // Compile row:
            $content .= '
                <tr>
                    <td>' . $pageTitle . '</td>
                    <td colspan="6"><em>No entries</em>' . $message . '</td>
                </tr>';
        }

        return $content;
    }
1555
1556
    /*****************************
1557
     *
1558
     * CLI functions
1559
     *
1560
     *****************************/
1561
1562
    /**
     * Running the functionality of the CLI (crawling URLs from queue)
     *
     * Runs the CLI hooks, cleans up the queue, fetches up to $countInARun queue
     * records, assigns them to the current process id and reads them one by one.
     *
     * @param int $countInARun Passed to QueueRepository::fetchRecordsToBeCrawled() as the batch size
     * @param int $sleepTime Pause after each record, handed to usleep()
     * @param int $sleepAfterFinish Pause after the batch, handed to sleep()
     * @return int Bitmask of the readUrl() results combined with the self::CLI_STATUS_* flags set here
     */
    public function CLI_run(int $countInARun, int $sleepTime, int $sleepAfterFinish): int
    {
        $status = 0;
        $processedRows = 0;

        // First, run hooks:
        $this->CLI_runHooks();

        // Clean up the queue
        $this->queueRepository->cleanupQueue();

        // Select entries:
        $queueRows = $this->queueRepository->fetchRecordsToBeCrawled($countInARun);

        if (empty($queueRows)) {
            $this->CLI_debug('Nothing within queue which needs to be processed (' . $this->CLI_buildProcessId() . ')');

            return $status;
        }

        $queueIds = array_column($queueRows, 'qid');
        $processId = $this->CLI_buildProcessId();

        // remember how many queue entries were assigned, to determine later how many have been processed
        $assignedCount = $this->queueRepository->updateProcessIdAndSchedulerForQueueIds($queueIds, $processId);
        $this->processRepository->updateProcessAssignItemsCount($assignedCount, $processId);

        if ($assignedCount !== count($queueIds)) {
            $this->CLI_debug('Nothing processed due to multi-process collision (' . $this->CLI_buildProcessId() . ')');

            return $status | self::CLI_STATUS_ABORTED;
        }

        foreach ($queueRows as $queueRow) {
            $status |= $this->readUrl($queueRow['qid']);
            ++$processedRows;

            // Just to relax the system
            usleep($sleepTime);

            // If the CLI got disabled while this run was in progress, leave right away
            // and do NOT mark the process as ended.
            if ($this->getDisabled()) {
                return $status | self::CLI_STATUS_ABORTED;
            }

            if ($this->processRepository->isProcessActive($this->CLI_buildProcessId())) {
                continue;
            }

            // possible timeout
            $this->CLI_debug('conflict / timeout (' . $this->CLI_buildProcessId() . ')');
            $status |= self::CLI_STATUS_ABORTED;
            break;
        }

        sleep($sleepAfterFinish);

        $this->CLI_debug('Rows: ' . $processedRows . ' (' . $this->CLI_buildProcessId() . ')');

        if ($processedRows > 0) {
            $status |= self::CLI_STATUS_PROCESSED;
        }

        return $status;
    }
1632
1633
    /**
     * Activate hooks
     *
     * Instantiates every entry of
     * $GLOBALS['TYPO3_CONF_VARS']['EXTCONF']['crawler']['cli_hooks'] and calls
     * crawler_init() on it with this controller as argument.
     */
    public function CLI_runHooks(): void
    {
        $hookReferences = $GLOBALS['TYPO3_CONF_VARS']['EXTCONF']['crawler']['cli_hooks'] ?? [];

        foreach ($hookReferences as $hookReference) {
            $hook = GeneralUtility::makeInstance($hookReference);
            if (! is_object($hook)) {
                continue;
            }

            $hook->crawler_init($this);
        }
    }
1645
1646
    /**
     * Try to acquire a new process with the given id
     * also performs some auto-cleanup for orphan processes
     *
     * Active, non-deleted rows of tx_crawler_process whose ttl lies before the
     * current time are treated as orphans and handed to CLI_releaseProcesses();
     * the remaining rows count against the configured "processLimit".
     *
     * @param string $id identification string for the process
     * @return bool true when a process row was inserted; false when the process limit is reached
     *              or no system process id could be determined
     * @todo preemption might not be the most elegant way to clean up
     */
    public function CLI_checkAndAcquireNewProcess($id)
    {
        $systemProcessId = getmypid();
        if ($systemProcessId < 1) {
            return false;
        }

        $processTable = 'tx_crawler_process';
        $connectionPool = GeneralUtility::makeInstance(ConnectionPool::class);
        $processLimit = (int) $this->extensionSettings['processLimit'];

        // Use the builder that belongs to the process table, matching the connection
        // the insert below uses.
        $statement = $connectionPool->getQueryBuilderForTable($processTable)
            ->select('process_id', 'ttl')
            ->from($processTable)
            ->where(
                'active = 1 AND deleted = 0'
            )
            ->execute();

        $currentTime = $this->getCurrentTime();

        $processCount = 0;
        $orphanProcesses = [];
        while ($row = $statement->fetch()) {
            if ($row['ttl'] < $currentTime) {
                $orphanProcesses[] = $row['process_id'];
            } else {
                $processCount++;
            }
        }

        // if there are less than allowed active processes then add a new one
        $ret = $processCount < $processLimit;
        if ($ret) {
            $this->CLI_debug('add process ' . $this->CLI_buildProcessId() . ' (' . ($processCount + 1) . '/' . $processLimit . ')');

            $connectionPool->getConnectionForTable($processTable)->insert(
                $processTable,
                [
                    'process_id' => $id,
                    'active' => 1,
                    'ttl' => $currentTime + (int) $this->extensionSettings['processMaxRunTime'],
                    'system_process_id' => $systemProcessId,
                ]
            );
        } else {
            $this->CLI_debug('Processlimit reached (' . ($processCount) . '/' . $processLimit . ')');
        }

        $this->processRepository->deleteProcessesMarkedAsDeleted();
        $this->CLI_releaseProcesses($orphanProcesses);

        return $ret;
    }
1707
1708
    /**
     * Release a process and the required resources
     *
     * Besides releasing the given ids through the process and queue repositories,
     * two housekeeping UPDATE statements are executed first (see inline comments).
     *
     * @param mixed $releaseIds string with a single process-id or array with multiple process-ids
     * @return bool false when there was nothing to release, true otherwise
     */
    public function CLI_releaseProcesses($releaseIds)
    {
        if (! is_array($releaseIds)) {
            $releaseIds = [$releaseIds];
        }

        if (empty($releaseIds)) {
            //nothing to release
            return false;
        }

        $connectionPool = GeneralUtility::makeInstance(ConnectionPool::class);

        // some kind of 2nd chance algo - this way you need at least 2 processes to have a real cleanup
        // this ensures that a single process can't mess up the entire process table

        // Detach queue entries that are still assigned to processes which are no longer active.
        $connectionPool->getQueryBuilderForTable($this->tableName)
            ->update($this->tableName, 'q')
            ->where(
                'q.process_id IN(SELECT p.process_id FROM tx_crawler_process as p WHERE p.active = 0)'
            )
            ->set('q.process_scheduled', 0)
            ->set('q.process_id', '')
            ->execute();

        // A fresh builder per statement: nothing (SET parts, bound parameters) leaks
        // from the queue update above into the process update below.
        // FIXME: Not entirely sure that this is equivalent to the previous version
        // NOTE(review): this resets system_process_id for inactive processes that still own
        // unprocessed queue entries (exec_time = 0) - confirm this is the intended clean-up.
        $processQueryBuilder = $connectionPool->getQueryBuilderForTable('tx_crawler_process');
        $processQueryBuilder
            ->update('tx_crawler_process')
            ->where(
                $processQueryBuilder->expr()->eq('active', 0),
                'process_id IN(SELECT q.process_id FROM tx_crawler_queue as q WHERE q.exec_time = 0)'
            )
            ->set('system_process_id', 0)
            ->execute();

        $this->processRepository->markRequestedProcessesAsNotActive($releaseIds);
        $this->queueRepository->unsetProcessScheduledAndProcessIdForQueueEntries($releaseIds);

        return true;
    }
1758
1759
    /**
     * Returns the id of the current process.
     *
     * The id is generated on first use by hashing microtime(true) with
     * GeneralUtility::shortMD5() and is then kept in $this->processID.
     *
     * @return string the ID
     */
    public function CLI_buildProcessId()
    {
        if ($this->processID) {
            return $this->processID;
        }

        $this->processID = GeneralUtility::shortMD5(microtime(true));

        return $this->processID;
    }
1771
1772
    /**
     * Echoes a message followed by a newline, but only when the extension
     * setting "processDebug" casts to a non-zero integer.
     *
     * @param string $msg the message
     * @deprecated
     */
    public function CLI_debug($msg): void
    {
        $debugEnabled = ((int) $this->extensionSettings['processDebug']) !== 0;
        if (! $debugEnabled) {
            return;
        }

        echo $msg . "\n";
        flush();
    }
1785
1786
    /**
     * Cleans up entries that stayed for too long in the queue. These are:
     * - processed entries (exec_time set) older than "cleanUpProcessedAge" days
     * - entries scheduled at least "cleanUpScheduledAge" days ago
     *
     * Both thresholds are read from the extension settings and converted from
     * days to seconds before being handed to flushQueue() as an SQL condition.
     *
     * @deprecated
     */
    public function cleanUpOldQueueEntries(): void
    {
        // 24*60*60 Seconds in 24 hours
        $secondsPerDay = 86400;

        $processedAgeInSeconds = $this->extensionSettings['cleanUpProcessedAge'] * $secondsPerDay;
        $scheduledAgeInSeconds = $this->extensionSettings['cleanUpScheduledAge'] * $secondsPerDay;

        $now = time();
        $processedBefore = $now - $processedAgeInSeconds;
        $scheduledBefore = $now - $scheduledAgeInSeconds;

        $this->flushQueue(
            '(exec_time<>0 AND exec_time<' . $processedBefore . ') OR scheduled<=' . $scheduledBefore
        );
    }
1803
1804
    /**
     * Removes queue entries
     *
     * Before deleting, the matching entries are grouped by set_id and for every
     * group SignalSlotUtility::SIGNAL_QUEUE_ENTRY_FLUSH is emitted with the group's
     * qid/set_id rows as payload ("subSet").
     *
     * @param string $where SQL related filter for the entries which should be removed;
     *                      inserted verbatim into the statements, an empty value matches all entries
     *
     * @deprecated
     */
    protected function flushQueue($where = ''): void
    {
        $realWhere = strlen((string) $where) > 0 ? $where : '1=1';

        // Every statement gets its own query builder so that no SELECT/FROM/WHERE
        // parts or bound parameters carry over from one statement to the next.
        $groups = $this->getQueryBuilder($this->tableName)
            ->selectLiteral('DISTINCT set_id')
            ->from($this->tableName)
            ->where($realWhere)
            ->execute()
            ->fetchAll();

        foreach ($groups as $group) {
            $subSetQueryBuilder = $this->getQueryBuilder($this->tableName);
            $subSet = $subSetQueryBuilder
                ->select('qid', 'set_id')
                ->from($this->tableName)
                ->where(
                    $realWhere,
                    // bind the value instead of concatenating it into the SQL
                    $subSetQueryBuilder->expr()->eq(
                        'set_id',
                        $subSetQueryBuilder->createNamedParameter($group['set_id'], \PDO::PARAM_INT)
                    )
                )
                ->execute()
                ->fetchAll();

            $payLoad = ['subSet' => $subSet];
            SignalSlotUtility::emitSignal(
                self::class,
                SignalSlotUtility::SIGNAL_QUEUE_ENTRY_FLUSH,
                $payLoad
            );
        }

        $this->getQueryBuilder($this->tableName)
            ->delete($this->tableName)
            ->where($realWhere)
            ->execute();
    }
1849
1850
    /**
     * This method determines duplicates for a queue entry with the same parameters and this timestamp.
     * If the timestamp is in the past, it will check if there is any unprocessed queue entry in the past.
     * If the timestamp is in the future it will check, if the queued entry has exactly the same timestamp
     *
     * Only entries matching the "NOT exec_time" and "NOT process_id" conditions
     * with the same page_id and parameters_hash are considered.
     *
     * @param int $tstamp Schedule time of the entry to check
     * @param array $fieldArray Queue entry fields; "page_id" and "parameters_hash" are read
     *
     * @return array qid values of the duplicate queue entries
     */
    protected function getDuplicateRowsIfExist($tstamp, $fieldArray)
    {
        $currentTime = $this->getCurrentTime();

        $queryBuilder = GeneralUtility::makeInstance(ConnectionPool::class)->getQueryBuilderForTable($this->tableName);
        $queryBuilder
            ->select('qid')
            ->from('tx_crawler_queue');

        if ($tstamp > $currentTime) {
            // entry with a timestamp in the future need to have the same schedule time;
            // the caller supplied value is bound as a parameter rather than concatenated
            $queryBuilder
                ->where(
                    $queryBuilder->expr()->eq(
                        'scheduled',
                        $queryBuilder->createNamedParameter((int) $tstamp, \PDO::PARAM_INT)
                    )
                );
        } elseif ($this->extensionSettings['enableTimeslot']) {
            // this entry is scheduled with "now": accept a +/- 100 second window as well
            $timeBegin = $currentTime - 100;
            $timeEnd = $currentTime + 100;
            $queryBuilder
                ->where(
                    'scheduled BETWEEN ' . $timeBegin . ' AND ' . $timeEnd . ''
                )
                ->orWhere(
                    $queryBuilder->expr()->lte('scheduled', $currentTime)
                );
        } else {
            // this entry is scheduled with "now"
            $queryBuilder
                ->where(
                    $queryBuilder->expr()->lte('scheduled', $currentTime)
                );
        }

        $queryBuilder
            ->andWhere('NOT exec_time')
            ->andWhere('NOT process_id')
            ->andWhere($queryBuilder->expr()->eq('page_id', $queryBuilder->createNamedParameter($fieldArray['page_id'], \PDO::PARAM_INT)))
            ->andWhere($queryBuilder->expr()->eq('parameters_hash', $queryBuilder->createNamedParameter($fieldArray['parameters_hash'], \PDO::PARAM_STR)));

        $statement = $queryBuilder->execute();

        $rows = [];
        while ($row = $statement->fetch()) {
            $rows[] = $row['qid'];
        }

        return $rows;
    }
1910
1911
    /**
     * Returns the md5 hash of the serialized configuration array, ignoring
     * its "paramExpanded" and "URLs" entries.
     *
     * @return string
     */
    protected function getConfigurationHash(array $configuration)
    {
        $ignoredKeys = ['paramExpanded' => true, 'URLs' => true];
        $hashRelevantConfiguration = array_diff_key($configuration, $ignoredKeys);

        return md5(serialize($hashRelevantConfiguration));
    }
1922
1923
    /**
     * Builds the URL for a page and query string by delegating to
     * UrlService::getUrlFromPageAndQueryParameters() on a new UrlService instance.
     *
     * @param int $httpsOrHttp see tx_crawler_configuration.force_ssl
     * @throws \TYPO3\CMS\Core\Exception\SiteNotFoundException
     * @throws \TYPO3\CMS\Core\Routing\InvalidRouteArgumentsException
     *
     * @deprecated Using CrawlerController::getUrlFromPageAndQueryParameters() is deprecated since 9.1.1 and will be removed in v11.x, please use UrlService->getUrlFromPageAndQueryParameters() instead.
     */
    protected function getUrlFromPageAndQueryParameters(int $pageId, string $queryString, ?string $alternativeBaseUrl, int $httpsOrHttp): UriInterface
    {
        return (new UrlService())->getUrlFromPageAndQueryParameters(
            $pageId,
            $queryString,
            $alternativeBaseUrl,
            $httpsOrHttp
        );
    }
1938
1939 1
    /**
     * Returns $reg with the values at index 1 and 2 exchanged when the value at
     * index 1 is greater than the value at index 2; otherwise $reg is returned as given.
     */
    protected function swapIfFirstIsLargerThanSecond(array $reg): array
    {
        if (! ($reg[1] > $reg[2])) {
            return $reg;
        }

        // Swap if first is larger than last:
        [$reg[1], $reg[2]] = [$reg[2], $reg[1]];

        return $reg;
    }
1950
1951
    /**
     * Returns the backend user, taking it from $GLOBALS['BE_USER'] the first
     * time and keeping it in $this->backendUser afterwards.
     *
     * Bootstrap::initializeBackendAuthentication() is called on every invocation.
     *
     * @return BackendUserAuthentication
     */
    private function getBackendUser()
    {
        // Make sure the _cli_ user is loaded
        Bootstrap::initializeBackendAuthentication();

        if ($this->backendUser !== null) {
            return $this->backendUser;
        }

        $this->backendUser = $GLOBALS['BE_USER'];

        return $this->backendUser;
    }
1963
1964
    /**
     * Returns the query builder the ConnectionPool provides for the given table.
     *
     * @return \TYPO3\CMS\Core\Database\Query\QueryBuilder
     */
    private function getQueryBuilder(string $table)
    {
        $connectionPool = GeneralUtility::makeInstance(ConnectionPool::class);

        return $connectionPool->getQueryBuilderForTable($table);
    }
1973
}
1974