Passed
Push — refactor/crawlerController ( b87fbb...2c0518 )
by Tomas Norre
07:14
created

CrawlerController::expandParameters()   F

Complexity

Conditions 25
Paths 831

Size

Total Lines 129
Code Lines 74

Duplication

Lines 0
Ratio 0 %

Code Coverage

Tests 58
CRAP Score 28.1471

Importance

Changes 1
Bugs 0 Features 0
Metric Value
eloc 74
nc 831
nop 2
dl 0
loc 129
c 1
b 0
f 0
cc 25
ccs 58
cts 70
cp 0.8286
crap 28.1471
rs 0.2347

How to fix   Long Method    Complexity   

Long Method

Small methods make your code easier to understand, in particular if combined with a good name. Besides, if your method is small, finding a good name is usually much easier.

For example, if you find yourself adding comments to a method's body, this is usually a good sign to extract the commented part to a new method, and use the comment as a starting point when coming up with a good name for this new method.

Commonly applied refactorings include:

1
<?php
2
3
declare(strict_types=1);
4
5
namespace AOE\Crawler\Controller;
6
7
/***************************************************************
8
 *  Copyright notice
9
 *
10
 *  (c) 2020 AOE GmbH <[email protected]>
11
 *
12
 *  All rights reserved
13
 *
14
 *  This script is part of the TYPO3 project. The TYPO3 project is
15
 *  free software; you can redistribute it and/or modify
16
 *  it under the terms of the GNU General Public License as published by
17
 *  the Free Software Foundation; either version 3 of the License, or
18
 *  (at your option) any later version.
19
 *
20
 *  The GNU General Public License can be found at
21
 *  http://www.gnu.org/copyleft/gpl.html.
22
 *
23
 *  This script is distributed in the hope that it will be useful,
24
 *  but WITHOUT ANY WARRANTY; without even the implied warranty of
25
 *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
26
 *  GNU General Public License for more details.
27
 *
28
 *  This copyright notice MUST APPEAR in all copies of the script!
29
 ***************************************************************/
30
31
use AOE\Crawler\Configuration\ExtensionConfigurationProvider;
32
use AOE\Crawler\Converter\JsonCompatibilityConverter;
33
use AOE\Crawler\CrawlStrategy\CrawlStrategyFactory;
34
use AOE\Crawler\Domain\Model\Process;
35
use AOE\Crawler\Domain\Repository\ConfigurationRepository;
36
use AOE\Crawler\Domain\Repository\ProcessRepository;
37
use AOE\Crawler\Domain\Repository\QueueRepository;
38
use AOE\Crawler\QueueExecutor;
39
use AOE\Crawler\Service\UrlService;
40
use AOE\Crawler\Utility\SignalSlotUtility;
41
use Psr\Http\Message\UriInterface;
42
use Psr\Log\LoggerAwareInterface;
43
use Psr\Log\LoggerAwareTrait;
44
use TYPO3\CMS\Backend\Tree\View\PageTreeView;
45
use TYPO3\CMS\Backend\Utility\BackendUtility;
46
use TYPO3\CMS\Core\Authentication\BackendUserAuthentication;
47
use TYPO3\CMS\Core\Compatibility\PublicMethodDeprecationTrait;
48
use TYPO3\CMS\Core\Compatibility\PublicPropertyDeprecationTrait;
49
use TYPO3\CMS\Core\Core\Bootstrap;
50
use TYPO3\CMS\Core\Core\Environment;
51
use TYPO3\CMS\Core\Database\Connection;
52
use TYPO3\CMS\Core\Database\ConnectionPool;
53
use TYPO3\CMS\Core\Database\Query\Restriction\DeletedRestriction;
54
use TYPO3\CMS\Core\Imaging\Icon;
55
use TYPO3\CMS\Core\Imaging\IconFactory;
56
use TYPO3\CMS\Core\Site\Entity\Site;
57
use TYPO3\CMS\Core\Type\Bitmask\Permission;
58
use TYPO3\CMS\Core\TypoScript\Parser\TypoScriptParser;
59
use TYPO3\CMS\Core\Utility\DebugUtility;
60
use TYPO3\CMS\Core\Utility\GeneralUtility;
61
use TYPO3\CMS\Core\Utility\MathUtility;
62
use TYPO3\CMS\Extbase\Object\ObjectManager;
63
use TYPO3\CMS\Frontend\Page\PageRepository;
64
65
/**
66
 * Class CrawlerController
67
 *
68
 * @package AOE\Crawler\Controller
69
 */
70
class CrawlerController implements LoggerAwareInterface
71
{
72
    use LoggerAwareTrait;
73
    use PublicMethodDeprecationTrait;
74
    use PublicPropertyDeprecationTrait;
75
76
    public const CLI_STATUS_NOTHING_PROCCESSED = 0;
77
78
    //queue not empty
79
    public const CLI_STATUS_REMAIN = 1;
80
81
    // (some) queue items were processed
82
    public const CLI_STATUS_PROCESSED = 2;
83
84
    //instance didn't finish
85
    public const CLI_STATUS_ABORTED = 4;
86
87
    public const CLI_STATUS_POLLABLE_PROCESSED = 8;
88
89
    /**
90
     * @var integer
91
     */
92
    public $setID = 0;
93
94
    /**
95
     * @var string
96
     */
97
    public $processID = '';
98
99
    /**
100
     * @var array
101
     */
102
    public $duplicateTrack = [];
103
104
    /**
105
     * @var array
106
     */
107
    public $downloadUrls = [];
108
109
    /**
110
     * @var array
111
     */
112
    public $incomingProcInstructions = [];
113
114
    /**
115
     * @var array
116
     */
117
    public $incomingConfigurationSelection = [];
118
119
    /**
120
     * @var bool
121
     */
122
    public $registerQueueEntriesInternallyOnly = false;
123
124
    /**
125
     * @var array
126
     */
127
    public $queueEntries = [];
128
129
    /**
130
     * @var array
131
     */
132
    public $urlList = [];
133
134
    /**
135
     * @var array
136
     */
137
    public $extensionSettings = [];
138
139
    /**
140
     * Mount Point
141
     *
142
     * @var bool
143
     * Todo: Check what this is used for and adjust the type hint or code, as bool doesn't match the current code.
144
     */
145
    public $MP = false;
146
147
    /**
148
     * @var string
149
     */
150
    protected $processFilename;
151
152
    /**
153
     * Holds the internal access mode; can be 'gui', 'cli' or 'cli_im'
154
     *
155
     * @var string
156
     * @deprecated
157
     */
158
    protected $accessMode;
159
160
    /**
161
     * @var QueueRepository
162
     */
163
    protected $queueRepository;
164
165
    /**
166
     * @var ProcessRepository
167
     */
168
    protected $processRepository;
169
170
    /**
171
     * @var ConfigurationRepository
172
     */
173
    protected $configurationRepository;
174
175
    /**
176
     * @var string
177
     */
178
    protected $tableName = 'tx_crawler_queue';
179
180
    /**
181
     * @var QueueExecutor
182
     */
183
    protected $queueExecutor;
184
185
    /**
186
     * @var int
187
     */
188
    protected $maximumUrlsToCompile = 10000;
189
190
    /**
191
     * @var IconFactory
192
     */
193
    protected $iconFactory;
194
195
    /**
196
     * @var string[]
197
     */
198
    private $deprecatedPublicMethods = [
0 ignored issues
show
introduced by
The private property $deprecatedPublicMethods is not used, and could be removed.
Loading history...
199
        'cleanUpOldQueueEntries' => 'Using CrawlerController::cleanUpOldQueueEntries() is deprecated since 9.0.1 and will be removed in v11.x, please use QueueRepository->cleanUpOldQueueEntries() instead.',
200
        'CLI_debug' => 'Using CrawlerController->CLI_debug() is deprecated since 9.1.3 and will be removed in v11.x',
201
        'getAccessMode' => 'Using CrawlerController->getAccessMode() is deprecated since 9.1.3 and will be removed in v11.x',
202
        'getLogEntriesForSetId' => 'Using crawlerController::getLogEntriesForSetId() is deprecated since 9.0.1 and will be removed in v11.x',
203
        'flushQueue' => 'Using CrawlerController::flushQueue() is deprecated since 9.0.1 and will be removed in v11.x, please use QueueRepository->flushQueue() instead.',
204
        'setAccessMode' => 'Using CrawlerController->setAccessMode() is deprecated since 9.1.3 and will be removed in v11.x'
205
    ];
206
207
    /**
208
     * @var string[]
209
     */
210
    private $deprecatedPublicProperties = [
1 ignored issue
show
introduced by
The private property $deprecatedPublicProperties is not used, and could be removed.
Loading history...
211
        'accessMode' => 'Using CrawlerController->accessMode is deprecated since 9.1.3 and will be removed in v11.x'
212
    ];
213
214
    /**
215
     * @var BackendUserAuthentication|null
216
     */
217
    private $backendUser;
218
219
    /**
220
     * @var integer
221
     */
222
    private $scheduledTime = 0;
223
224
    /**
225
     * @var integer
226
     */
227
    private $reqMinute = 0;
228
229
    /**
230
     * @var bool
231
     */
232
    private $submitCrawlUrls = false;
233
234
    /**
235
     * @var bool
236
     */
237
    private $downloadCrawlUrls = false;
238
239
    /**
240
     * @var PageRepository
241
     */
242
    private $pageRepository;
243
244
    /************************************
245
     *
246
     * Getting URLs based on Page TSconfig
247
     *
248
     ************************************/
249
250 36
    /**
     * Creates the repositories, the queue executor and the icon factory, sets the default
     * lock file path below the TYPO3 var path and loads the extension configuration.
     *
     * The settings "countInARun", "processLimit" and "maxCompileUrls" are normalised here.
     * A key that is missing from the configuration is handled like an empty value, so the
     * defaults (100, 1 and 10000) apply instead of an "undefined index" notice being raised.
     */
    public function __construct()
    {
        $objectManager = GeneralUtility::makeInstance(ObjectManager::class);
        $crawlStrategyFactory = GeneralUtility::makeInstance(CrawlStrategyFactory::class);
        $this->queueRepository = $objectManager->get(QueueRepository::class);
        $this->processRepository = $objectManager->get(ProcessRepository::class);
        $this->configurationRepository = $objectManager->get(ConfigurationRepository::class);
        $this->pageRepository = $objectManager->get(PageRepository::class);
        $this->queueExecutor = GeneralUtility::makeInstance(QueueExecutor::class, $crawlStrategyFactory);
        $this->iconFactory = GeneralUtility::makeInstance(IconFactory::class);

        $this->processFilename = Environment::getVarPath() . '/lock/tx_crawler.proc';

        /** @var ExtensionConfigurationProvider $configurationProvider */
        $configurationProvider = GeneralUtility::makeInstance(ExtensionConfigurationProvider::class);
        $settings = $configurationProvider->getExtensionConfiguration();
        $this->extensionSettings = is_array($settings) ? $settings : [];

        // Fall back to 100 URLs per run when the setting is missing, zero or negative.
        if (MathUtility::convertToPositiveInteger($this->extensionSettings['countInARun'] ?? 0) === 0) {
            $this->extensionSettings['countInARun'] = 100;
        }

        $this->extensionSettings['processLimit'] = MathUtility::forceIntegerInRange(
            $this->extensionSettings['processLimit'] ?? 0,
            1,
            99,
            1
        );
        $this->maximumUrlsToCompile = MathUtility::forceIntegerInRange(
            $this->extensionSettings['maxCompileUrls'] ?? 0,
            1,
            1000000000,
            10000
        );
    }
276
277
    /**
     * Returns the internal access mode, which can be 'gui', 'cli' or 'cli_im'.
     *
     * @return string
     * @deprecated
     */
    public function getAccessMode()
    {
        $currentAccessMode = $this->accessMode;

        return $currentAccessMode;
    }
287
288
    /**
     * Stores the internal access mode.
     *
     * @param string $accessMode New access mode value
     * @deprecated
     */
    public function setAccessMode($accessMode): void
    {
        $this->accessMode = $accessMode;
    }
296
297
    /**
     * Switches the "disabled" marker on or off.
     *
     * Disabling writes an empty file to the configured process filename; enabling
     * removes that file again if it is present.
     *
     * @param bool $disabled (optional, defaults to true)
     */
    public function setDisabled($disabled = true): void
    {
        if ($disabled) {
            GeneralUtility::writeFile($this->processFilename, '');

            return;
        }

        if (is_file($this->processFilename)) {
            unlink($this->processFilename);
        }
    }
312
313
    /**
     * Tells whether the "disabled" marker file currently exists.
     *
     * @return bool true if disabled
     */
    public function getDisabled()
    {
        $markerFileExists = is_file($this->processFilename);

        return $markerFileExists;
    }
322
323
    /**
     * Overrides the path of the file used as "disabled" marker.
     *
     * @param string $filenameWithPath File name including its path
     */
    public function setProcessFilename($filenameWithPath): void
    {
        $this->processFilename = $filenameWithPath;
    }
330
331
    /**
     * Returns the path of the file used as "disabled" marker.
     *
     * @return string
     */
    public function getProcessFilename()
    {
        $filenameWithPath = $this->processFilename;

        return $filenameWithPath;
    }
338
339
    /**
     * Replaces the extension settings held by this controller
     * (unserialized pendant of $TYPO3_CONF_VARS['EXT']['extConf']['crawler']).
     *
     * The given array is stored as-is; the defaults applied in the constructor are not re-applied.
     */
    public function setExtensionSettings(array $extensionSettings): void
    {
        $this->extensionSettings = $extensionSettings;
    }
346
347
    /**
     * Check if the given page should be crawled.
     *
     * The checks run in the following order and the first one that matches decides:
     *  1. the page is hidden and the "crawlHiddenPages" setting is not enabled
     *  2. the doktype is one of 3, 4, 199, 254 or 255
     *  3. the doktype is in a list registered under
     *     $GLOBALS['TYPO3_CONF_VARS']['EXTCONF']['crawler']['excludeDoktype']
     *  4. a hook registered under
     *     $GLOBALS['TYPO3_CONF_VARS']['EXTCONF']['crawler']['pageVeto'] returns anything but false
     *
     * Missing "hidden"/"doktype" columns and a missing "crawlHiddenPages" setting are treated
     * as empty values instead of raising "undefined index" notices.
     *
     * @param array $pageRow Page record; the "hidden" and "doktype" columns are evaluated
     * @return false|string false if the page should be crawled (not excluded), otherwise the skip message
     */
    public function checkIfPageShouldBeSkipped(array $pageRow)
    {
        if (empty($this->extensionSettings['crawlHiddenPages']) && ! empty($pageRow['hidden'])) {
            return 'Because page is hidden';
        }

        $doktype = (string) ($pageRow['doktype'] ?? '');

        if (GeneralUtility::inList('3,4,199,254,255', $doktype)) {
            return 'Because doktype is not allowed';
        }

        foreach ($GLOBALS['TYPO3_CONF_VARS']['EXTCONF']['crawler']['excludeDoktype'] ?? [] as $key => $doktypeList) {
            if (GeneralUtility::inList($doktypeList, $doktype)) {
                return 'Doktype was excluded by "' . $key . '"';
            }
        }

        // veto hooks
        foreach ($GLOBALS['TYPO3_CONF_VARS']['EXTCONF']['crawler']['pageVeto'] ?? [] as $key => $func) {
            $params = [
                'pageRow' => $pageRow,
            ];
            // expects "false" if page is ok and "true" or a skipMessage if this page should _not_ be crawled
            $veto = GeneralUtility::callUserFunction($func, $params, $this);
            if ($veto === false) {
                continue;
            }

            // The first veto wins, later hooks are not executed.
            // The key is cast because numeric array keys are integers, which htmlspecialchars()
            // rejects when strict_types is enabled.
            return is_string($veto)
                ? $veto
                : 'Veto from hook "' . htmlspecialchars((string) $key) . '"';
        }

        return false;
    }
406
407
    /**
     * Wrapper method for getUrlsForPageId()
     * It returns an array of configurations and no urls!
     *
     * The page is first run through checkIfPageShouldBeSkipped(). When it must be skipped,
     * the reason is written to $skipMessage and an empty array is returned.
     *
     * @param array $pageRow Page record with at least dok-type and uid columns.
     * @param string $skipMessage Filled by reference: the skip reason, or '' when the page is not skipped
     * @return array
     * @see getUrlsForPageId()
     */
    public function getUrlsForPageRow(array $pageRow, &$skipMessage = '')
    {
        $message = $this->checkIfPageShouldBeSkipped($pageRow);
        if ($message !== false) {
            $skipMessage = $message;

            return [];
        }

        // getUrlsForPageId() is declared with an int parameter and this file uses strict types,
        // so the uid is cast here the same way urlListFromUrlArray() casts it.
        $res = $this->getUrlsForPageId((int) $pageRow['uid']);
        $skipMessage = '';

        return $res;
    }
429
430
    /**
     * Creates a list of URLs from input array (and submits them to queue if asked for)
     * See Web > Info module script + "indexed_search"'s crawler hook-client using this!
     *
     * A URL whose key (URL, user groups and processing instruction filter) is already present
     * in $duplicateTrack is only listed in muted markup and is neither submitted nor added to
     * $downloadUrls again.
     *
     * @param array $vv Information about URLs from pageRow to crawl.
     * @param array $pageRow Page row
     * @param int $scheduledTime Unix time to schedule indexing to, typically time()
     * @param int $reqMinute Number of requests per minute (creates the interleave between requests);
     *                       values of 0 or below result in no interleave
     * @param bool $submitCrawlUrls If set, submits the URLs to queue
     * @param bool $downloadCrawlUrls If set (and submitcrawlUrls is false) will fill $downloadUrls with entries)
     * @param array $duplicateTrack Array which is passed by reference and contains an id per url to secure we will not crawl duplicates
     * @param array $downloadUrls Array which will be filled with URLS for download if flag is set.
     * @param array $incomingProcInstructions Array of processing instructions
     * @return string List of URLs (meant for display in backend module)
     */
    public function urlListFromUrlArray(
        array $vv,
        array $pageRow,
        $scheduledTime,
        $reqMinute,
        $submitCrawlUrls,
        $downloadCrawlUrls,
        array &$duplicateTrack,
        array &$downloadUrls,
        array $incomingProcInstructions
    ) {
        if (! is_array($vv['URLs'] ?? null)) {
            return 'ERROR - no URL generated';
        }

        $subCfg = $vv['subCfg'] ?? [];
        $procInstrFilter = $subCfg['procInstrFilter'] ?? '';
        $userGroups = $subCfg['userGroups'] ?? '';

        $pageId = (int) $pageRow['uid'];
        $configurationHash = $this->getConfigurationHash($vv);
        $skipInnerCheck = $this->queueRepository->noUnprocessedQueueEntriesForPageWithConfigurationHashExist($pageId, $configurationHash);

        // The filter does not depend on the individual URL, so it is evaluated once.
        // A configuration that is filtered out produces an empty list.
        if (! $this->drawURLs_PIfilter($procInstrFilter, $incomingProcInstructions)) {
            return '';
        }

        // Seconds between two requests; guarded against a division by zero.
        $secondsPerRequest = $reqMinute > 0 ? 60 / $reqMinute : 0;

        $urlService = new UrlService();
        $urlLog = [];

        foreach ($vv['URLs'] as $urlQuery) {
            $url = (string) $urlService->getUrlFromPageAndQueryParameters(
                $pageId,
                $urlQuery,
                $subCfg['baseUrl'] ?? null,
                $subCfg['force_ssl'] ?? 0
            );

            // Create key by which to determine unique-ness:
            $uKey = $url . '|' . $userGroups . '|' . $procInstrFilter;

            if (isset($duplicateTrack[$uKey])) {
                // if the url key is registered just display it and do not resubmit it
                $urlLog[] = '<em><span class="text-muted">' . htmlspecialchars($url) . '</span></em>';
            } else {
                // Scheduled time, rounded down to the full minute:
                $schTime = $scheduledTime + round(count($duplicateTrack) * $secondsPerRequest);
                $schTime = intval($schTime / 60) * 60;
                $formattedDate = BackendUtility::datetime($schTime);
                $this->urlList[] = '[' . $formattedDate . '] ' . $url;
                $urlList = '[' . $formattedDate . '] ' . htmlspecialchars($url);

                // Submit for crawling!
                if ($submitCrawlUrls) {
                    $added = $this->addUrl(
                        $pageId,
                        $url,
                        $subCfg,
                        $scheduledTime,
                        $configurationHash,
                        $skipInnerCheck
                    );
                    if ($added === false) {
                        $urlList .= ' (URL already existed)';
                    }
                } elseif ($downloadCrawlUrls) {
                    $downloadUrls[$url] = $url;
                }
                $urlLog[] = $urlList;
            }
            $duplicateTrack[$uKey] = true;
        }

        return implode('<br>', $urlLog);
    }
514
515
    /**
     * Tells whether at least one of the incoming processing instructions is contained
     * in the given comma-separated list. An empty set of incoming instructions always passes.
     *
     * @param string $piString PI to test
     * @param array $incomingProcInstructions Processing instructions
     * @return bool
     */
    public function drawURLs_PIfilter($piString, array $incomingProcInstructions)
    {
        if ($incomingProcInstructions === []) {
            return true;
        }

        $matched = false;
        foreach ($incomingProcInstructions as $instruction) {
            if (GeneralUtility::inList($piString, $instruction)) {
                $matched = true;
                break;
            }
        }

        return $matched;
    }
535
536 5
    /**
     * Returns the page TSconfig used by the crawler for a page.
     *
     * Without a mount point value in $this->MP the TSconfig of $id is loaded. Otherwise the
     * part of $this->MP after the first "-" is used as page id instead.
     * Hooks registered under
     * $GLOBALS['TYPO3_CONF_VARS']['EXTCONF']['crawler']['getPageTSconfigForId'] receive the
     * configuration by reference and may alter it before it is returned.
     *
     * @param int $id Page id
     */
    public function getPageTSconfigForId($id): array
    {
        if (! $this->MP) {
            $pageTSconfig = BackendUtility::getPagesTSconfig($id);
        } else {
            // $this->MP is declared as bool but is split like a string here, see the Todo on the
            // property. Casting avoids a TypeError under strict types for non-string values and
            // a missing second part falls back to page id 0 instead of an undefined offset.
            $mountPointParts = explode('-', (string) $this->MP);
            $mountPointId = (int) ($mountPointParts[1] ?? 0);
            $pageTSconfig = BackendUtility::getPagesTSconfig($mountPointId);
        }

        // Call a hook to alter configuration
        $hooks = $GLOBALS['TYPO3_CONF_VARS']['EXTCONF']['crawler']['getPageTSconfigForId'] ?? null;
        if (is_array($hooks)) {
            $params = [
                'pageId' => $id,
                'pageTSConfig' => &$pageTSconfig,
            ];
            foreach ($hooks as $userFunc) {
                GeneralUtility::callUserFunction($userFunc, $params, $this);
            }
        }

        return $pageTSconfig;
    }
558
559
    /**
     * Collects the crawler configurations that apply to a page, keyed by configuration name.
     * This method itself submits nothing to the queue.
     *
     * Sources, in this order:
     *  1. "tx_crawler.crawlerCfg.paramSets." from the page TSconfig (origin "pagets")
     *  2. tx_crawler_configuration records found up the rootline; a record never overwrites
     *     a configuration with the same name that was already collected
     * Finally the hooks registered under
     * $GLOBALS['TYPO3_CONF_VARS']['EXTCONF']['crawler']['processUrls'] may alter the result.
     *
     * Each entry holds the keys "subCfg", "paramParsed", "paramExpanded", "URLs" and "origin".
     */
    public function getUrlsForPageId(int $pageId): array
    {
        // Get page TSconfig for page ID
        $pageTSconfig = $this->getPageTSconfigForId($pageId);
        $pageIdAsString = (string) $pageId;

        $res = [];

        // Fetch Crawler Configuration from pageTSconfig
        $crawlerCfg = $pageTSconfig['tx_crawler.']['crawlerCfg.']['paramSets.'] ?? [];
        foreach ($crawlerCfg as $key => $values) {
            if (! is_array($values)) {
                continue;
            }
            $key = str_replace('.', '', (string) $key);
            // Sub configuration for a single configuration string:
            $subCfg = (array) ($crawlerCfg[$key . '.'] ?? []);
            $subCfg['key'] = $key;

            $procInstrFilter = (string) ($subCfg['procInstrFilter'] ?? '');
            if ($procInstrFilter !== '') {
                $subCfg['procInstrFilter'] = implode(',', GeneralUtility::trimExplode(',', $procInstrFilter));
            }

            // process configuration if it is not page-specific or if the specific page is the current page:
            // TODO: Check if $pidOnlyList can be kept as Array instead of imploded
            $pidsOnly = (string) ($subCfg['pidsOnly'] ?? '');
            $pidOnlyList = implode(',', GeneralUtility::trimExplode(',', $pidsOnly, true));
            if ($pidsOnly !== '' && ! GeneralUtility::inList($pidOnlyList, $pageIdAsString)) {
                continue;
            }

            // Explode, process etc.:
            $paramParsed = GeneralUtility::explodeUrl2Array($crawlerCfg[$key] ?? '');
            $paramExpanded = $this->expandParameters($paramParsed, $pageId);

            // recognize MP value
            $startUrl = '?id=' . $pageId;
            if ($this->MP) {
                $startUrl .= '&MP=' . $this->MP;
            }

            $res[$key] = [
                'subCfg' => $subCfg,
                'paramParsed' => $paramParsed,
                'paramExpanded' => $paramExpanded,
                'origin' => 'pagets',
                'URLs' => $this->compileUrls($paramExpanded, [$startUrl]),
            ];
        }

        // Get configuration from tx_crawler_configuration records up the rootline
        $crawlerConfigurations = $this->configurationRepository->getCrawlerConfigurationRecordsFromRootLine($pageId);
        foreach ($crawlerConfigurations as $configurationRecord) {
            // check access to the configuration record
            $hasAccess = empty($configurationRecord['begroups'])
                || $this->getBackendUser()->isAdmin()
                || $this->hasGroupAccess($this->getBackendUser()->user['usergroup_cached_list'], $configurationRecord['begroups']);
            if (! $hasAccess) {
                continue;
            }

            // process configuration if it is not page-specific or if the specific page is the current page:
            // TODO: Check if $pidOnlyList can be kept as Array instead of imploded
            // The value is cast like in the page TSconfig branch; strcmp() on a non-string
            // value would throw a TypeError under strict types.
            $pidsOnly = (string) ($configurationRecord['pidsonly'] ?? '');
            $pidOnlyList = implode(',', GeneralUtility::trimExplode(',', $pidsOnly, true));
            if ($pidsOnly !== '' && ! GeneralUtility::inList($pidOnlyList, $pageIdAsString)) {
                continue;
            }

            $key = $configurationRecord['name'];

            // don't overwrite previously defined paramSets
            if (isset($res[$key])) {
                continue;
            }

            /* @var $TSparserObject \TYPO3\CMS\Core\TypoScript\Parser\TypoScriptParser */
            $TSparserObject = GeneralUtility::makeInstance(TypoScriptParser::class);
            $TSparserObject->parse($configurationRecord['processing_instruction_parameters_ts']);

            $subCfg = [
                'procInstrFilter' => $configurationRecord['processing_instruction_filter'],
                'procInstrParams.' => $TSparserObject->setup,
                'baseUrl' => $configurationRecord['base_url'],
                'force_ssl' => (int) $configurationRecord['force_ssl'],
                'userGroups' => $configurationRecord['fegroups'],
                'exclude' => $configurationRecord['exclude'],
                'key' => $key,
            ];

            if (in_array($pageId, $this->expandExcludeString($subCfg['exclude']), true)) {
                continue;
            }

            $paramParsed = GeneralUtility::explodeUrl2Array($configurationRecord['configuration']);
            $paramExpanded = $this->expandParameters($paramParsed, $pageId);

            $res[$key] = [
                'subCfg' => $subCfg,
                'paramParsed' => $paramParsed,
                'paramExpanded' => $paramExpanded,
                'URLs' => $this->compileUrls($paramExpanded, ['?id=' . $pageId]),
                'origin' => 'tx_crawler_configuration_' . $configurationRecord['uid'],
            ];
        }

        foreach ($GLOBALS['TYPO3_CONF_VARS']['EXTCONF']['crawler']['processUrls'] ?? [] as $func) {
            $params = [
                'res' => &$res,
            ];
            GeneralUtility::callUserFunction($func, $params, $this);
        }

        return $res;
    }
657
658
    /**
     * Collects the names of the crawler configurations available for a page branch.
     *
     * The names come from the "paramSets." of the root page's TSconfig and from
     * tx_crawler_configuration records stored on the pages of the rootline of $rootid
     * or on the pages of the page tree read below $rootid down to $depth.
     * TODO: Write Functional Tests
     */
    public function getConfigurationsForBranch(int $rootid, int $depth): array
    {
        $configurationNames = [];

        $rootTSconfig = $this->getPageTSconfigForId($rootid);
        foreach ($rootTSconfig['tx_crawler.']['crawlerCfg.']['paramSets.'] ?? [] as $setKey => $setValue) {
            if (is_array($setValue)) {
                // Array values carry the TypoScript key with a trailing dot; strip it.
                $configurationNames[] = substr($setKey, -1) === '.' ? substr($setKey, 0, -1) : $setKey;
            }
        }

        $pageIds = [];
        foreach (BackendUtility::BEgetRootLine($rootid) as $rootLinePage) {
            $pageIds[] = $rootLinePage['uid'];
        }

        /* @var PageTreeView $pageTree */
        $pageTree = GeneralUtility::makeInstance(PageTreeView::class);
        $permissionClause = $this->getBackendUser()->getPagePermsClause(Permission::PAGE_SHOW);
        $pageTree->init(empty($permissionClause) ? '' : ('AND ' . $permissionClause));
        $pageTree->getTree($rootid, $depth, '');
        foreach ($pageTree->tree as $treeNode) {
            $pageIds[] = $treeNode['row']['uid'];
        }

        $queryBuilder = $this->getQueryBuilder('tx_crawler_configuration');
        $pidConstraint = $queryBuilder->expr()->in(
            'pid',
            $queryBuilder->createNamedParameter($pageIds, Connection::PARAM_INT_ARRAY)
        );
        $statement = $queryBuilder
            ->select('name')
            ->from('tx_crawler_configuration')
            ->where($pidConstraint)
            ->execute();

        while ($record = $statement->fetch()) {
            $configurationNames[] = $record['name'];
        }

        return $configurationNames;
    }
701
702
    /**
703
     * Check if a user has access to an item
704
     * (e.g. get the group list of the current logged in user from $GLOBALS['TSFE']->gr_list)
705
     *
706
     * @param string $groupList Comma-separated list of (fe_)group UIDs from a user
707
     * @param string $accessList Comma-separated list of (fe_)group UIDs of the item to access
708
     * @return bool TRUE if at least one of the users group UIDs is in the access list or the access list is empty
709
     * @see \TYPO3\CMS\Frontend\Page\PageRepository::getMultipleGroupsWhereClause()
710
     */
711 3
    public function hasGroupAccess($groupList, $accessList)
    {
        // An empty access list grants access right away.
        if (empty($accessList)) {
            return true;
        }

        // Otherwise at least one of the user's group UIDs has to appear in the access list.
        $userGroupUids = GeneralUtility::intExplode(',', $groupList);
        foreach ($userGroupUids as $userGroupUid) {
            if (! GeneralUtility::inList($accessList, $userGroupUid)) {
                continue;
            }

            return true;
        }

        return false;
    }
723
724
    /**
725
     * Will expand the parameters configuration to individual values. This follows a certain syntax of the value of each parameter.
726
     * Syntax of values:
727
     * - Basically: If the value is wrapped in [...] it will be expanded according to the following syntax, otherwise the value is taken literally
728
     * - Configuration is split by "|" and the parts are processed individually and finally added together
729
     * - For each configuration part:
730
     *         - "[int]-[int]" = Integer range, will be expanded to all values in between, values included, starting from low to high (max. 1000). Example "1-34" or "-40--30"
731
     *         - "_TABLE:[TCA table name];[_PID:[optional page id, default is current page]];[_ENABLELANG:1]" = Look up of table records from PID, filtering out deleted records. Example "_TABLE:tt_content; _PID:123"
732
     *        _ENABLELANG:1 picks only original records without their language overlays
733
     *         - Default: Literal value
734
     *
735
     * @param array $paramArray Array with key (GET var name) and values (value of GET var which is configuration for expansion)
736
     * @param integer $pid Current page ID
737
     * @return array
738
     *
739
     * TODO: Write Functional Tests
740
     */
741 11
    public function expandParameters($paramArray, $pid)
    {
        // Traverse parameter names:
        foreach ($paramArray as $p => $v) {
            $v = trim($v);

            // If value is encapsulated in square brackets it means there are some ranges of values to find, otherwise the value is literal
            if (strpos($v, '[') === 0 && substr($v, -1) === ']') {
                // So, find the value inside brackets and reset the paramArray value as an array.
                $v = substr($v, 1, -1);
                $paramArray[$p] = [];

                // Explode parts and traverse them:
                $parts = explode('|', $v);
                foreach ($parts as $pV) {
                    // Look for integer range: (fx. 1-34 or -40--30 // reads minus 40 to minus 30)
                    if (preg_match('/^(-?[0-9]+)\s*-\s*(-?[0-9]+)$/', trim($pV), $reg)) {
                        $reg = $this->swapIfFirstIsLargerThanSecond($reg);

                        // Traverse range, add values. The brake limits a single range to 1000 values.
                        $runAwayBrake = 1000;
                        for ($a = $reg[1]; $a <= $reg[2]; $a++) {
                            $paramArray[$p][] = $a;
                            $runAwayBrake--;
                            if ($runAwayBrake <= 0) {
                                break;
                            }
                        }
                    } elseif (strpos(trim($pV), '_TABLE:') === 0) {
                        // Parse the ";"-separated "KEY:value" sub parts into a map.
                        $subparts = GeneralUtility::trimExplode(';', $pV);
                        $subpartParams = [];
                        foreach ($subparts as $spV) {
                            // Pad to two elements: a sub part without ":" gets a NULL value
                            // instead of triggering an undefined offset.
                            [$pKey, $pVal] = array_pad(GeneralUtility::trimExplode(':', $spV), 2, null);
                            $subpartParams[$pKey] = $pVal;
                        }

                        // Table exists:
                        $table = $subpartParams['_TABLE'] ?? '';
                        if (isset($GLOBALS['TCA'][$table])) {
                            $lookUpPid = isset($subpartParams['_PID']) ? (int) $subpartParams['_PID'] : (int) $pid;
                            $recursiveDepth = isset($subpartParams['_RECURSIVE']) ? (int) $subpartParams['_RECURSIVE'] : 0;
                            $pidField = isset($subpartParams['_PIDFIELD']) ? trim($subpartParams['_PIDFIELD']) : 'pid';
                            $where = $subpartParams['_WHERE'] ?? '';
                            $addTable = $subpartParams['_ADDTABLE'] ?? '';

                            // _FIELD is optional; fall back to "uid" when it is missing or empty.
                            $fieldName = ! empty($subpartParams['_FIELD']) ? $subpartParams['_FIELD'] : 'uid';
                            // Any other field must be configured in the TCA columns of the table.
                            if ($fieldName === 'uid' || ! empty($GLOBALS['TCA'][$table]['columns'][$fieldName])) {
                                $queryBuilder = $this->getQueryBuilder($table);

                                if ($recursiveDepth > 0) {
                                    /** @var \TYPO3\CMS\Core\Database\QueryGenerator $queryGenerator */
                                    $queryGenerator = GeneralUtility::makeInstance(\TYPO3\CMS\Core\Database\QueryGenerator::class);
                                    $pidList = $queryGenerator->getTreeList($lookUpPid, $recursiveDepth, 0, 1);
                                    $pidArray = GeneralUtility::intExplode(',', $pidList);
                                } else {
                                    $pidArray = [(string) $lookUpPid];
                                }

                                // Only the "deleted" restriction is applied to the lookup.
                                $queryBuilder->getRestrictions()
                                    ->removeAll()
                                    ->add(GeneralUtility::makeInstance(DeletedRestriction::class));

                                $queryBuilder
                                    ->select($fieldName)
                                    ->from($table)
                                    ->where(
                                        $queryBuilder->expr()->in($pidField, $queryBuilder->createNamedParameter($pidArray, Connection::PARAM_INT_ARRAY)),
                                        $where
                                    );

                                if (! empty($addTable)) {
                                    // TODO: Check if this works as intended!
                                    $queryBuilder->add('from', $addTable);
                                }

                                // Not every table has a transOrigPointerField configured.
                                $transOrigPointerField = $GLOBALS['TCA'][$table]['ctrl']['transOrigPointerField'] ?? null;

                                // _ENABLELANG is optional: when set, only rows whose pointer field is <= 0 are selected.
                                if (! empty($subpartParams['_ENABLELANG']) && $transOrigPointerField) {
                                    $queryBuilder->andWhere(
                                        $queryBuilder->expr()->lte(
                                            $transOrigPointerField,
                                            0
                                        )
                                    );
                                }

                                $statement = $queryBuilder->execute();

                                // Index by field value so that each value is collected only once.
                                $rows = [];
                                while ($row = $statement->fetch()) {
                                    $rows[$row[$fieldName]] = $row;
                                }

                                $paramArray[$p] = array_merge($paramArray[$p], array_keys($rows));
                            }
                        }
                    } else {
                        // Just add value:
                        $paramArray[$p][] = $pV;
                    }

                    // Hook for processing own expandParameters place holder
                    // (the registry entry is optional, hence the null-coalescing read).
                    $expandParametersHooks = $GLOBALS['TYPO3_CONF_VARS']['SC_OPTIONS']['crawler/class.tx_crawler_lib.php']['expandParameters'] ?? null;
                    if (is_array($expandParametersHooks)) {
                        $_params = [
                            'pObj' => &$this,
                            'paramArray' => &$paramArray,
                            'currentKey' => $p,
                            'currentValue' => $pV,
                            'pid' => $pid,
                        ];
                        foreach ($expandParametersHooks as $_funcRef) {
                            GeneralUtility::callUserFunction($_funcRef, $_params, $this);
                        }
                    }
                }

                // Make unique set of values and sort the parameter array by key (GET var name):
                $paramArray[$p] = array_unique($paramArray[$p]);
                ksort($paramArray);
            } else {
                // Set the literal value as only value in array:
                $paramArray[$p] = [$v];
            }
        }

        return $paramArray;
    }
871
872
    /**
873
     * Compiling URLs from parameter array (output of expandParameters())
874
     * The number of URLs will be the multiplication of the number of parameter values for each key
875
     *
876
     * @param array $paramArray Output of expandParameters(): Array with keys (GET var names) and for each an array of values
877
     * @param array $urls URLs accumulated in this array (for recursion)
878
     * @return array
879
     */
880 7
    public function compileUrls($paramArray, array $urls)
    {
        // Recursion ends once every parameter has been consumed.
        if (empty($paramArray)) {
            return $urls;
        }

        // Shift the first parameter (GET var name and its value set) off the stack:
        reset($paramArray);
        // Cast to string: numeric GET var names are stored as integer array keys,
        // which rawurlencode() rejects under strict_types.
        $varName = (string) key($paramArray);
        $valueSet = array_shift($paramArray);

        // Combine every URL collected so far with every value of the current parameter:
        $newUrls = [];
        foreach ($urls as $url) {
            foreach ($valueSet as $val) {
                // An empty value adds nothing to the URL.
                $newUrls[] = $url . (strcmp((string) $val, '') ? '&' . rawurlencode($varName) . '=' . rawurlencode((string) $val) : '');

                if (count($newUrls) > $this->maximumUrlsToCompile) {
                    // Leave BOTH loops. Breaking out of the inner loop only would let every
                    // remaining URL of the outer loop add one more entry beyond the limit.
                    break 2;
                }
            }
        }

        return $this->compileUrls($paramArray, $newUrls);
    }
903
904
    /************************************
905
     *
906
     * Crawler log
907
     *
908
     ************************************/
909
910
    /**
911
     * Return array of records from crawler queue for input page ID
912
     *
913
     * @param integer $id Page ID for which to look up log entries.
914
     * @param string $filter Filter: "all" => all entries, "pending" => all that is not yet run, "finished" => all complete ones
915
     * @param boolean $doFlush If TRUE, then entries selected at DELETED(!) instead of selected!
916
     * @param boolean $doFullFlush
917
     * @param integer $itemsPerPage Limit the amount of entries per page default is 10
918
     * @return array
919
     */
920 4
    public function getLogEntriesForPageId($id, $filter = '', $doFlush = false, $doFullFlush = false, $itemsPerPage = 10)
    {
        // Base query: all queue entries of the page, newest scheduled first.
        $queryBuilder = GeneralUtility::makeInstance(ConnectionPool::class)->getQueryBuilderForTable($this->tableName);
        $queryBuilder
            ->select('*')
            ->from($this->tableName)
            ->where(
                $queryBuilder->expr()->eq('page_id', $queryBuilder->createNamedParameter($id, \PDO::PARAM_INT))
            )
            ->orderBy('scheduled', 'DESC');

        // Narrow down by execution state; any other filter value selects all entries.
        switch ($filter) {
            case 'pending':
                $queryBuilder->andWhere($queryBuilder->expr()->eq('exec_time', 0));
                break;
            case 'finished':
                $queryBuilder->andWhere($queryBuilder->expr()->gt('exec_time', 0));
                break;
        }

        // Flushing is delegated to the queue repository and happens BEFORE the select below is executed.
        if ($doFlush) {
            $this->queueRepository->flushQueue($doFullFlush ? 'all' : $filter);
        }

        // A non-positive $itemsPerPage disables the limit.
        if ($itemsPerPage > 0) {
            $queryBuilder->setMaxResults((int) $itemsPerPage);
        }

        return $queryBuilder->execute()->fetchAll();
    }
961
962
    /**
963
     * Return array of records from crawler queue for input set ID
964
     *
965
     * @param int $set_id Set ID for which to look up log entries.
966
     * @param string $filter Filter: "all" => all entries, "pending" => all that is not yet run, "finished" => all complete ones
967
     * @param bool $doFlush If TRUE, then entries selected at DELETED(!) instead of selected!
968
     * @param int $itemsPerPage Limit the amount of entries per page default is 10
969
     * @return array
970
     *
971
     * @deprecated
972
     */
973 6
    public function getLogEntriesForSetId(int $set_id, string $filter = '', bool $doFlush = false, bool $doFullFlush = false, int $itemsPerPage = 10)
    {
        $connectionPool = GeneralUtility::makeInstance(ConnectionPool::class);

        // Select query: all queue entries of the set, newest scheduled first.
        $queryBuilder = $connectionPool->getQueryBuilderForTable($this->tableName);
        $queryBuilder
            ->select('*')
            ->from($this->tableName)
            ->where(
                $queryBuilder->expr()->eq('set_id', $queryBuilder->createNamedParameter($set_id, \PDO::PARAM_INT))
            )
            ->orderBy('scheduled', 'DESC');

        // Composite AND expression that mirrors the filter; it is only consumed as the flush condition below.
        $expressionBuilder = $connectionPool
            ->getConnectionForTable($this->tableName)
            ->getExpressionBuilder();
        $flushCondition = $expressionBuilder->andX();

        if ($filter === 'pending') {
            $queryBuilder->andWhere($queryBuilder->expr()->eq('exec_time', 0));
            $flushCondition->add($expressionBuilder->eq('exec_time', 0));
        } elseif ($filter === 'finished') {
            $queryBuilder->andWhere($queryBuilder->expr()->gt('exec_time', 0));
            $flushCondition->add($expressionBuilder->gt('exec_time', 0));
        }

        if ($doFlush) {
            // Flush instead of select: a full flush passes an empty condition,
            // otherwise the filter condition restricted to this set is passed.
            $setCondition = $flushCondition->add($expressionBuilder->eq('set_id', $set_id));
            $this->flushQueue($doFullFlush ? '' : $setCondition);

            return [];
        }

        // A non-positive $itemsPerPage disables the limit.
        if ($itemsPerPage > 0) {
            $queryBuilder->setMaxResults($itemsPerPage);
        }

        return $queryBuilder->execute()->fetchAll();
    }
1014
1015
    /**
1016
     * Adding call back entries to log (typically called from hooks, see indexed search class "class.crawler.php")
1017
     *
1018
     * @param integer $setId Set ID
1019
     * @param array $params Parameters to pass to call back function
1020
     * @param string $callBack Call back object reference, eg. 'EXT:indexed_search/class.crawler.php:&tx_indexedsearch_crawler'
1021
     * @param integer $page_id Page ID to attach it to
1022
     * @param integer $schedule Time at which to activate
1023
     */
1024
    public function addQueueEntry_callBack($setId, $params, $callBack, $page_id = 0, $schedule = 0): void
    {
        // Anything that is not an array is replaced by an empty parameter set.
        $callBackParameters = is_array($params) ? $params : [];
        $callBackParameters['_CALLBACKOBJ'] = $callBack;

        $queueRecord = [
            'page_id' => (int) $page_id,
            'parameters' => json_encode($callBackParameters),
            // A schedule of 0 means "now".
            'scheduled' => (int) $schedule ?: $this->getCurrentTime(),
            'exec_time' => 0,
            'set_id' => (int) $setId,
            'result_data' => '',
        ];

        GeneralUtility::makeInstance(ConnectionPool::class)
            ->getConnectionForTable('tx_crawler_queue')
            ->insert('tx_crawler_queue', $queueRecord);
    }
1044
1045
    /************************************
1046
     *
1047
     * URL setting
1048
     *
1049
     ************************************/
1050
1051
    /**
1052
     * Setting a URL for crawling:
1053
     *
1054
     * @param integer $id Page ID
1055
     * @param string $url Complete URL
1056
     * @param array $subCfg Sub configuration array (from TS config)
1057
     * @param integer $tstamp Scheduled-time
1058
     * @param string $configurationHash (optional) configuration hash
1059
     * @param bool $skipInnerDuplicationCheck (optional) skip inner duplication check
1060
     * @return bool
1061
     */
1062 8
    public function addUrl(
        $id,
        $url,
        array $subCfg,
        $tstamp,
        $configurationHash = '',
        $skipInnerDuplicationCheck = false
    ) {
        $urlAdded = false;
        $rows = [];

        // Creating parameters:
        $parameters = [
            'url' => $url,
        ];

        // fe user group simulation ("userGroups" is optional in the sub configuration):
        $uGs = implode(',', array_unique(GeneralUtility::intExplode(',', $subCfg['userGroups'] ?? '', true)));
        if ($uGs) {
            $parameters['feUserGroupList'] = $uGs;
        }

        // Setting processing instructions ("procInstrFilter" and "procInstrParams." are optional as well):
        $parameters['procInstructions'] = GeneralUtility::trimExplode(',', $subCfg['procInstrFilter'] ?? '');
        if (is_array($subCfg['procInstrParams.'] ?? null)) {
            $parameters['procInstrParams'] = $subCfg['procInstrParams.'];
        }

        // Compile value array:
        $parameters_serialized = json_encode($parameters);
        $fieldArray = [
            'page_id' => (int) $id,
            'parameters' => $parameters_serialized,
            'parameters_hash' => GeneralUtility::shortMD5($parameters_serialized),
            'configuration_hash' => $configurationHash,
            'scheduled' => $tstamp,
            'exec_time' => 0,
            'set_id' => (int) $this->setID,
            'result_data' => '',
            // NULL when the sub configuration carries no key, as before, but without an undefined index.
            'configuration' => $subCfg['key'] ?? null,
        ];

        if ($this->registerQueueEntriesInternallyOnly) {
            //the entries will only be registered and not stored to the database
            $this->queueEntries[] = $fieldArray;
        } else {
            if (! $skipInnerDuplicationCheck) {
                // check if there is already an equal entry
                $rows = $this->getDuplicateRowsIfExist($tstamp, $fieldArray);
            }

            if (empty($rows)) {
                // No duplicate found (or check skipped): store the entry and announce it.
                $connectionForCrawlerQueue = GeneralUtility::makeInstance(ConnectionPool::class)->getConnectionForTable('tx_crawler_queue');
                $connectionForCrawlerQueue->insert(
                    'tx_crawler_queue',
                    $fieldArray
                );
                $uid = $connectionForCrawlerQueue->lastInsertId('tx_crawler_queue', 'qid');
                $rows[] = $uid;
                $urlAdded = true;

                $signalPayload = ['uid' => $uid, 'fieldArray' => $fieldArray];
                SignalSlotUtility::emitSignal(
                    self::class,
                    SignalSlotUtility::SIGNAL_URL_ADDED_TO_QUEUE,
                    $signalPayload
                );
            } else {
                // Duplicate(s) found: nothing is stored, only the duplicate signal is emitted.
                $signalPayload = ['rows' => $rows, 'fieldArray' => $fieldArray];
                SignalSlotUtility::emitSignal(
                    self::class,
                    SignalSlotUtility::SIGNAL_DUPLICATE_URL_IN_QUEUE,
                    $signalPayload
                );
            }
        }

        // TRUE only if a new row was inserted into the queue table.
        return $urlAdded;
    }
1141
1142
    /**
1143
     * Returns the current system time
1144
     *
1145
     * @return int
1146
     */
1147
    public function getCurrentTime()
    {
        // Unix timestamp of "now" as reported by PHP.
        $currentTimestamp = time();

        return $currentTimestamp;
    }
1151
1152
    /************************************
1153
     *
1154
     * URL reading
1155
     *
1156
     ************************************/
1157
1158
    /**
1159
     * Read URL for single queue entry
1160
     *
1161
     * @param integer $queueId
1162
     * @param boolean $force If set, will process even if exec_time has been set!
1163
     * @return integer
1164
     */
1165
    public function readUrl($queueId, $force = false)
    {
        $queryBuilder = GeneralUtility::makeInstance(ConnectionPool::class)->getQueryBuilderForTable($this->tableName);
        $ret = 0;
        $this->logger->debug('crawler-readurl start ' . microtime(true));

        // Get entry:
        $queryBuilder
            ->select('*')
            ->from('tx_crawler_queue')
            ->where(
                $queryBuilder->expr()->eq('qid', $queryBuilder->createNamedParameter($queueId, \PDO::PARAM_INT))
            );
        if (! $force) {
            // Without $force only entries that are scheduled for a process and not yet executed are read.
            $queryBuilder
                ->andWhere('exec_time = 0')
                ->andWhere('process_scheduled > 0');
        }
        $queueRec = $queryBuilder->execute()->fetch();

        if (! is_array($queueRec)) {
            // No matching queue entry. NOTE: this path returns NULL, not an integer status.
            return;
        }

        SignalSlotUtility::emitSignal(
            self::class,
            SignalSlotUtility::SIGNAL_QUEUEITEM_PREPROCESS,
            [$queueId, &$queueRec]
        );

        // Set exec_time to lock record:
        $field_array = ['exec_time' => $this->getCurrentTime()];

        if (isset($this->processID)) {
            // if multiprocessing is used we need to store the id of the process which has handled this entry
            $field_array['process_id_completed'] = $this->processID;
        }

        GeneralUtility::makeInstance(ConnectionPool::class)->getConnectionForTable('tx_crawler_queue')
            ->update(
                'tx_crawler_queue',
                $field_array,
                ['qid' => (int) $queueId]
            );

        $result = $this->queueExecutor->executeQueueItem($queueRec, $this);

        // The executor result is not guaranteed to be an array with a "content" key,
        // so read it defensively instead of indexing blindly.
        $content = is_array($result) ? ($result['content'] ?? null) : null;
        if ($content === null) {
            $resultData = 'An errors happened';
        } else {
            /** @var JsonCompatibilityConverter $jsonCompatibilityConverter */
            $jsonCompatibilityConverter = GeneralUtility::makeInstance(JsonCompatibilityConverter::class);
            $resultData = $jsonCompatibilityConverter->convert($content);
        }

        // atm there's no need to point to specific pollable extensions
        $pollSuccess = $GLOBALS['TYPO3_CONF_VARS']['EXTCONF']['crawler']['pollSuccess'] ?? null;
        // $resultData is a plain string in the error case above, so it is only inspected when it is an array.
        $procInstructions = is_array($resultData) ? ($resultData['parameters']['procInstructions'] ?? null) : null;
        if (is_array($pollSuccess) && is_array($procInstructions)) {
            foreach ($pollSuccess as $pollable) {
                // only check the success value if the instruction is running
                // it is important to name the pollSuccess key same as the procInstructions key
                if (in_array($pollable, $procInstructions, true) && ! empty($resultData['success'][$pollable])) {
                    $ret |= self::CLI_STATUS_POLLABLE_PROCESSED;
                }
            }
        }

        // Set result in log which also denotes the end of the processing of this entry.
        $field_array = ['result_data' => json_encode($result)];

        SignalSlotUtility::emitSignal(
            self::class,
            SignalSlotUtility::SIGNAL_QUEUEITEM_POSTPROCESS,
            [$queueId, &$field_array]
        );

        GeneralUtility::makeInstance(ConnectionPool::class)->getConnectionForTable('tx_crawler_queue')
            ->update(
                'tx_crawler_queue',
                $field_array,
                ['qid' => (int) $queueId]
            );

        $this->logger->debug('crawler-readurl stop ' . microtime(true));

        return $ret;
    }
1255
1256
    /**
1257
     * Read URL for not-yet-inserted log-entry
1258
     *
1259
     * @param array $field_array Queue field array,
1260
     *
1261
     * @return array|bool|mixed|string
1262
     */
1263
    public function readUrlFromArray($field_array)
    {
        $queueConnection = GeneralUtility::makeInstance(ConnectionPool::class)->getConnectionForTable($this->tableName);

        // Insert the entry with exec_time already set, which locks the record:
        $field_array['exec_time'] = $this->getCurrentTime();
        $queueConnection->insert($this->tableName, $field_array);
        $insertedQueueId = $queueConnection->lastInsertId($this->tableName, 'qid');

        $executionResult = $this->queueExecutor->executeQueueItem($field_array, $this);

        // Store the result in the log, which also denotes the end of the processing of this entry.
        // The array is handed to the signal by reference before it is written.
        $updateFields = ['result_data' => json_encode($executionResult)];
        SignalSlotUtility::emitSignal(
            self::class,
            SignalSlotUtility::SIGNAL_QUEUEITEM_POSTPROCESS,
            [$insertedQueueId, &$updateFields]
        );
        $queueConnection->update($this->tableName, $updateFields, ['qid' => $insertedQueueId]);

        return $executionResult;
    }
1292
1293
    /*****************************
1294
     *
1295
     * Compiling URLs to crawl - tools
1296
     *
1297
     *****************************/
1298
1299
    /**
1300
     * @param integer $id Root page id to start from.
1301
     * @param integer $depth Depth of tree, 0=only id-page, 1= on sublevel, 99 = infinite
1302
     * @param integer $scheduledTime Unix Time when the URL is timed to be visited when put in queue
1303
     * @param integer $reqMinute Number of requests per minute (creates the interleave between requests)
1304
     * @param boolean $submitCrawlUrls If set, submits the URLs to queue in database (real crawling)
1305
     * @param boolean $downloadCrawlUrls If set (and submitcrawlUrls is false) will fill $downloadUrls with entries)
1306
     * @param array $incomingProcInstructions Array of processing instructions
1307
     * @param array $configurationSelection Array of configuration keys
1308
     * @return string
1309
     */
1310
    public function getPageTreeAndUrls(
        $id,
        $depth,
        $scheduledTime,
        $reqMinute,
        $submitCrawlUrls,
        $downloadCrawlUrls,
        array $incomingProcInstructions,
        array $configurationSelection
    ) {
        // Remember the request settings on the instance.
        $this->scheduledTime = $scheduledTime;
        $this->reqMinute = $reqMinute;
        $this->submitCrawlUrls = $submitCrawlUrls;
        $this->downloadCrawlUrls = $downloadCrawlUrls;
        $this->incomingProcInstructions = $incomingProcInstructions;
        $this->incomingConfigurationSelection = $configurationSelection;

        // Reset the per-run collections.
        $this->duplicateTrack = [];
        $this->downloadUrls = [];

        // Drawing tree:
        /** @var PageTreeView $tree */
        $tree = GeneralUtility::makeInstance(PageTreeView::class);
        $perms_clause = $this->getBackendUser()->getPagePermsClause(Permission::PAGE_SHOW);
        // Same guard as in getConfigurationsForBranch(): never pass a dangling "AND " to the tree.
        $treePermsWhere = empty($perms_clause) ? '' : ('AND ' . $perms_clause);
        $tree->init($treePermsWhere);

        $pageInfo = BackendUtility::readPageAccess($id, $perms_clause);
        if (is_array($pageInfo)) {
            // Set root row:
            $tree->tree[] = [
                'row' => $pageInfo,
                'HTML' => $this->iconFactory->getIconForRecord('pages', $pageInfo, Icon::SIZE_SMALL),
            ];
        }

        // Get branch beneath:
        if ($depth) {
            $tree->getTree($id, $depth, '');
        }

        // Traverse page tree:
        $code = '';

        foreach ($tree->tree as $data) {
            $this->MP = false;

            // recognize mount points
            // NOTE(review): strict comparison - only matches if "doktype" has the same type as the constant; confirm.
            if ($data['row']['doktype'] === PageRepository::DOKTYPE_MOUNTPOINT) {
                // NOTE(review): the result is accessed as a list of rows ([0]); confirm the return shape of getPage().
                $mountpage = $this->pageRepository->getPage($data['row']['uid']);

                // fetch mounted pages; MP is set to "<mount_pid>-<uid of the mount point page>"
                $this->MP = $mountpage[0]['mount_pid'] . '-' . $data['row']['uid'];

                $mountTree = GeneralUtility::makeInstance(PageTreeView::class);
                $mountTree->init($treePermsWhere);
                $mountTree->getTree($mountpage[0]['mount_pid'], $depth);

                foreach ($mountTree->tree as $mountData) {
                    $code .= $this->drawURLs_addRowsForPage(
                        $mountData['row'],
                        $mountData['HTML'] . BackendUtility::getRecordTitle('pages', $mountData['row'], true)
                    );
                }

                // replace page when mount_pid_ol is enabled
                if ($mountpage[0]['mount_pid_ol']) {
                    $data['row']['uid'] = $mountpage[0]['mount_pid'];
                } else {
                    // if the mount_pid_ol is not set the MP must not be used for the mountpoint page
                    $this->MP = false;
                }
            }

            $code .= $this->drawURLs_addRowsForPage(
                $data['row'],
                $data['HTML'] . BackendUtility::getRecordTitle('pages', $data['row'], true)
            );
        }

        // Concatenated output of drawURLs_addRowsForPage() for all traversed pages.
        return $code;
    }
1391
1392
    /**
     * Expands an exclude string into the flat list of page uids it covers.
     *
     * The exclude string is a comma separated list of entries. Each entry is
     * either "<pid>" (that page only), "<pid>+" (page plus its subtree, depth 99)
     * or "<pid>+<depth>" (page plus its subtree down to <depth> levels).
     * Results are memoised per exclude string in function-local static caches.
     *
     * @param string $excludeString Exclude string
     * @return array list of unique page uids (integers)
     */
    public function expandExcludeString($excludeString)
    {
        // internal static caches;
        static $expandedExcludeStringCache = [];
        static $treeCache = [];

        $cacheKey = (string) $excludeString;

        // isset() instead of empty(): an empty result is a valid cache entry too
        if (! isset($expandedExcludeStringCache[$cacheKey])) {
            $pidList = [];

            if (! empty($excludeString)) {
                /** @var PageTreeView $tree */
                $tree = GeneralUtility::makeInstance(PageTreeView::class);
                $tree->init('AND ' . $this->getBackendUser()->getPagePermsClause(Permission::PAGE_SHOW));

                foreach (GeneralUtility::trimExplode(',', $excludeString) as $excludePart) {
                    // An entry without "+" has no second element, so do not destructure blindly
                    $parts = GeneralUtility::trimExplode('+', $excludePart);
                    $pid = (int) ($parts[0] ?? 0);
                    $depth = $parts[1] ?? '';

                    // default is "page only" = "depth=0"; a bare "+" means "whole subtree"
                    if (empty($depth)) {
                        $depth = strpos($excludePart, '+') !== false ? 99 : 0;
                    }
                    $depth = (int) $depth;

                    $pidList[] = $pid;

                    if ($depth > 0) {
                        if (empty($treeCache[$pid][$depth])) {
                            $tree->reset();
                            $tree->getTree($pid, $depth);
                            $treeCache[$pid][$depth] = $tree->tree;
                        }

                        foreach ($treeCache[$pid][$depth] as $data) {
                            $pidList[] = (int) $data['row']['uid'];
                        }
                    }
                }
            }

            $expandedExcludeStringCache[$cacheKey] = array_unique($pidList);
        }

        return $expandedExcludeStringCache[$cacheKey];
    }
1443
1444
    /**
     * Create the rows for display of the page tree
     * For each page a number of rows are shown displaying GET variable configuration
     *
     * One table row is rendered per crawler configuration that applies to the page.
     * If the page is excluded by a configuration, a "No entries" row is rendered for
     * that configuration instead; if no configuration applies at all, a single
     * "No entries" row carrying the skip message is rendered.
     *
     * @param array $pageRow Page record; 'uid' is read here, the whole row is passed on
     * @param string $pageTitle Ready-to-output HTML for the title column (not escaped here)
     * @return string HTML table rows
     */
    public function drawURLs_addRowsForPage(array $pageRow, string $pageTitle): string
    {
        $skipMessage = '';

        // Get list of configurations
        $configurations = $this->getUrlsForPageRow($pageRow, $skipMessage);

        if (! empty($this->incomingConfigurationSelection)) {
            // remove configuration that does not match the current selection
            foreach ($configurations as $confKey => $confArray) {
                // PHP turns numeric array keys into integers; compare as string so the
                // strict check does not drop a selected configuration
                if (! in_array((string) $confKey, $this->incomingConfigurationSelection, true)) {
                    unset($configurations[$confKey]);
                }
            }
        }

        // Traverse parameter combinations:
        $c = 0;
        $content = '';
        if (! empty($configurations)) {
            foreach ($configurations as $confKey => $confArray) {

                // Title column: only the first row carries it, spanning all configurations
                if (! $c) {
                    $titleClm = '<td rowspan="' . count($configurations) . '">' . $pageTitle . '</td>';
                } else {
                    $titleClm = '';
                }

                $subCfg = $confArray['subCfg'] ?? [];
                // expandExcludeString() returns integers, so the uid has to be an integer
                // as well for the strict comparison to match
                $excludedPageIds = $this->expandExcludeString($subCfg['exclude'] ?? '');

                if (! in_array((int) $pageRow['uid'], $excludedPageIds, true)) {

                    // URL list:
                    $urlList = $this->urlListFromUrlArray(
                        $confArray,
                        $pageRow,
                        $this->scheduledTime,
                        $this->reqMinute,
                        $this->submitCrawlUrls,
                        $this->downloadCrawlUrls,
                        $this->duplicateTrack,
                        $this->downloadUrls,
                        // if empty the urls won't be filtered by processing instructions
                        $this->incomingProcInstructions
                    );

                    // Expanded parameters:
                    $paramExpanded = '';
                    $calcAccu = [];
                    $calcRes = 1;
                    foreach ($confArray['paramExpanded'] ?? [] as $gVar => $gVal) {
                        $paramExpanded .= '
                            <tr>
                                <td>' . htmlspecialchars('&' . $gVar . '=') . '<br/>' .
                            '(' . count($gVal) . ')' .
                            '</td>
                                <td nowrap="nowrap">' . nl2br(htmlspecialchars(implode(chr(10), $gVal))) . '</td>
                            </tr>
                        ';
                        $calcRes *= count($gVal);
                        $calcAccu[] = count($gVal);
                    }
                    $paramExpanded = '<table>' . $paramExpanded . '</table>';
                    $paramExpanded .= 'Comb: ' . implode('*', $calcAccu) . '=' . $calcRes;

                    // Options
                    $optionValues = '';
                    if (! empty($subCfg['userGroups'])) {
                        $optionValues .= 'User Groups: ' . $subCfg['userGroups'] . '<br/>';
                    }
                    if (! empty($subCfg['procInstrFilter'])) {
                        $optionValues .= 'ProcInstr: ' . $subCfg['procInstrFilter'] . '<br/>';
                    }

                    // One parameter per line for readability
                    $parsedParameters = GeneralUtility::implodeArrayForUrl('', $confArray['paramParsed'] ?? []);

                    // Compile row:
                    $content .= '
                        <tr>
                            ' . $titleClm . '
                            <td>' . htmlspecialchars((string) $confKey) . '</td>
                            <td>' . nl2br(htmlspecialchars(rawurldecode(trim(str_replace('&', chr(10) . '&', $parsedParameters))))) . '</td>
                            <td>' . $paramExpanded . '</td>
                            <td nowrap="nowrap">' . $urlList . '</td>
                            <td nowrap="nowrap">' . $optionValues . '</td>
                            <td nowrap="nowrap">' . DebugUtility::viewArray($subCfg['procInstrParams.'] ?? null) . '</td>
                        </tr>';
                } else {
                    $content .= '<tr>
                            ' . $titleClm . '
                            <td>' . htmlspecialchars((string) $confKey) . '</td>
                            <td colspan="5"><em>No entries</em> (Page is excluded in this configuration)</td>
                        </tr>';
                }

                $c++;
            }
        } else {
            $message = ! empty($skipMessage) ? ' (' . $skipMessage . ')' : '';

            // Compile row:
            $content .= '
                <tr>
                    <td>' . $pageTitle . '</td>
                    <td colspan="6"><em>No entries</em>' . $message . '</td>
                </tr>';
        }

        return $content;
    }
1555
1556
    /*****************************
1557
     *
1558
     * CLI functions
1559
     *
1560
     *****************************/
1561
1562
    /**
     * Running the functionality of the CLI (crawling URLs from queue)
     *
     * Runs the CLI hooks, cleans up the queue, claims up to $countInARun queue
     * entries for the current process id and reads them one after the other.
     *
     * @param int $countInARun Maximum number of queue entries fetched for this run
     * @param int $sleepTime Value handed to usleep() after every processed entry
     * @param int $sleepAfterFinish Value handed to sleep() once the loop is done
     * @return int Bit mask built from the readUrl() results and the CLI_STATUS_* constants
     */
    public function CLI_run(int $countInARun, int $sleepTime, int $sleepAfterFinish): int
    {
        $status = 0;
        $processedCount = 0;

        // Hooks go first, then the queue gets tidied up
        $this->CLI_runHooks();
        $this->queueRepository->cleanupQueue();

        // Select entries:
        $queueRows = $this->queueRepository->fetchRecordsToBeCrawled($countInARun);

        if (empty($queueRows)) {
            $this->CLI_debug('Nothing within queue which needs to be processed (' . $this->CLI_buildProcessId() . ')');

            return $status;
        }

        $queueIds = [];
        foreach ($queueRows as $queueRow) {
            $queueIds[] = $queueRow['qid'];
        }

        $processId = $this->CLI_buildProcessId();

        // Remember how many entries were assigned, to determine later how many have been processed
        $assignedCount = $this->queueRepository->updateProcessIdAndSchedulerForQueueIds($queueIds, $processId);
        $this->processRepository->updateProcessAssignItemsCount($assignedCount, $processId);

        // Fewer rows assigned than requested: some entries were not assigned to this process
        if (count($queueIds) !== $assignedCount) {
            $this->CLI_debug('Nothing processed due to multi-process collision (' . $this->CLI_buildProcessId() . ')');

            return $status | self::CLI_STATUS_ABORTED;
        }

        foreach ($queueRows as $queueRow) {
            $status |= $this->readUrl($queueRow['qid']);
            ++$processedCount;

            // Just to relax the system
            usleep($sleepTime);

            // The CLI may have been disabled while this run was busy: leave right away
            // and do NOT mark the process as ended.
            if ($this->getDisabled()) {
                return $status | self::CLI_STATUS_ABORTED;
            }

            if ($this->processRepository->isProcessActive($this->CLI_buildProcessId())) {
                continue;
            }

            // possible timeout
            $this->CLI_debug('conflict / timeout (' . $this->CLI_buildProcessId() . ')');
            $status |= self::CLI_STATUS_ABORTED;
            break;
        }

        sleep($sleepAfterFinish);

        $this->CLI_debug('Rows: ' . $processedCount . ' (' . $this->CLI_buildProcessId() . ')');

        if ($processedCount > 0) {
            $status |= self::CLI_STATUS_PROCESSED;
        }

        return $status;
    }
1632
1633
    /**
     * Activate hooks
     *
     * Instantiates every class registered under
     * $GLOBALS['TYPO3_CONF_VARS']['EXTCONF']['crawler']['cli_hooks'] and calls
     * crawler_init() on it, passing this controller.
     */
    public function CLI_runHooks(): void
    {
        $hookClassNames = $GLOBALS['TYPO3_CONF_VARS']['EXTCONF']['crawler']['cli_hooks'] ?? [];

        foreach ($hookClassNames as $hookClassName) {
            $hook = GeneralUtility::makeInstance($hookClassName);
            if (! is_object($hook)) {
                continue;
            }

            $hook->crawler_init($this);
        }
    }
1645
1646
    /**
     * Try to acquire a new process with the given id
     * also performs some auto-cleanup for orphan processes
     *
     * Active processes whose ttl lies before the current time are treated as
     * orphans and released; all other active processes count against the
     * configured process limit.
     *
     * @param string $id identification string for the process
     * @return boolean true if a process record was inserted, false if the system
     *                 process id is unavailable or the process limit is reached
     * @todo preemption might not be the most elegant way to clean up
     */
    public function CLI_checkAndAcquireNewProcess($id)
    {
        $systemProcessId = getmypid();
        if (! $systemProcessId) {
            return false;
        }

        $processCount = 0;
        $orphanProcesses = [];

        $activeProcesses = $this->processRepository->findAllActive();
        $currentTime = $this->getCurrentTime();

        /** @var Process $process */
        foreach ($activeProcesses as $process) {
            if ($process->getTtl() < $currentTime) {
                $orphanProcesses[] = $process->getProcessId();
            } else {
                $processCount++;
            }
        }

        $processLimit = (int) $this->extensionSettings['processLimit'];

        // if there are less than allowed active processes then add a new one
        $acquired = $processCount < $processLimit;
        if ($acquired) {
            $this->CLI_debug('add process ' . $this->CLI_buildProcessId() . ' (' . ($processCount + 1) . '/' . $processLimit . ')');

            GeneralUtility::makeInstance(ConnectionPool::class)->getConnectionForTable('tx_crawler_process')->insert(
                'tx_crawler_process',
                [
                    'process_id' => $id,
                    'active' => 1,
                    'ttl' => $currentTime + (int) $this->extensionSettings['processMaxRunTime'],
                    'system_process_id' => $systemProcessId,
                ]
            );
        } else {
            $this->CLI_debug('Processlimit reached (' . $processCount . '/' . $processLimit . ')');
        }

        // Clean-up happens regardless of whether a new process could be acquired
        $this->processRepository->deleteProcessesMarkedAsDeleted();
        $this->CLI_releaseProcesses($orphanProcesses);

        return $acquired;
    }
1701
1702
    /**
     * Release a process and the required resources
     *
     * Besides releasing the requested ids, every call also tidies up after
     * processes that are already flagged inactive (see the inline comments).
     *
     * @param mixed $releaseIds string with a single process-id or array with multiple process-ids
     * @return boolean false if there was nothing to release, true otherwise
     */
    public function CLI_releaseProcesses($releaseIds)
    {
        if (! is_array($releaseIds)) {
            $releaseIds = [$releaseIds];
        }

        if (empty($releaseIds)) {
            //nothing to release
            return false;
        }

        // some kind of 2nd chance algo - this way you need at least 2 processes to have a real cleanup
        // this ensures that a single process can't mess up the entire process table

        // Step 1: detach queue entries from processes that are no longer active.
        // Each statement gets its own query builder so that no query parts or bound
        // parameters leak from one statement into the next.
        $queueQueryBuilder = $this->getQueryBuilder($this->tableName);
        $queueQueryBuilder
            ->update($this->tableName, 'q')
            ->where(
                'q.process_id IN(SELECT p.process_id FROM tx_crawler_process as p WHERE p.active = 0)'
            )
            ->set('q.process_scheduled', 0)
            ->set('q.process_id', '')
            ->execute();

        // Step 2: reset the system process id of inactive processes that are still
        // referenced by queue entries which have not been executed.
        $processQueryBuilder = $this->getQueryBuilder('tx_crawler_process');
        $processQueryBuilder
            ->update('tx_crawler_process')
            ->where(
                $processQueryBuilder->expr()->eq('active', 0),
                'process_id IN(SELECT q.process_id FROM tx_crawler_queue as q WHERE q.exec_time = 0)'
            )
            ->set('system_process_id', 0)
            ->execute();

        // Step 3: flag the requested processes as inactive and free their queue entries
        $this->processRepository->markRequestedProcessesAsNotActive($releaseIds);
        $this->queueRepository->unsetProcessScheduledAndProcessIdForQueueEntries($releaseIds);

        return true;
    }
1752
1753
    /**
     * Create a unique Id for the current process
     *
     * The id is generated once from the current microtime and then reused for
     * the lifetime of this controller instance.
     *
     * @return string the ID
     */
    public function CLI_buildProcessId()
    {
        if (! $this->processID) {
            // Explicit cast: the hash helper works on strings, microtime(true) yields a float
            $this->processID = GeneralUtility::shortMD5((string) microtime(true));
        }

        return $this->processID;
    }
1765
1766
    /**
     * Prints a message to the stdout (only if debug-mode is enabled)
     *
     * Debug mode is controlled by the "processDebug" extension setting.
     *
     * @param string $msg the message
     * @deprecated
     */
    public function CLI_debug($msg): void
    {
        $debugEnabled = (int) $this->extensionSettings['processDebug'] !== 0;
        if (! $debugEnabled) {
            return;
        }

        echo $msg . "\n";
        flush();
    }
1779
1780
    /**
     * Cleans up entries that stayed for too long in the queue.
     *
     * Removes executed entries older than the "cleanUpProcessedAge" setting and
     * entries scheduled longer ago than the "cleanUpScheduledAge" setting
     * (both settings are given in days).
     *
     * @deprecated
     */
    public function cleanUpOldQueueEntries(): void
    {
        // 24*60*60 Seconds in 24 hours
        $secondsPerDay = 86400;

        $processedAge = $this->extensionSettings['cleanUpProcessedAge'] * $secondsPerDay;
        $scheduledAge = $this->extensionSettings['cleanUpScheduledAge'] * $secondsPerDay;

        $now = time();
        $processedBefore = $now - $processedAge;
        $scheduledBefore = $now - $scheduledAge;

        $this->flushQueue(
            '(exec_time<>0 AND exec_time<' . $processedBefore . ') OR scheduled<=' . $scheduledBefore
        );
    }
1797
1798
    /**
     * Removes queue entries
     *
     * Before deleting, one SIGNAL_QUEUE_ENTRY_FLUSH signal is emitted per distinct
     * set_id among the matching entries, carrying that set's qid/set_id rows.
     *
     * @param string $where SQL related filter for the entries which should be removed;
     *                      it is used verbatim, so it must never contain untrusted input.
     *                      An empty value matches all entries.
     *
     * @deprecated
     */
    protected function flushQueue($where = ''): void
    {
        $realWhere = (string) $where !== '' ? $where : '1=1';

        // Every statement uses a fresh query builder: reusing one instance would
        // accumulate FROM parts and parameters across the three statements.
        $groups = $this->getQueryBuilder($this->tableName)
            ->selectLiteral('DISTINCT set_id')
            ->from($this->tableName)
            ->where($realWhere)
            ->execute()
            ->fetchAll();

        foreach ($groups as $group) {
            $subSetQueryBuilder = $this->getQueryBuilder($this->tableName);
            $subSet = $subSetQueryBuilder
                ->select('qid', 'set_id')
                ->from($this->tableName)
                ->where(
                    $realWhere,
                    $subSetQueryBuilder->expr()->eq(
                        'set_id',
                        // bound as parameter instead of being concatenated into the SQL
                        $subSetQueryBuilder->createNamedParameter($group['set_id'], \PDO::PARAM_INT)
                    )
                )
                ->execute()
                ->fetchAll();

            $payLoad = ['subSet' => $subSet];
            SignalSlotUtility::emitSignal(
                self::class,
                SignalSlotUtility::SIGNAL_QUEUE_ENTRY_FLUSH,
                $payLoad
            );
        }

        $this->getQueryBuilder($this->tableName)
            ->delete($this->tableName)
            ->where($realWhere)
            ->execute();
    }
1843
1844
    /**
     * This method determines duplicates for a queue entry with the same parameters and this timestamp.
     * If the timestamp is in the past, it will check if there is any unprocessed queue entry in the past.
     * If the timestamp is in the future it will check, if the queued entry has exactly the same timestamp
     *
     * Only entries that have not been executed (exec_time = 0) and are not assigned
     * to a process (empty process_id) are considered.
     *
     * @param int $tstamp
     * @param array $fieldArray must contain the keys 'page_id' and 'parameters_hash'
     *
     * @return array list of qid values of the duplicate queue entries
     */
    protected function getDuplicateRowsIfExist($tstamp, $fieldArray)
    {
        $currentTime = $this->getCurrentTime();

        $queryBuilder = GeneralUtility::makeInstance(ConnectionPool::class)->getQueryBuilderForTable($this->tableName);
        $queryBuilder
            ->select('qid')
            ->from('tx_crawler_queue');

        if ($tstamp > $currentTime) {
            // entry with a timestamp in the future need to have the same schedule time
            $queryBuilder->where(
                $queryBuilder->expr()->eq(
                    'scheduled',
                    $queryBuilder->createNamedParameter((int) $tstamp, \PDO::PARAM_INT)
                )
            );
        } elseif ($this->extensionSettings['enableTimeslot']) {
            // entry is scheduled with "now": accept a window of +/- 100 seconds as well
            $timeBegin = $currentTime - 100;
            $timeEnd = $currentTime + 100;
            $queryBuilder
                ->where(
                    'scheduled BETWEEN ' . $timeBegin . ' AND ' . $timeEnd . ''
                )
                ->orWhere(
                    $queryBuilder->expr()->lte('scheduled', $currentTime)
                );
        } else {
            // entry is scheduled with "now": everything that is already due counts
            $queryBuilder->where(
                $queryBuilder->expr()->lte('scheduled', $currentTime)
            );
        }

        // Explicit comparisons instead of "NOT exec_time" / "NOT process_id": negating the
        // varchar process_id relies on string-to-number coercion, which treats ids that
        // start with a letter or "0" as "not assigned".
        // NOTE(review): assumes unassigned entries store process_id = '' — confirm against the schema.
        $queryBuilder
            ->andWhere($queryBuilder->expr()->eq('exec_time', 0))
            ->andWhere($queryBuilder->expr()->eq('process_id', $queryBuilder->createNamedParameter('', \PDO::PARAM_STR)))
            ->andWhere($queryBuilder->expr()->eq('page_id', $queryBuilder->createNamedParameter($fieldArray['page_id'], \PDO::PARAM_INT)))
            ->andWhere($queryBuilder->expr()->eq('parameters_hash', $queryBuilder->createNamedParameter($fieldArray['parameters_hash'], \PDO::PARAM_STR)));

        $statement = $queryBuilder->execute();

        $rows = [];
        while ($row = $statement->fetch()) {
            $rows[] = $row['qid'];
        }

        return $rows;
    }
1904
1905
    /**
     * Returns a md5 hash generated from a serialized configuration array.
     *
     * The keys "paramExpanded" and "URLs" are left out of the hash.
     *
     * @return string
     */
    protected function getConfigurationHash(array $configuration)
    {
        $ignoredKeys = ['paramExpanded' => true, 'URLs' => true];
        $hashRelevantConfiguration = array_diff_key($configuration, $ignoredKeys);

        return md5(serialize($hashRelevantConfiguration));
    }
1916
1917
    /**
     * Build a URL from a Page and the Query String. If the page has a Site configuration, it can be built by using
     * the Site instance.
     *
     * Thin wrapper that delegates to a new UrlService instance.
     *
     * @param int $httpsOrHttp see tx_crawler_configuration.force_ssl
     * @throws \TYPO3\CMS\Core\Exception\SiteNotFoundException
     * @throws \TYPO3\CMS\Core\Routing\InvalidRouteArgumentsException
     *
     * @deprecated Using CrawlerController::getUrlFromPageAndQueryParameters() is deprecated since 9.1.1 and will be removed in v11.x, please use UrlService->getUrlFromPageAndQueryParameters() instead.
     */
    protected function getUrlFromPageAndQueryParameters(int $pageId, string $queryString, ?string $alternativeBaseUrl, int $httpsOrHttp): UriInterface
    {
        return (new UrlService())->getUrlFromPageAndQueryParameters(
            $pageId,
            $queryString,
            $alternativeBaseUrl,
            $httpsOrHttp
        );
    }
1932
1933 1
    /**
     * Makes sure the values at index 1 and 2 are in ascending order.
     *
     * @param array $reg Array whose elements 1 and 2 are compared
     * @return array The same array, with elements 1 and 2 exchanged if element 1 was larger
     */
    protected function swapIfFirstIsLargerThanSecond(array $reg): array
    {
        // Already in order: nothing to do
        if (! ($reg[1] > $reg[2])) {
            return $reg;
        }

        [$reg[1], $reg[2]] = [$reg[2], $reg[1]];

        return $reg;
    }
1944
1945
    /**
     * Returns the backend user, taking it from $GLOBALS['BE_USER'] on first use.
     *
     * @return BackendUserAuthentication
     */
    private function getBackendUser()
    {
        // Make sure the _cli_ user is loaded
        Bootstrap::initializeBackendAuthentication();

        $this->backendUser = $this->backendUser ?? $GLOBALS['BE_USER'];

        return $this->backendUser;
    }
1957
1958
    /**
     * Get querybuilder for given table
     *
     * @param string $table Name of the table the query builder is requested for
     * @return \TYPO3\CMS\Core\Database\Query\QueryBuilder
     */
    private function getQueryBuilder(string $table)
    {
        /** @var ConnectionPool $connectionPool */
        $connectionPool = GeneralUtility::makeInstance(ConnectionPool::class);

        return $connectionPool->getQueryBuilderForTable($table);
    }
1967
}
1968