Passed
Push — refactor/crawlerController ( aa4668 )
by Tomas Norre
07:48
created

CrawlerController::compileUrls()   A

Complexity

Conditions 6
Paths 7

Size

Total Lines 22
Code Lines 12

Duplication

Lines 0
Ratio 0 %

Code Coverage

Tests 12
CRAP Score 6.0163

Importance

Changes 1
Bugs 0 Features 0
Metric Value
cc 6
eloc 12
c 1
b 0
f 0
nc 7
nop 2
dl 0
loc 22
ccs 12
cts 13
cp 0.9231
crap 6.0163
rs 9.2222
1
<?php
2
3
declare(strict_types=1);
4
5
namespace AOE\Crawler\Controller;
6
7
/***************************************************************
8
 *  Copyright notice
9
 *
10
 *  (c) 2020 AOE GmbH <[email protected]>
11
 *
12
 *  All rights reserved
13
 *
14
 *  This script is part of the TYPO3 project. The TYPO3 project is
15
 *  free software; you can redistribute it and/or modify
16
 *  it under the terms of the GNU General Public License as published by
17
 *  the Free Software Foundation; either version 3 of the License, or
18
 *  (at your option) any later version.
19
 *
20
 *  The GNU General Public License can be found at
21
 *  http://www.gnu.org/copyleft/gpl.html.
22
 *
23
 *  This script is distributed in the hope that it will be useful,
24
 *  but WITHOUT ANY WARRANTY; without even the implied warranty of
25
 *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
26
 *  GNU General Public License for more details.
27
 *
28
 *  This copyright notice MUST APPEAR in all copies of the script!
29
 ***************************************************************/
30
31
use AOE\Crawler\Configuration\ExtensionConfigurationProvider;
32
use AOE\Crawler\Converter\JsonCompatibilityConverter;
33
use AOE\Crawler\CrawlStrategy\CrawlStrategyFactory;
34
use AOE\Crawler\Domain\Repository\ConfigurationRepository;
35
use AOE\Crawler\Domain\Repository\ProcessRepository;
36
use AOE\Crawler\Domain\Repository\QueueRepository;
37
use AOE\Crawler\QueueExecutor;
38
use AOE\Crawler\Service\ProcessService;
39
use AOE\Crawler\Service\UrlService;
40
use AOE\Crawler\Utility\SignalSlotUtility;
41
use Psr\Http\Message\UriInterface;
42
use Psr\Log\LoggerAwareInterface;
43
use Psr\Log\LoggerAwareTrait;
44
use TYPO3\CMS\Backend\Tree\View\PageTreeView;
45
use TYPO3\CMS\Backend\Utility\BackendUtility;
46
use TYPO3\CMS\Core\Authentication\BackendUserAuthentication;
47
use TYPO3\CMS\Core\Compatibility\PublicMethodDeprecationTrait;
48
use TYPO3\CMS\Core\Core\Bootstrap;
49
use TYPO3\CMS\Core\Core\Environment;
50
use TYPO3\CMS\Core\Database\Connection;
51
use TYPO3\CMS\Core\Database\ConnectionPool;
52
use TYPO3\CMS\Core\Database\Query\Restriction\DeletedRestriction;
53
use TYPO3\CMS\Core\Imaging\Icon;
54
use TYPO3\CMS\Core\Imaging\IconFactory;
55
use TYPO3\CMS\Core\Site\Entity\Site;
56
use TYPO3\CMS\Core\Type\Bitmask\Permission;
57
use TYPO3\CMS\Core\TypoScript\Parser\TypoScriptParser;
58
use TYPO3\CMS\Core\Utility\DebugUtility;
59
use TYPO3\CMS\Core\Utility\GeneralUtility;
60
use TYPO3\CMS\Core\Utility\MathUtility;
61
use TYPO3\CMS\Extbase\Object\ObjectManager;
62
use TYPO3\CMS\Frontend\Page\PageRepository;
63
64
/**
65
 * Class CrawlerController
66
 *
67
 * @package AOE\Crawler\Controller
68
 */
69
class CrawlerController implements LoggerAwareInterface
70
{
71
    use LoggerAwareTrait;
72
    use PublicMethodDeprecationTrait;
73
74
    public const CLI_STATUS_NOTHING_PROCCESSED = 0;
75
76
    //queue not empty
77
    public const CLI_STATUS_REMAIN = 1;
78
79
    //(some) queue items were processed
80
    public const CLI_STATUS_PROCESSED = 2;
81
82
    //instance didn't finish
83
    public const CLI_STATUS_ABORTED = 4;
84
85
    public const CLI_STATUS_POLLABLE_PROCESSED = 8;
86
87
    /**
88
     * @var integer
89
     */
90
    public $setID = 0;
91
92
    /**
93
     * @var string
94
     */
95
    public $processID = '';
96
97
    /**
98
     * @var array
99
     */
100
    public $duplicateTrack = [];
101
102
    /**
103
     * @var array
104
     */
105
    public $downloadUrls = [];
106
107
    /**
108
     * @var array
109
     */
110
    public $incomingProcInstructions = [];
111
112
    /**
113
     * @var array
114
     */
115
    public $incomingConfigurationSelection = [];
116
117
    /**
118
     * @var bool
119
     */
120
    public $registerQueueEntriesInternallyOnly = false;
121
122
    /**
123
     * @var array
124
     */
125
    public $queueEntries = [];
126
127
    /**
128
     * @var array
129
     */
130
    public $urlList = [];
131
132
    /**
133
     * @var array
134
     */
135
    public $extensionSettings = [];
136
137
    /**
138
     * Mount Point
139
     *
140
     * @var bool
141
     * Todo: Check what this is used for and adjust the type hint or code, as bool doesn't match the current code.
142
     */
143
    public $MP = false;
144
145
    /**
146
     * @var string
147
     */
148
    protected $processFilename;
149
150
    /**
151
     * Holds the internal access mode; can be 'gui', 'cli' or 'cli_im'
152
     *
153
     * @var string
154
     */
155
    protected $accessMode;
156
157
    /**
158
     * @var QueueRepository
159
     */
160
    protected $queueRepository;
161
162
    /**
163
     * @var ProcessRepository
164
     */
165
    protected $processRepository;
166
167
    /**
168
     * @var ConfigurationRepository
169
     */
170
    protected $configurationRepository;
171
172
    /**
173
     * @var string
174
     */
175
    protected $tableName = 'tx_crawler_queue';
176
177
    /**
178
     * @var QueueExecutor
179
     */
180
    protected $queueExecutor;
181
182
    /**
183
     * @var int
184
     */
185
    protected $maximumUrlsToCompile = 10000;
186
187
    /**
188
     * @var IconFactory
189
     */
190
    protected $iconFactory;
191
192
    /**
193
     * @var string[]
194
     */
195
    private $deprecatedPublicMethods = [
0 ignored issues
show
introduced by
The private property $deprecatedPublicMethods is not used, and could be removed.
Loading history...
196
        'CLI_buildProcessId' => 'Using CrawlerController->CLI_buildProcessId() is deprecated since 9.1.3 and will be removed in v11.x, please use ProcessService::createProcessId() instead',
197
        'getLogEntriesForSetId' => 'Using crawlerController::getLogEntriesForSetId() is deprecated since 9.0.1 and will be removed in v11.x',
198
        'flushQueue' => 'Using CrawlerController::flushQueue() is deprecated since 9.0.1 and will be removed in v11.x, please use QueueRepository->flushQueue() instead.',
199
        'cleanUpOldQueueEntries' => 'Using CrawlerController::cleanUpOldQueueEntries() is deprecated since 9.0.1 and will be removed in v11.x, please use QueueRepository->cleanUpOldQueueEntries() instead.',
200
    ];
201
202
    /**
203
     * @var BackendUserAuthentication|null
204
     */
205
    private $backendUser;
206
207
    /**
208
     * @var integer
209
     */
210
    private $scheduledTime = 0;
211
212
    /**
213
     * @var integer
214
     */
215
    private $reqMinute = 0;
216
217
    /**
218
     * @var bool
219
     */
220
    private $submitCrawlUrls = false;
221
222
    /**
223
     * @var bool
224
     */
225
    private $downloadCrawlUrls = false;
226
227
    /************************************
228
     *
229
     * Getting URLs based on Page TSconfig
230
     *
231
     ************************************/
232
233 36
    /**
     * Instantiates the repositories, the queue executor and the icon factory,
     * sets the process lock file path and loads the extension settings.
     *
     * Defaults applied to the settings:
     * - "countInARun" becomes 100 when it is missing or not a positive integer
     * - "processLimit" is forced into the range 1..99 (fallback 1)
     * - "maxCompileUrls" is forced into the range 1..1000000000 (fallback 10000)
     */
    public function __construct()
    {
        $objectManager = GeneralUtility::makeInstance(ObjectManager::class);
        $crawlStrategyFactory = GeneralUtility::makeInstance(CrawlStrategyFactory::class);
        $this->queueRepository = $objectManager->get(QueueRepository::class);
        $this->processRepository = $objectManager->get(ProcessRepository::class);
        $this->configurationRepository = $objectManager->get(ConfigurationRepository::class);
        $this->queueExecutor = GeneralUtility::makeInstance(QueueExecutor::class, $crawlStrategyFactory);
        $this->iconFactory = GeneralUtility::makeInstance(IconFactory::class);

        $this->processFilename = Environment::getVarPath() . '/lock/tx_crawler.proc';

        /** @var ExtensionConfigurationProvider $configurationProvider */
        $configurationProvider = GeneralUtility::makeInstance(ExtensionConfigurationProvider::class);
        $settings = $configurationProvider->getExtensionConfiguration();
        $this->extensionSettings = is_array($settings) ? $settings : [];

        // Set defaults. Missing keys are read as 0 so the fallbacks below apply
        // without raising "undefined index" notices on an empty configuration.
        if (MathUtility::convertToPositiveInteger($this->extensionSettings['countInARun'] ?? 0) === 0) {
            $this->extensionSettings['countInARun'] = 100;
        }

        $this->extensionSettings['processLimit'] = MathUtility::forceIntegerInRange(
            $this->extensionSettings['processLimit'] ?? 0,
            1,
            99,
            1
        );
        $this->maximumUrlsToCompile = MathUtility::forceIntegerInRange(
            $this->extensionSettings['maxCompileUrls'] ?? 0,
            1,
            1000000000,
            10000
        );
    }
258
259
    /**
     * Returns the access mode that was set via setAccessMode().
     *
     * @return string The stored access mode
     */
    public function getAccessMode()
    {
        return $this->accessMode;
    }
268
269
    /**
     * Stores the access mode for later retrieval via getAccessMode().
     *
     * @param string $accessMode Access mode to store
     */
    public function setAccessMode($accessMode): void
    {
        $this->accessMode = $accessMode;
    }
276
277
    /**
     * Toggles the disabled status. The status is represented by the existence
     * of the process file: it is written (empty) to disable and removed to enable.
     *
     * @param bool $disabled (optional, defaults to true)
     */
    public function setDisabled($disabled = true): void
    {
        if ($disabled) {
            GeneralUtility::writeFile($this->processFilename, '');
            return;
        }

        // Enabling: only remove the marker file when it is actually there.
        if (is_file($this->processFilename)) {
            unlink($this->processFilename);
        }
    }
292
293
    /**
     * Reports the disabled status, i.e. whether the process file exists.
     *
     * @return bool true if disabled
     */
    public function getDisabled()
    {
        $markerFileExists = is_file($this->processFilename);

        return $markerFileExists;
    }
302
303
    /**
     * Overrides the path of the file whose existence marks the crawler as disabled.
     *
     * @param string $filenameWithPath File name including its path
     */
    public function setProcessFilename($filenameWithPath): void
    {
        $this->processFilename = $filenameWithPath;
    }
310
311
    /**
     * Returns the path of the file whose existence marks the crawler as disabled.
     *
     * @return string File name including its path
     */
    public function getProcessFilename()
    {
        return $this->processFilename;
    }
318
319
    /**
     * Replaces the extension settings (the unserialized counterpart of
     * $TYPO3_CONF_VARS['EXT']['extConf']['crawler']).
     *
     * The array is stored as given; the defaults applied in the constructor
     * are not re-applied here.
     */
    public function setExtensionSettings(array $extensionSettings): void
    {
        $this->extensionSettings = $extensionSettings;
    }
326
327
    /**
     * Check if the given page should be crawled.
     *
     * The checks run in this order and the first one that matches wins:
     * 1. the page is hidden and the "crawlHiddenPages" setting is not enabled
     * 2. the doktype is one of 3, 4, 199, 254, 255
     * 3. the doktype is listed in $GLOBALS['TYPO3_CONF_VARS']['EXTCONF']['crawler']['excludeDoktype']
     * 4. a "pageVeto" hook returns anything other than false
     *
     * @param array $pageRow Page record; the "hidden" and "doktype" columns are evaluated
     * @return false|string false if the page should be crawled (not excluded), otherwise the message explaining why it is skipped
     */
    public function checkIfPageShouldBeSkipped(array $pageRow)
    {
        // Hidden pages are skipped unless crawling them is enabled. Missing keys count as "off".
        if (! ($this->extensionSettings['crawlHiddenPages'] ?? false) && ($pageRow['hidden'] ?? false)) {
            return 'Because page is hidden';
        }

        if (GeneralUtility::inList('3,4,199,254,255', $pageRow['doktype'])) {
            return 'Because doktype is not allowed';
        }

        foreach ($GLOBALS['TYPO3_CONF_VARS']['EXTCONF']['crawler']['excludeDoktype'] ?? [] as $key => $doktypeList) {
            if (GeneralUtility::inList($doktypeList, $pageRow['doktype'])) {
                return 'Doktype was excluded by "' . $key . '"';
            }
        }

        // veto hook
        foreach ($GLOBALS['TYPO3_CONF_VARS']['EXTCONF']['crawler']['pageVeto'] ?? [] as $key => $func) {
            $params = [
                'pageRow' => $pageRow,
            ];
            // expects "false" if page is ok and "true" or a skipMessage if this page should _not_ be crawled
            $veto = GeneralUtility::callUserFunction($func, $params, $this);
            if ($veto === false) {
                continue;
            }

            // No need to execute other hooks once one of them vetoed.
            // Hooks registered with a numeric key yield an int $key; under strict_types
            // htmlspecialchars() would reject it, hence the explicit string cast.
            return is_string($veto)
                ? $veto
                : 'Veto from hook "' . htmlspecialchars((string) $key) . '"';
        }

        return false;
    }
386
387
    /**
     * Wrapper method for getUrlsForPageId()
     * It returns an array of configurations and no urls!
     *
     * The page is first run through checkIfPageShouldBeSkipped(); when it is
     * skipped, an empty array is returned and the reason is written to $skipMessage.
     *
     * @param array $pageRow Page record with at least dok-type and uid columns.
     * @param string $skipMessage Set by reference: the skip reason, or '' when the page is not skipped
     * @return array
     * @see getUrlsForPageId()
     */
    public function getUrlsForPageRow(array $pageRow, &$skipMessage = '')
    {
        $message = $this->checkIfPageShouldBeSkipped($pageRow);
        if ($message !== false) {
            $skipMessage = $message;
            return [];
        }

        // getUrlsForPageId() declares an int parameter and this file uses strict_types,
        // so a numeric-string uid must be cast explicitly (as urlListFromUrlArray() does).
        $res = $this->getUrlsForPageId((int) $pageRow['uid']);
        $skipMessage = '';

        return $res;
    }
409
410
    /**
     * Creates a list of URLs from input array (and submits them to queue if asked for)
     * See Web > Info module script + "indexed_search"'s crawler hook-client using this!
     *
     * @param array $vv Information about URLs from pageRow to crawl.
     * @param array $pageRow Page row
     * @param int $scheduledTime Unix time to schedule indexing to, typically time()
     * @param int $reqMinute Number of requests per minute (creates the interleave between requests); a value of 0 or less disables the interleave
     * @param bool $submitCrawlUrls If set, submits the URLs to queue
     * @param bool $downloadCrawlUrls If set (and $submitCrawlUrls is false) will fill $downloadUrls with entries
     * @param array $duplicateTrack Array which is passed by reference and contains an id per url to ensure we will not crawl duplicates
     * @param array $downloadUrls Array which will be filled with URLs for download if flag is set.
     * @param array $incomingProcInstructions Array of processing instructions
     * @return string List of URLs (meant for display in backend module)
     */
    public function urlListFromUrlArray(
        array $vv,
        array $pageRow,
        $scheduledTime,
        $reqMinute,
        $submitCrawlUrls,
        $downloadCrawlUrls,
        array &$duplicateTrack,
        array &$downloadUrls,
        array $incomingProcInstructions
    ) {
        if (! is_array($vv['URLs'] ?? null)) {
            return 'ERROR - no URL generated';
        }
        $urlLog = [];
        $pageId = (int) $pageRow['uid'];
        $configurationHash = $this->getConfigurationHash($vv);
        $skipInnerCheck = $this->queueRepository->noUnprocessedQueueEntriesForPageWithConfigurationHashExist($pageId, $configurationHash);

        $urlService = new UrlService();

        // Optional configuration values; a missing key behaves like an empty string.
        $procInstrFilter = $vv['subCfg']['procInstrFilter'] ?? '';
        $userGroups = $vv['subCfg']['userGroups'] ?? '';

        // The filter depends only on the configuration, not on the individual URL,
        // so it is evaluated once: when it rejects, no URL would be listed at all.
        if (! $this->drawURLs_PIfilter($procInstrFilter, $incomingProcInstructions)) {
            return '';
        }

        // Seconds between two requests. Guarding the rate avoids a division by zero.
        $secondsPerRequest = $reqMinute > 0 ? 60 / $reqMinute : 0;

        foreach ($vv['URLs'] as $urlQuery) {
            $url = (string) $urlService->getUrlFromPageAndQueryParameters(
                $pageId,
                $urlQuery,
                $vv['subCfg']['baseUrl'] ?? null,
                $vv['subCfg']['force_ssl'] ?? 0
            );

            // Create key by which to determine unique-ness:
            $uKey = $url . '|' . $userGroups . '|' . $procInstrFilter;

            if (isset($duplicateTrack[$uKey])) {
                // if the url key is registered just display it and do not resubmit it
                $urlLog[] = '<em><span class="text-muted">' . htmlspecialchars($url) . '</span></em>';
            } else {
                // Scheduled time, rounded down to the full minute:
                $schTime = $scheduledTime + round(count($duplicateTrack) * $secondsPerRequest);
                $schTime = intval($schTime / 60) * 60;
                $formattedDate = BackendUtility::datetime($schTime);
                $this->urlList[] = '[' . $formattedDate . '] ' . $url;
                $urlList = '[' . $formattedDate . '] ' . htmlspecialchars($url);

                // Submit for crawling!
                // NOTE(review): the displayed time is $schTime, but the un-interleaved
                // $scheduledTime is what gets passed to addUrl() - confirm this is intended.
                if ($submitCrawlUrls) {
                    $added = $this->addUrl(
                        $pageId,
                        $url,
                        $vv['subCfg'],
                        $scheduledTime,
                        $configurationHash,
                        $skipInnerCheck
                    );
                    if ($added === false) {
                        $urlList .= ' (URL already existed)';
                    }
                } elseif ($downloadCrawlUrls) {
                    $downloadUrls[$url] = $url;
                }
                $urlLog[] = $urlList;
            }
            $duplicateTrack[$uKey] = true;
        }

        return implode('<br>', $urlLog);
    }
494
495
    /**
     * Tells whether at least one of the incoming processing instructions is
     * contained in the given comma-separated list. An empty set of incoming
     * instructions always passes.
     *
     * @param string $piString Comma-separated list of processing instructions to test against
     * @param array $incomingProcInstructions Processing instructions
     * @return boolean
     */
    public function drawURLs_PIfilter($piString, array $incomingProcInstructions)
    {
        // Nothing requested means nothing to filter by.
        if ($incomingProcInstructions === []) {
            return true;
        }

        $matched = false;
        foreach ($incomingProcInstructions as $instruction) {
            if (GeneralUtility::inList($piString, $instruction)) {
                $matched = true;
                break;
            }
        }

        return $matched;
    }
515
516 5
    /**
     * Returns the page TSconfig for a page and lets the "getPageTSconfigForId"
     * hooks alter it.
     *
     * When $this->MP is set, the TSconfig of the mount point page is loaded
     * instead of the TSconfig of $id.
     *
     * @param int $id Page uid
     */
    public function getPageTSconfigForId($id): array
    {
        if (! $this->MP) {
            $pageTSconfig = BackendUtility::getPagesTSconfig($id);
        } else {
            // $this->MP is documented as bool but is used as a "<something>-<mountPointId>"
            // string here. Casting keeps explode() valid under strict_types, and a value
            // without a second part falls back to page 0 instead of an undefined offset.
            // TODO: Please check the intended type of $this->MP.
            $mountPointParts = explode('-', (string) $this->MP);
            $mountPointId = (int) ($mountPointParts[1] ?? 0);
            $pageTSconfig = BackendUtility::getPagesTSconfig($mountPointId);
        }

        // Call a hook to alter configuration
        $hooks = $GLOBALS['TYPO3_CONF_VARS']['EXTCONF']['crawler']['getPageTSconfigForId'] ?? null;
        if (is_array($hooks)) {
            $params = [
                'pageId' => $id,
                // passed by reference so that hooks can modify the configuration
                'pageTSConfig' => &$pageTSconfig,
            ];
            foreach ($hooks as $userFunc) {
                GeneralUtility::callUserFunction($userFunc, $params, $this);
            }
        }

        return $pageTSconfig;
    }
538
539
    /**
     * This method returns an array of configurations.
     * Adds no urls!
     *
     * Configurations are collected from two sources, in this order:
     * 1. the "tx_crawler.crawlerCfg.paramSets." page TSconfig
     * 2. tx_crawler_configuration records found up the rootline; a record whose
     *    name was already defined by an earlier source is ignored
     * Finally the result is handed by reference to the "processUrls" hooks.
     *
     * @param int $pageId Page uid
     * @return array Configurations keyed by configuration name
     */
    public function getUrlsForPageId(int $pageId): array
    {
        // Get page TSconfig for page ID
        $pageTSconfig = $this->getPageTSconfigForId($pageId);

        $res = [];

        // Fetch Crawler Configuration from pageTSconfig
        $crawlerCfg = $pageTSconfig['tx_crawler.']['crawlerCfg.']['paramSets.'] ?? [];
        foreach ($crawlerCfg as $key => $values) {
            if (! is_array($values)) {
                continue;
            }
            $key = str_replace('.', '', (string) $key);
            // Sub configuration for a single configuration string:
            $subCfg = (array) $crawlerCfg[$key . '.'];
            $subCfg['key'] = $key;

            if ((string) ($subCfg['procInstrFilter'] ?? '') !== '') {
                $subCfg['procInstrFilter'] = implode(',', GeneralUtility::trimExplode(',', $subCfg['procInstrFilter']));
            }

            // "pidsOnly" is optional; a missing value means the configuration is not page-specific.
            $pidsOnly = (string) ($subCfg['pidsOnly'] ?? '');
            $pidOnlyList = implode(',', GeneralUtility::trimExplode(',', $pidsOnly, true));

            // process configuration if it is not page-specific or if the specific page is the current page:
            // TODO: Check if $pidOnlyList can be kept as Array instead of imploded
            if ($pidsOnly !== '' && ! GeneralUtility::inList($pidOnlyList, strval($pageId))) {
                continue;
            }

            // Explode, process etc.:
            $res[$key] = [];
            $res[$key]['subCfg'] = $subCfg;
            // The parameter string lives under the key without the trailing dot and may be absent.
            $res[$key]['paramParsed'] = GeneralUtility::explodeUrl2Array($crawlerCfg[$key] ?? '');
            $res[$key]['paramExpanded'] = $this->expandParameters($res[$key]['paramParsed'], $pageId);
            $res[$key]['origin'] = 'pagets';

            // recognize MP value
            $baseQuery = '?id=' . $pageId;
            if ($this->MP) {
                $baseQuery .= '&MP=' . $this->MP;
            }
            $res[$key]['URLs'] = $this->compileUrls($res[$key]['paramExpanded'], [$baseQuery]);
        }

        // Get configuration from tx_crawler_configuration records up the rootline
        $crawlerConfigurations = $this->configurationRepository->getCrawlerConfigurationRecordsFromRootLine($pageId);
        foreach ($crawlerConfigurations as $configurationRecord) {
            // check access to the configuration record
            $hasAccess = empty($configurationRecord['begroups'])
                || $this->getBackendUser()->isAdmin()
                || $this->hasGroupAccess($this->getBackendUser()->user['usergroup_cached_list'], $configurationRecord['begroups']);
            if (! $hasAccess) {
                continue;
            }

            // Cast as in the TSconfig branch above: strict_types would otherwise
            // reject a NULL column value in the string comparison.
            $pidsOnly = (string) ($configurationRecord['pidsonly'] ?? '');
            $pidOnlyList = implode(',', GeneralUtility::trimExplode(',', $pidsOnly, true));

            // process configuration if it is not page-specific or if the specific page is the current page:
            // TODO: Check if $pidOnlyList can be kept as Array instead of imploded
            if ($pidsOnly !== '' && ! GeneralUtility::inList($pidOnlyList, strval($pageId))) {
                continue;
            }

            $key = $configurationRecord['name'];

            // don't overwrite previously defined paramSets
            if (isset($res[$key])) {
                continue;
            }

            /* @var $TSparserObject \TYPO3\CMS\Core\TypoScript\Parser\TypoScriptParser */
            $TSparserObject = GeneralUtility::makeInstance(TypoScriptParser::class);
            $TSparserObject->parse($configurationRecord['processing_instruction_parameters_ts']);

            $subCfg = [
                'procInstrFilter' => $configurationRecord['processing_instruction_filter'],
                'procInstrParams.' => $TSparserObject->setup,
                'baseUrl' => $configurationRecord['base_url'],
                'force_ssl' => (int) $configurationRecord['force_ssl'],
                'userGroups' => $configurationRecord['fegroups'],
                'exclude' => $configurationRecord['exclude'],
                'key' => $key,
            ];

            if (in_array($pageId, $this->expandExcludeString($subCfg['exclude']), true)) {
                continue;
            }

            $res[$key] = [];
            $res[$key]['subCfg'] = $subCfg;
            $res[$key]['paramParsed'] = GeneralUtility::explodeUrl2Array($configurationRecord['configuration']);
            $res[$key]['paramExpanded'] = $this->expandParameters($res[$key]['paramParsed'], $pageId);
            $res[$key]['URLs'] = $this->compileUrls($res[$key]['paramExpanded'], ['?id=' . $pageId]);
            $res[$key]['origin'] = 'tx_crawler_configuration_' . $configurationRecord['uid'];
        }

        foreach ($GLOBALS['TYPO3_CONF_VARS']['EXTCONF']['crawler']['processUrls'] ?? [] as $func) {
            $params = [
                'res' => &$res,
            ];
            GeneralUtility::callUserFunction($func, $params, $this);
        }

        return $res;
    }
637
638
    /**
     * Find all configurations of subpages of a page
     *
     * Returns the names of the page TSconfig parameter sets of the root page,
     * followed by the names of all tx_crawler_configuration records stored on
     * the rootline pages of $rootid or on the pages PageTreeView::getTree()
     * yields for $rootid and $depth.
     *
     * TODO: Write Functional Tests
     */
    public function getConfigurationsForBranch(int $rootid, int $depth): array
    {
        $names = [];

        // Parameter sets from page TSconfig: only array values (keys with a trailing dot) count.
        $pageTSconfig = $this->getPageTSconfigForId($rootid);
        $paramSets = $pageTSconfig['tx_crawler.']['crawlerCfg.']['paramSets.'] ?? [];
        foreach ($paramSets as $setKey => $setValue) {
            if (is_array($setValue)) {
                $endsWithDot = substr($setKey, -1) === '.';
                $names[] = $endsWithDot ? substr($setKey, 0, -1) : $setKey;
            }
        }

        // Page uids to look at: first the rootline ...
        $pageUids = [];
        foreach (BackendUtility::BEgetRootLine($rootid) as $rootLinePage) {
            $pageUids[] = $rootLinePage['uid'];
        }

        // ... then the pages of the tree below, restricted to what the backend user may see.
        /* @var PageTreeView $pageTree */
        $pageTree = GeneralUtility::makeInstance(PageTreeView::class);
        $permsClause = $this->getBackendUser()->getPagePermsClause(Permission::PAGE_SHOW);
        $pageTree->init(empty($permsClause) ? '' : ('AND ' . $permsClause));
        $pageTree->getTree($rootid, $depth, '');
        foreach ($pageTree->tree as $treeNode) {
            $pageUids[] = $treeNode['row']['uid'];
        }

        $queryBuilder = $this->getQueryBuilder('tx_crawler_configuration');
        $pidConstraint = $queryBuilder->expr()->in(
            'pid',
            $queryBuilder->createNamedParameter($pageUids, Connection::PARAM_INT_ARRAY)
        );
        $statement = $queryBuilder
            ->select('name')
            ->from('tx_crawler_configuration')
            ->where($pidConstraint)
            ->execute();

        while ($record = $statement->fetch()) {
            $names[] = $record['name'];
        }

        return $names;
    }
681
682
    /**
     * Check if a user has access to an item
     * (e.g. get the group list of the current logged in user from $GLOBALS['TSFE']->gr_list)
     *
     * @param string $groupList Comma-separated list of (fe_)group UIDs from a user
     * @param string $accessList Comma-separated list of (fe_)group UIDs of the item to access
     * @return bool TRUE if at least one of the users group UIDs is in the access list or the access list is empty
     * @see \TYPO3\CMS\Frontend\Page\PageRepository::getMultipleGroupsWhereClause()
     */
    public function hasGroupAccess($groupList, $accessList)
    {
        // An item without access restrictions is accessible to everybody.
        if (empty($accessList)) {
            return true;
        }

        $granted = false;
        $userGroupUids = GeneralUtility::intExplode(',', $groupList);
        foreach ($userGroupUids as $userGroupUid) {
            if (GeneralUtility::inList($accessList, $userGroupUid)) {
                $granted = true;
                break;
            }
        }

        return $granted;
    }
703
704
    /**
705
     * Will expand the parameters configuration to individual values. This follows a certain syntax of the value of each parameter.
706
     * Syntax of values:
707
     * - Basically: If the value is wrapped in [...] it will be expanded according to the following syntax, otherwise the value is taken literally
708
     * - Configuration is splitted by "|" and the parts are processed individually and finally added together
709
     * - For each configuration part:
710
     *         - "[int]-[int]" = Integer range, will be expanded to all values in between, values included, starting from low to high (max. 1000). Example "1-34" or "-40--30"
711
     *         - "_TABLE:[TCA table name];[_PID:[optional page id, default is current page]];[_ENABLELANG:1]" = Look up of table records from PID, filtering out deleted records. Example "_TABLE:tt_content; _PID:123"
712
     *        _ENABLELANG:1 picks only original records without their language overlays
713
     *         - Default: Literal value
714
     *
715
     * @param array $paramArray Array with key (GET var name) and values (value of GET var which is configuration for expansion)
716
     * @param integer $pid Current page ID
717
     * @return array
718
     *
719
     * TODO: Write Functional Tests
720
     */
721 11
    public function expandParameters($paramArray, $pid)
    {
        // Traverse parameter names:
        foreach ($paramArray as $p => $v) {
            $v = trim($v);

            // Only values wrapped in [...] are expanded; anything else is a literal
            // that becomes the single entry of its value set.
            if (strpos($v, '[') !== 0 || substr($v, -1) !== ']') {
                $paramArray[$p] = [$v];
                continue;
            }

            // Strip the brackets and restart the value set for this parameter.
            // The cast covers "[]": before PHP 8 substr() returns FALSE for it, which
            // explode() would reject under strict_types.
            $v = (string) substr($v, 1, -1);
            $paramArray[$p] = [];

            // Explode parts and traverse them:
            foreach (explode('|', $v) as $pV) {
                $trimmedPart = trim($pV);

                // Look for integer range: (fx. 1-34 or -40--30 // reads minus 40 to minus 30)
                if (preg_match('/^(-?[0-9]+)\s*-\s*(-?[0-9]+)$/', $trimmedPart, $reg)) {
                    $reg = $this->swapIfFirstIsLargerThanSecond($reg);

                    // Traverse range, add values. The brake limits a range to 1000 values.
                    $runAwayBrake = 1000;
                    for ($a = $reg[1]; $a <= $reg[2]; $a++) {
                        $paramArray[$p][] = $a;
                        $runAwayBrake--;
                        if ($runAwayBrake <= 0) {
                            break;
                        }
                    }
                } elseif (strpos($trimmedPart, '_TABLE:') === 0) {
                    // Parse "key:value" sub-parameters separated by ";".
                    // array_pad() keeps a sub-part without ":" from raising an undefined offset;
                    // its value stays NULL exactly as before.
                    $subpartParams = [];
                    foreach (GeneralUtility::trimExplode(';', $pV) as $spV) {
                        [$pKey, $pVal] = array_pad(GeneralUtility::trimExplode(':', $spV), 2, null);
                        $subpartParams[$pKey] = $pVal;
                    }

                    $tableName = $subpartParams['_TABLE'] ?? '';
                    $fieldName = ! empty($subpartParams['_FIELD']) ? $subpartParams['_FIELD'] : 'uid';
                    $isKnownField = $fieldName === 'uid' || ! empty($GLOBALS['TCA'][$tableName]['columns'][$fieldName]);

                    // Only query tables registered in TCA, and only for "uid" or a configured column.
                    if (isset($GLOBALS['TCA'][$tableName]) && $isKnownField) {
                        $lookUpPid = (int) ($subpartParams['_PID'] ?? $pid);
                        $recursiveDepth = (int) ($subpartParams['_RECURSIVE'] ?? 0);
                        $pidField = isset($subpartParams['_PIDFIELD']) ? trim($subpartParams['_PIDFIELD']) : 'pid';
                        // NOTE(review): _WHERE and _ADDTABLE go into the query as raw SQL fragments;
                        // they come from the crawler configuration and must be trusted input.
                        $where = $subpartParams['_WHERE'] ?? '';
                        $addTable = $subpartParams['_ADDTABLE'] ?? '';

                        $queryBuilder = $this->getQueryBuilder($tableName);

                        if ($recursiveDepth > 0) {
                            /** @var \TYPO3\CMS\Core\Database\QueryGenerator $queryGenerator */
                            $queryGenerator = GeneralUtility::makeInstance(\TYPO3\CMS\Core\Database\QueryGenerator::class);
                            $pidList = $queryGenerator->getTreeList($lookUpPid, $recursiveDepth, 0, 1);
                            $pidArray = GeneralUtility::intExplode(',', $pidList);
                        } else {
                            $pidArray = [(string) $lookUpPid];
                        }

                        // Only the "deleted" restriction stays active for the lookup.
                        $queryBuilder->getRestrictions()
                            ->removeAll()
                            ->add(GeneralUtility::makeInstance(DeletedRestriction::class));

                        $queryBuilder
                            ->select($fieldName)
                            ->from($tableName)
                            ->where(
                                $queryBuilder->expr()->in($pidField, $queryBuilder->createNamedParameter($pidArray, Connection::PARAM_INT_ARRAY)),
                                $where
                            );

                        if (! empty($addTable)) {
                            // TODO: Check if this works as intended!
                            $queryBuilder->add('from', $addTable);
                        }

                        // _ENABLELANG: restrict to records that are not language overlays
                        // (translation pointer field <= 0), if the table has such a field.
                        $transOrigPointerField = $GLOBALS['TCA'][$tableName]['ctrl']['transOrigPointerField'] ?? null;
                        if (! empty($subpartParams['_ENABLELANG']) && $transOrigPointerField) {
                            $queryBuilder->andWhere(
                                $queryBuilder->expr()->lte(
                                    $transOrigPointerField,
                                    0
                                )
                            );
                        }

                        $statement = $queryBuilder->execute();

                        // Index by field value so each value is collected once, in first-seen order.
                        $rows = [];
                        while ($row = $statement->fetch()) {
                            $rows[$row[$fieldName]] = $row;
                        }

                        $paramArray[$p] = array_merge($paramArray[$p], array_keys($rows));
                    }
                } else {
                    // Just add value:
                    $paramArray[$p][] = $pV;
                }

                // Hook for processing own expandParameters place holder
                $expandHooks = $GLOBALS['TYPO3_CONF_VARS']['SC_OPTIONS']['crawler/class.tx_crawler_lib.php']['expandParameters'] ?? null;
                if (is_array($expandHooks)) {
                    $_params = [
                        'pObj' => &$this,
                        'paramArray' => &$paramArray,
                        'currentKey' => $p,
                        'currentValue' => $pV,
                        'pid' => $pid,
                    ];
                    foreach ($expandHooks as $_funcRef) {
                        GeneralUtility::callUserFunction($_funcRef, $_params, $this);
                    }
                }
            }

            // Make unique set of values and sort the parameter array by key (GET var name):
            $paramArray[$p] = array_unique($paramArray[$p]);
            ksort($paramArray);
        }

        return $paramArray;
    }
851
852
    /**
853
     * Compiling URLs from parameter array (output of expandParameters())
854
     * The number of URLs will be the multiplication of the number of parameter values for each key
855
     *
856
     * @param array $paramArray Output of expandParameters(): Array with keys (GET var names) and for each an array of values
857
     * @param array $urls URLs accumulated in this array (for recursion)
858
     * @return array
859
     */
860 7
    public function compileUrls($paramArray, array $urls)
    {
        if (empty($paramArray)) {
            return $urls;
        }

        // Take the first parameter (name + value set) off the stack. unset() is used
        // instead of array_shift() because array_shift() renumbers integer keys, which
        // would rename numeric GET var names for the recursive calls.
        reset($paramArray);
        $firstKey = key($paramArray);
        $valueSet = $paramArray[$firstKey];
        unset($paramArray[$firstKey]);

        // Numeric names are stored as integer keys; rawurlencode() needs a string under strict_types.
        $varName = (string) $firstKey;

        // Combine every URL collected so far with every value of this parameter:
        $newUrls = [];
        foreach ($urls as $url) {
            foreach ($valueSet as $val) {
                // Leave BOTH loops once the limit is reached; leaving only the inner loop
                // would let every remaining base URL add one more entry.
                if (count($newUrls) >= $this->maximumUrlsToCompile) {
                    break 2;
                }

                $val = (string) $val;
                // An empty value leaves the URL untouched instead of adding "&name=".
                $newUrls[] = $url . ($val !== '' ? '&' . rawurlencode($varName) . '=' . rawurlencode($val) : '');
            }
        }

        return $this->compileUrls($paramArray, $newUrls);
    }
883
884
    /************************************
885
     *
886
     * Crawler log
887
     *
888
     ************************************/
889
890
    /**
891
     * Return array of records from crawler queue for input page ID
892
     *
893
     * @param integer $id Page ID for which to look up log entries.
894
     * @param string $filter Filter: "all" => all entries, "pending" => all that is not yet run, "finished" => all complete ones
895
     * @param boolean $doFlush If TRUE, then entries are DELETED(!) instead of selected!
896
     * @param boolean $doFullFlush
897
     * @param integer $itemsPerPage Limit the amount of entries per page default is 10
898
     * @return array
899
     */
900 4
    public function getLogEntriesForPageId($id, $filter = '', $doFlush = false, $doFullFlush = false, $itemsPerPage = 10)
    {
        $queryBuilder = GeneralUtility::makeInstance(ConnectionPool::class)->getQueryBuilderForTable($this->tableName);
        $queryBuilder
            ->select('*')
            ->from($this->tableName)
            ->where(
                $queryBuilder->expr()->eq('page_id', $queryBuilder->createNamedParameter($id, \PDO::PARAM_INT))
            )
            ->orderBy('scheduled', 'DESC');

        // Narrow the selection: "pending" = not executed yet, "finished" = already executed.
        // Any other filter value selects all entries of the page.
        switch ($filter) {
            case 'pending':
                $queryBuilder->andWhere($queryBuilder->expr()->eq('exec_time', 0));
                break;
            case 'finished':
                $queryBuilder->andWhere($queryBuilder->expr()->gt('exec_time', 0));
                break;
        }

        if ($doFlush) {
            // A full flush ignores the filter. The flush is delegated to the queue repository
            // and receives only the filter name, not the page id.
            $this->queueRepository->flushQueue($doFullFlush ? 'all' : $filter);
        }

        // A non-positive page size disables the limit.
        if ($itemsPerPage > 0) {
            $queryBuilder->setMaxResults((int) $itemsPerPage);
        }

        // The selection runs after a flush too, so it returns what is left in the queue.
        return $queryBuilder->execute()->fetchAll();
    }
941
942
    /**
943
     * Return array of records from crawler queue for input set ID
944
     *
945
     * @param int $set_id Set ID for which to look up log entries.
946
     * @param string $filter Filter: "all" => all entries, "pending" => all that is not yet run, "finished" => all complete ones
947
     * @param bool $doFlush If TRUE, then entries are DELETED(!) instead of selected!
948
     * @param int $itemsPerPage Limit the amount of entries per page default is 10
949
     * @return array
950
     *
951
     * @deprecated
952
     */
953 6
    public function getLogEntriesForSetId(int $set_id, string $filter = '', bool $doFlush = false, bool $doFullFlush = false, int $itemsPerPage = 10)
    {
        $queryBuilder = GeneralUtility::makeInstance(ConnectionPool::class)->getQueryBuilderForTable($this->tableName);
        $queryBuilder
            ->select('*')
            ->from($this->tableName)
            ->where(
                $queryBuilder->expr()->eq('set_id', $queryBuilder->createNamedParameter($set_id, \PDO::PARAM_INT))
            )
            ->orderBy('scheduled', 'DESC');

        // Two conditions are built in parallel: $queryBuilder drives the SELECT, while
        // $flushCondition (an AND composite) collects the same constraints for the flush call.
        $expressionBuilder = GeneralUtility::makeInstance(ConnectionPool::class)
            ->getConnectionForTable($this->tableName)
            ->getExpressionBuilder();
        $flushCondition = $expressionBuilder->andX();

        switch ($filter) {
            case 'pending':
                $queryBuilder->andWhere($queryBuilder->expr()->eq('exec_time', 0));
                $flushCondition->add($expressionBuilder->eq('exec_time', 0));
                break;
            case 'finished':
                $queryBuilder->andWhere($queryBuilder->expr()->gt('exec_time', 0));
                $flushCondition->add($expressionBuilder->gt('exec_time', 0));
                break;
        }

        if ($doFlush) {
            // Restrict the flush to this set; a full flush passes an empty condition instead.
            $addWhere = $flushCondition->add($expressionBuilder->eq('set_id', $set_id));
            $this->flushQueue($doFullFlush ? '' : $addWhere);
            return [];
        }

        // A non-positive page size disables the limit.
        if ($itemsPerPage > 0) {
            $queryBuilder->setMaxResults($itemsPerPage);
        }

        return $queryBuilder->execute()->fetchAll();
    }
994
995
    /**
996
     * Adding call back entries to log (typically called from hooks, see indexed search class "class.crawler.php")
997
     *
998
     * @param integer $setId Set ID
999
     * @param array $params Parameters to pass to call back function
1000
     * @param string $callBack Call back object reference, eg. 'EXT:indexed_search/class.crawler.php:&tx_indexedsearch_crawler'
1001
     * @param integer $page_id Page ID to attach it to
1002
     * @param integer $schedule Time at which to activate
1003
     */
1004
    public function addQueueEntry_callBack($setId, $params, $callBack, $page_id = 0, $schedule = 0): void
    {
        // Anything that is not an array is discarded; the call back reference is always stored.
        $callbackParameters = is_array($params) ? $params : [];
        $callbackParameters['_CALLBACKOBJ'] = $callBack;

        $queueConnection = GeneralUtility::makeInstance(ConnectionPool::class)
            ->getConnectionForTable('tx_crawler_queue');

        // A schedule of 0 means "now".
        $scheduledAt = (int) $schedule;
        if (! $scheduledAt) {
            $scheduledAt = $this->getCurrentTime();
        }

        $queueRow = [
            'page_id' => (int) $page_id,
            'parameters' => json_encode($callbackParameters),
            'scheduled' => $scheduledAt,
            'exec_time' => 0,
            'set_id' => (int) $setId,
            'result_data' => '',
        ];

        $queueConnection->insert('tx_crawler_queue', $queueRow);
    }
1024
1025
    /************************************
1026
     *
1027
     * URL setting
1028
     *
1029
     ************************************/
1030
1031
    /**
1032
     * Setting a URL for crawling:
1033
     *
1034
     * @param integer $id Page ID
1035
     * @param string $url Complete URL
1036
     * @param array $subCfg Sub configuration array (from TS config)
1037
     * @param integer $tstamp Scheduled-time
1038
     * @param string $configurationHash (optional) configuration hash
1039
     * @param bool $skipInnerDuplicationCheck (optional) skip inner duplication check
1040
     * @return bool
1041
     */
1042 8
    public function addUrl(
        $id,
        $url,
        array $subCfg,
        $tstamp,
        $configurationHash = '',
        $skipInnerDuplicationCheck = false
    ) {
        $urlAdded = false;
        $rows = [];

        // Creating parameters:
        $parameters = [
            'url' => $url,
        ];

        // fe user group simulation (optional in the sub configuration):
        $uGs = implode(',', array_unique(GeneralUtility::intExplode(',', $subCfg['userGroups'] ?? '', true)));
        if ($uGs) {
            $parameters['feUserGroupList'] = $uGs;
        }

        // Setting processing instructions (both keys are optional):
        $parameters['procInstructions'] = GeneralUtility::trimExplode(',', $subCfg['procInstrFilter'] ?? '');
        if (is_array($subCfg['procInstrParams.'] ?? null)) {
            $parameters['procInstrParams'] = $subCfg['procInstrParams.'];
        }

        // Compile value array. The hash of the serialized parameters is stored next to them.
        $parameters_serialized = json_encode($parameters);
        $fieldArray = [
            'page_id' => (int) $id,
            'parameters' => $parameters_serialized,
            'parameters_hash' => GeneralUtility::shortMD5($parameters_serialized),
            'configuration_hash' => $configurationHash,
            'scheduled' => $tstamp,
            'exec_time' => 0,
            'set_id' => (int) $this->setID,
            'result_data' => '',
            'configuration' => $subCfg['key'] ?? null,
        ];

        if ($this->registerQueueEntriesInternallyOnly) {
            // The entries will only be registered and not stored to the database.
            $this->queueEntries[] = $fieldArray;

            return $urlAdded;
        }

        if (! $skipInnerDuplicationCheck) {
            // Check if there is already an equal entry.
            $rows = $this->getDuplicateRowsIfExist($tstamp, $fieldArray);
        }

        if (empty($rows)) {
            $connectionForCrawlerQueue = GeneralUtility::makeInstance(ConnectionPool::class)->getConnectionForTable('tx_crawler_queue');
            $connectionForCrawlerQueue->insert(
                'tx_crawler_queue',
                $fieldArray
            );
            $uid = $connectionForCrawlerQueue->lastInsertId('tx_crawler_queue', 'qid');
            $urlAdded = true;

            $signalPayload = ['uid' => $uid, 'fieldArray' => $fieldArray];
            SignalSlotUtility::emitSignal(
                self::class,
                SignalSlotUtility::SIGNAL_URL_ADDED_TO_QUEUE,
                $signalPayload
            );
        } else {
            // Nothing is inserted for a duplicate; listeners are only informed about it.
            $signalPayload = ['rows' => $rows, 'fieldArray' => $fieldArray];
            SignalSlotUtility::emitSignal(
                self::class,
                SignalSlotUtility::SIGNAL_DUPLICATE_URL_IN_QUEUE,
                $signalPayload
            );
        }

        return $urlAdded;
    }
1121
1122
    /**
1123
     * Returns the current system time
1124
     *
1125
     * @return int
1126
     */
1127
    public function getCurrentTime()
    {
        // Current Unix timestamp in seconds.
        $now = time();

        return $now;
    }
1131
1132
    /************************************
1133
     *
1134
     * URL reading
1135
     *
1136
     ************************************/
1137
1138
    /**
1139
     * Read URL for single queue entry
1140
     *
1141
     * @param integer $queueId
1142
     * @param boolean $force If set, will process even if exec_time has been set!
1143
     * @return int|null Bitmask of CLI_STATUS_* flags, or NULL if no matching queue entry was found
1144
     */
1145
    public function readUrl($queueId, $force = false)
    {
        $queryBuilder = GeneralUtility::makeInstance(ConnectionPool::class)->getQueryBuilderForTable($this->tableName);
        $ret = 0;
        $this->logger->debug('crawler-readurl start ' . microtime(true));

        // Get entry:
        $queryBuilder
            ->select('*')
            ->from('tx_crawler_queue')
            ->where(
                $queryBuilder->expr()->eq('qid', $queryBuilder->createNamedParameter($queueId, \PDO::PARAM_INT))
            );
        if (! $force) {
            // Without force only entries that are scheduled for a process and not yet executed are picked up.
            $queryBuilder
                ->andWhere('exec_time = 0')
                ->andWhere('process_scheduled > 0');
        }
        $queueRec = $queryBuilder->execute()->fetch();

        if (! is_array($queueRec)) {
            // No matching entry: nothing to process, no status to report.
            return null;
        }

        SignalSlotUtility::emitSignal(
            self::class,
            SignalSlotUtility::SIGNAL_QUEUEITEM_PREPROCESS,
            [$queueId, &$queueRec]
        );

        // Set exec_time to lock record:
        $field_array = ['exec_time' => $this->getCurrentTime()];

        if (isset($this->processID)) {
            // If multiprocessing is used we need to store the id of the process which has handled this entry.
            $field_array['process_id_completed'] = $this->processID;
        }

        GeneralUtility::makeInstance(ConnectionPool::class)->getConnectionForTable('tx_crawler_queue')
            ->update(
                'tx_crawler_queue',
                $field_array,
                ['qid' => (int) $queueId]
            );

        $result = $this->queueExecutor->executeQueueItem($queueRec, $this);

        // Decode the returned content. Without content there is nothing to inspect, so
        // $resultData stays NULL and is never indexed like an array below.
        $resultData = null;
        if (($result['content'] ?? null) !== null) {
            /** @var JsonCompatibilityConverter $jsonCompatibilityConverter */
            $jsonCompatibilityConverter = GeneralUtility::makeInstance(JsonCompatibilityConverter::class);
            $resultData = $jsonCompatibilityConverter->convert($result['content']);
        }

        // ATM there's no need to point to specific pollable extensions.
        $pollables = $GLOBALS['TYPO3_CONF_VARS']['EXTCONF']['crawler']['pollSuccess'] ?? null;
        $procInstructions = is_array($resultData) ? ($resultData['parameters']['procInstructions'] ?? null) : null;

        if (is_array($pollables) && is_array($procInstructions)) {
            foreach ($pollables as $pollable) {
                // Only check the success value if the instruction is running.
                // It is important to name the pollSuccess key same as the procInstructions key.
                if (in_array($pollable, $procInstructions, true) && ! empty($resultData['success'][$pollable])) {
                    $ret |= self::CLI_STATUS_POLLABLE_PROCESSED;
                }
            }
        }

        // Set result in log which also denotes the end of the processing of this entry.
        $field_array = ['result_data' => json_encode($result)];

        SignalSlotUtility::emitSignal(
            self::class,
            SignalSlotUtility::SIGNAL_QUEUEITEM_POSTPROCESS,
            [$queueId, &$field_array]
        );

        GeneralUtility::makeInstance(ConnectionPool::class)->getConnectionForTable('tx_crawler_queue')
            ->update(
                'tx_crawler_queue',
                $field_array,
                ['qid' => (int) $queueId]
            );

        $this->logger->debug('crawler-readurl stop ' . microtime(true));
        return $ret;
    }
1235
1236
    /**
1237
     * Read URL for not-yet-inserted log-entry
1238
     *
1239
     * @param array $field_array Queue field array,
1240
     *
1241
     * @return array|bool|mixed|string
1242
     */
1243
    public function readUrlFromArray($field_array)
    {
        // Stamp exec_time first so the record is stored as already taken.
        $field_array['exec_time'] = $this->getCurrentTime();

        $queueConnection = GeneralUtility::makeInstance(ConnectionPool::class)->getConnectionForTable($this->tableName);
        $queueConnection->insert($this->tableName, $field_array);

        // The executor receives the entry including its freshly assigned queue id.
        $queueId = $queueConnection->lastInsertId($this->tableName, 'qid');
        $field_array['qid'] = $queueId;

        $result = $this->queueExecutor->executeQueueItem($field_array, $this);

        // Storing the result in the log also marks the end of processing for this entry.
        // Listeners receive the fields by reference and may adjust them before the update.
        $resultFields = ['result_data' => json_encode($result)];

        SignalSlotUtility::emitSignal(
            self::class,
            SignalSlotUtility::SIGNAL_QUEUEITEM_POSTPROCESS,
            [$queueId, &$resultFields]
        );

        $queueConnection->update($this->tableName, $resultFields, ['qid' => $queueId]);

        return $result;
    }
1273
1274
    /*****************************
1275
     *
1276
     * Compiling URLs to crawl - tools
1277
     *
1278
     *****************************/
1279
1280
    /**
1281
     * @param integer $id Root page id to start from.
1282
     * @param integer $depth Depth of tree, 0=only id-page, 1= on sublevel, 99 = infinite
1283
     * @param integer $scheduledTime Unix Time when the URL is timed to be visited when put in queue
1284
     * @param integer $reqMinute Number of requests per minute (creates the interleave between requests)
1285
     * @param boolean $submitCrawlUrls If set, submits the URLs to queue in database (real crawling)
1286
     * @param boolean $downloadCrawlUrls If set (and submitcrawlUrls is false) will fill $downloadUrls with entries)
1287
     * @param array $incomingProcInstructions Array of processing instructions
1288
     * @param array $configurationSelection Array of configuration keys
1289
     * @return string
1290
     */
1291
    public function getPageTreeAndUrls(
        $id,
        $depth,
        $scheduledTime,
        $reqMinute,
        $submitCrawlUrls,
        $downloadCrawlUrls,
        array $incomingProcInstructions,
        array $configurationSelection
    ) {
        // Remember the request settings on the instance for the row rendering below.
        $this->scheduledTime = $scheduledTime;
        $this->reqMinute = $reqMinute;
        $this->submitCrawlUrls = $submitCrawlUrls;
        $this->downloadCrawlUrls = $downloadCrawlUrls;
        $this->incomingProcInstructions = $incomingProcInstructions;
        $this->incomingConfigurationSelection = $configurationSelection;

        $this->duplicateTrack = [];
        $this->downloadUrls = [];

        // Drawing tree:
        /* @var PageTreeView $tree */
        $tree = GeneralUtility::makeInstance(PageTreeView::class);
        $perms_clause = $this->getBackendUser()->getPagePermsClause(Permission::PAGE_SHOW);
        $tree->init('AND ' . $perms_clause);

        $pageInfo = BackendUtility::readPageAccess($id, $perms_clause);
        if (is_array($pageInfo)) {
            // Set root row:
            $tree->tree[] = [
                'row' => $pageInfo,
                'HTML' => $this->iconFactory->getIconForRecord('pages', $pageInfo, Icon::SIZE_SMALL),
            ];
        }

        // Get branch beneath:
        if ($depth) {
            $tree->getTree($id, $depth, '');
        }

        // Traverse page tree:
        $code = '';

        foreach ($tree->tree as $data) {
            $this->MP = false;

            // Recognize mount points. The doktype is cast because the row value may be
            // a numeric string, which a strict comparison with the int constant would miss.
            if ((int) ($data['row']['doktype'] ?? 0) === PageRepository::DOKTYPE_MOUNTPOINT) {
                $queryBuilder = GeneralUtility::makeInstance(ConnectionPool::class)->getQueryBuilderForTable('pages');
                $queryBuilder->getRestrictions()->removeAll()->add(GeneralUtility::makeInstance(DeletedRestriction::class));
                $mountpage = $queryBuilder
                    ->select('*')
                    ->from('pages')
                    ->where(
                        $queryBuilder->expr()->eq('uid', $queryBuilder->createNamedParameter($data['row']['uid'], \PDO::PARAM_INT))
                    )
                    ->execute()
                    ->fetchAll();

                // Skip the mount handling if the page record could not be loaded.
                $mountRecord = $mountpage[0] ?? null;
                if (is_array($mountRecord)) {
                    // fetch mounted pages
                    $this->MP = $mountRecord['mount_pid'] . '-' . $data['row']['uid'];

                    $mountTree = GeneralUtility::makeInstance(PageTreeView::class);
                    $mountTree->init('AND ' . $perms_clause);
                    $mountTree->getTree($mountRecord['mount_pid'], $depth);

                    foreach ($mountTree->tree as $mountData) {
                        $code .= $this->drawURLs_addRowsForPage(
                            $mountData['row'],
                            $mountData['HTML'] . BackendUtility::getRecordTitle('pages', $mountData['row'], true)
                        );
                    }

                    // replace page when mount_pid_ol is enabled
                    if ($mountRecord['mount_pid_ol']) {
                        $data['row']['uid'] = $mountRecord['mount_pid'];
                    } else {
                        // if the mount_pid_ol is not set the MP must not be used for the mountpoint page
                        $this->MP = false;
                    }
                }
            }

            $code .= $this->drawURLs_addRowsForPage(
                $data['row'],
                $data['HTML'] . BackendUtility::getRecordTitle('pages', $data['row'], true)
            );
        }

        return $code;
    }
1382
1383
    /**
     * Expands an exclude string into the list of page uids it covers.
     *
     * The string is a comma separated list of entries of the form "<pid>"
     * (that page only), "<pid>+" (the page and its subtree, 99 levels deep)
     * or "<pid>+<depth>" (the page and <depth> levels below it).
     *
     * Expanded lists and the page trees fetched per pid/depth pair are kept
     * in function-static caches for the lifetime of the request.
     *
     * @param string $excludeString Exclude string
     * @return array Unique page uids, all cast to int
     */
    public function expandExcludeString($excludeString)
    {
        // internal static caches;
        static $expandedExcludeStringCache;
        static $treeCache;

        // A configuration without an exclude option may hand in null; normalise
        // it so it is always a valid array key for the cache below.
        $excludeString = (string) $excludeString;

        if (empty($expandedExcludeStringCache[$excludeString])) {
            $pidList = [];

            if (! empty($excludeString)) {
                /** @var PageTreeView $tree */
                $tree = GeneralUtility::makeInstance(PageTreeView::class);
                $tree->init('AND ' . $this->getBackendUser()->getPagePermsClause(Permission::PAGE_SHOW));

                $excludeParts = GeneralUtility::trimExplode(',', $excludeString);

                foreach ($excludeParts as $excludePart) {
                    // "<pid>" yields a single segment, so the depth segment is optional
                    $segments = GeneralUtility::trimExplode('+', $excludePart);
                    $pid = (int) $segments[0];
                    $rawDepth = $segments[1] ?? '';

                    if (empty($rawDepth)) {
                        // default is "page only" = "depth=0"; a "+" without a depth means the whole subtree
                        $depth = count($segments) > 1 ? 99 : 0;
                    } else {
                        $depth = (int) $rawDepth;
                    }

                    $pidList[] = $pid;

                    if ($depth > 0) {
                        if (empty($treeCache[$pid][$depth])) {
                            $tree->reset();
                            $tree->getTree($pid, $depth);
                            $treeCache[$pid][$depth] = $tree->tree;
                        }

                        foreach ($treeCache[$pid][$depth] as $data) {
                            $pidList[] = (int) $data['row']['uid'];
                        }
                    }
                }
            }

            $expandedExcludeStringCache[$excludeString] = array_unique($pidList);
        }

        return $expandedExcludeStringCache[$excludeString];
    }
1434
1435
    /**
     * Create the rows for display of the page tree
     * For each page a number of rows are shown displaying GET variable configuration
     *
     * @param array $pageRow Page record; its "uid" is checked against each configuration's exclude list
     * @param string $pageTitle Content of the title cell; it is inserted into the markup unescaped
     * @return string The <tr> rows for this page
     */
    public function drawURLs_addRowsForPage(array $pageRow, string $pageTitle): string
    {
        // presumably filled by reference in getUrlsForPageRow() - it is read again in the "no entries" branch
        $skipMessage = '';

        // Get list of configurations
        $configurations = $this->getUrlsForPageRow($pageRow, $skipMessage);

        if (! empty($this->incomingConfigurationSelection)) {
            // remove configuration that does not match the current selection
            foreach ($configurations as $confKey => $confArray) {
                if (! in_array($confKey, $this->incomingConfigurationSelection, true)) {
                    unset($configurations[$confKey]);
                }
            }
        }

        // expandExcludeString() returns integers only, so the uid has to be an
        // integer as well for the strict in_array() check below to ever match.
        $pageUid = (int) $pageRow['uid'];

        // Traverse parameter combinations:
        $c = 0;
        $content = '';
        if (! empty($configurations)) {
            foreach ($configurations as $confKey => $confArray) {
                // Sub configuration options are optional; read them through this guarded copy
                $subCfg = $confArray['subCfg'] ?? [];

                // Title column: only the first row carries it, spanning all rows of the page
                if (! $c) {
                    $titleClm = '<td rowspan="' . count($configurations) . '">' . $pageTitle . '</td>';
                } else {
                    $titleClm = '';
                }

                if (! in_array($pageUid, $this->expandExcludeString($subCfg['exclude'] ?? ''), true)) {

                    // URL list:
                    $urlList = $this->urlListFromUrlArray(
                        $confArray,
                        $pageRow,
                        $this->scheduledTime,
                        $this->reqMinute,
                        $this->submitCrawlUrls,
                        $this->downloadCrawlUrls,
                        $this->duplicateTrack,
                        $this->downloadUrls,
                        // if empty the urls won't be filtered by processing instructions
                        $this->incomingProcInstructions
                    );

                    // Expanded parameters:
                    $paramExpanded = '';
                    $calcAccu = [];
                    $calcRes = 1;
                    foreach ($confArray['paramExpanded'] ?? [] as $gVar => $gVal) {
                        $paramExpanded .= '
                            <tr>
                                <td>' . htmlspecialchars('&' . $gVar . '=') . '<br/>' .
                            '(' . count($gVal) . ')' .
                            '</td>
                                <td nowrap="nowrap">' . nl2br(htmlspecialchars(implode(chr(10), $gVal))) . '</td>
                            </tr>
                        ';
                        $calcRes *= count($gVal);
                        $calcAccu[] = count($gVal);
                    }
                    $paramExpanded = '<table>' . $paramExpanded . '</table>';
                    $paramExpanded .= 'Comb: ' . implode('*', $calcAccu) . '=' . $calcRes;

                    // Options
                    $optionValues = '';
                    if (! empty($subCfg['userGroups'])) {
                        $optionValues .= 'User Groups: ' . $subCfg['userGroups'] . '<br/>';
                    }
                    if (! empty($subCfg['procInstrFilter'])) {
                        $optionValues .= 'ProcInstr: ' . $subCfg['procInstrFilter'] . '<br/>';
                    }

                    // Compile row:
                    // (string) cast: numeric configuration keys arrive as int array keys,
                    // which htmlspecialchars() rejects under strict_types.
                    $content .= '
                        <tr>
                            ' . $titleClm . '
                            <td>' . htmlspecialchars((string) $confKey) . '</td>
                            <td>' . nl2br(htmlspecialchars(rawurldecode(trim(str_replace('&', chr(10) . '&', GeneralUtility::implodeArrayForUrl('', $confArray['paramParsed'] ?? [])))))) . '</td>
                            <td>' . $paramExpanded . '</td>
                            <td nowrap="nowrap">' . $urlList . '</td>
                            <td nowrap="nowrap">' . $optionValues . '</td>
                            <td nowrap="nowrap">' . DebugUtility::viewArray($subCfg['procInstrParams.'] ?? null) . '</td>
                        </tr>';
                } else {
                    $content .= '<tr>
                            ' . $titleClm . '
                            <td>' . htmlspecialchars((string) $confKey) . '</td>
                            <td colspan="5"><em>No entries</em> (Page is excluded in this configuration)</td>
                        </tr>';
                }

                $c++;
            }
        } else {
            $message = ! empty($skipMessage) ? ' (' . $skipMessage . ')' : '';

            // Compile row:
            $content .= '
                <tr>
                    <td>' . $pageTitle . '</td>
                    <td colspan="6"><em>No entries</em>' . $message . '</td>
                </tr>';
        }

        return $content;
    }
1546
1547
    /*****************************
1548
     *
1549
     * CLI functions
1550
     *
1551
     *****************************/
1552
1553
    /**
     * Running the functionality of the CLI (crawling URLs from queue)
     *
     * Runs the CLI hooks, cleans up the queue, claims the fetched queue entries
     * for the current process id and then reads them one by one.
     *
     * @param int $countInARun Passed to the queue repository when fetching the entries to crawl
     * @param int $sleepTime Pause after each entry, handed to usleep()
     * @param int $sleepAfterFinish Pause after the whole batch, handed to sleep()
     * @return int The readUrl() results OR-ed together with the self::CLI_STATUS_* flags
     */
    public function CLI_run(int $countInARun, int $sleepTime, int $sleepAfterFinish): int
    {
        $status = 0;
        $crawled = 0;

        // Hooks first, then queue clean-up, then the selection of entries
        $this->CLI_runHooks();
        $this->queueRepository->cleanupQueue();
        $pending = $this->queueRepository->fetchRecordsToBeCrawled($countInARun);

        if (empty($pending)) {
            $this->CLI_debug('Nothing within queue which needs to be processed (' . $this->CLI_buildProcessId() . ')');

            return $status;
        }

        $queueIds = [];
        foreach ($pending as $entry) {
            $queueIds[] = $entry['qid'];
        }

        $processId = $this->CLI_buildProcessId();

        // Remember how many entries got assigned so the processed count can be derived later
        $assigned = $this->queueRepository->updateProcessIdAndSchedulerForQueueIds($queueIds, $processId);
        $this->processRepository->updateProcessAssignItemsCount($assigned, $processId);

        if ($assigned !== count($queueIds)) {
            $this->CLI_debug('Nothing processed due to multi-process collision (' . $this->CLI_buildProcessId() . ')');

            return $status | self::CLI_STATUS_ABORTED;
        }

        foreach ($pending as $entry) {
            $status |= $this->readUrl($entry['qid']);
            ++$crawled;

            // Just to relax the system
            usleep($sleepTime);

            // The CLI may have been disabled while this entry was read: leave
            // right away and do NOT mark the process as ended.
            if ($this->getDisabled()) {
                return $status | self::CLI_STATUS_ABORTED;
            }

            if (! $this->processRepository->isProcessActive($this->CLI_buildProcessId())) {
                $this->CLI_debug('conflict / timeout (' . $this->CLI_buildProcessId() . ')');
                // possible timeout
                $status |= self::CLI_STATUS_ABORTED;
                break;
            }
        }

        sleep($sleepAfterFinish);

        $this->CLI_debug('Rows: ' . $crawled . ' (' . $this->CLI_buildProcessId() . ')');

        return $crawled > 0 ? $status | self::CLI_STATUS_PROCESSED : $status;
    }
1623
1624
    /**
     * Activate hooks
     *
     * Instantiates every class registered under
     * $GLOBALS['TYPO3_CONF_VARS']['EXTCONF']['crawler']['cli_hooks'] and calls
     * its crawler_init() method with this controller.
     */
    public function CLI_runHooks(): void
    {
        $hookClasses = $GLOBALS['TYPO3_CONF_VARS']['EXTCONF']['crawler']['cli_hooks'] ?? [];

        foreach ($hookClasses as $hookClass) {
            $hook = GeneralUtility::makeInstance($hookClass);
            if (! is_object($hook)) {
                continue;
            }

            $hook->crawler_init($this);
        }
    }
1636
1637
    /**
     * Try to acquire a new process with the given id
     * also performs some auto-cleanup for orphan processes
     *
     * Active process records whose ttl lies in the past are treated as orphans
     * and released; all other active records count against the configured
     * "processLimit".
     *
     * @param string $id identification string for the process
     * @return bool true if a process record was inserted, false if the process limit is reached
     *              or the system process id could not be determined
     * @todo preemption might not be the most elegant way to clean up
     */
    public function CLI_checkAndAcquireNewProcess($id)
    {
        // getmypid() returns false on failure
        $systemProcessId = getmypid();
        if ($systemProcessId === false || $systemProcessId < 1) {
            return false;
        }

        $processLimit = (int) $this->extensionSettings['processLimit'];
        $currentTime = $this->getCurrentTime();

        // Request the builder for the table that is actually queried, matching
        // the connection lookup used for the insert below.
        $statement = $this->getQueryBuilder('tx_crawler_process')
            ->select('process_id', 'ttl')
            ->from('tx_crawler_process')
            ->where(
                'active = 1 AND deleted = 0'
            )
            ->execute();

        $processCount = 0;
        $orphanProcesses = [];

        while ($row = $statement->fetch()) {
            if ($row['ttl'] < $currentTime) {
                $orphanProcesses[] = $row['process_id'];
            } else {
                $processCount++;
            }
        }

        // if there are less than allowed active processes then add a new one
        $acquired = $processCount < $processLimit;
        if ($acquired) {
            $this->CLI_debug('add process ' . $this->CLI_buildProcessId() . ' (' . ($processCount + 1) . '/' . $processLimit . ')');

            GeneralUtility::makeInstance(ConnectionPool::class)->getConnectionForTable('tx_crawler_process')->insert(
                'tx_crawler_process',
                [
                    'process_id' => $id,
                    'active' => 1,
                    'ttl' => $currentTime + (int) $this->extensionSettings['processMaxRunTime'],
                    'system_process_id' => $systemProcessId,
                ]
            );
        } else {
            $this->CLI_debug('Processlimit reached (' . ($processCount) . '/' . $processLimit . ')');
        }

        // The clean-up runs regardless of whether a new process was acquired
        $this->processRepository->deleteProcessesMarkedAsDeleted();
        $this->CLI_releaseProcesses($orphanProcesses);

        return $acquired;
    }
1698
1699
    /**
     * Release a process and the required resources
     *
     * Besides releasing the given ids through the process and queue
     * repositories, two clean-up statements are issued first: queue rows that
     * point to an inactive process get "process_scheduled" and "process_id"
     * reset, and inactive processes referenced by queue rows with
     * exec_time = 0 get their "system_process_id" reset.
     *
     * @param mixed $releaseIds string with a single process-id or array with multiple process-ids
     * @return bool false when an empty array was given (nothing to release), true otherwise
     */
    public function CLI_releaseProcesses($releaseIds)
    {
        $builder = GeneralUtility::makeInstance(ConnectionPool::class)->getQueryBuilderForTable($this->tableName);

        // A single id is wrapped so that the code below always deals with a list
        $processIds = is_array($releaseIds) ? $releaseIds : [$releaseIds];

        if (count($processIds) === 0) {
            //nothing to release
            return false;
        }

        // some kind of 2nd chance algo - this way you need at least 2 processes to have a real cleanup
        // this ensures that a single process can't mess up the entire process table

        // mark all processes as deleted which have no "waiting" queue-entires and which are not active

        $builder
            ->update($this->tableName, 'q')
            ->where(
                'q.process_id IN(SELECT p.process_id FROM tx_crawler_process as p WHERE p.active = 0)'
            )
            ->set('q.process_scheduled', 0)
            ->set('q.process_id', '')
            ->execute();

        // FIXME: Not entirely sure that this is equivalent to the previous version
        // The same builder instance is reused below, so the SET part of the first statement is dropped here.
        $builder->resetQueryPart('set');

        $builder
            ->update('tx_crawler_process')
            ->where(
                $builder->expr()->eq('active', 0),
                'process_id IN(SELECT q.process_id FROM tx_crawler_queue as q WHERE q.exec_time = 0)'
            )
            ->set('system_process_id', 0)
            ->execute();

        $this->processRepository->markRequestedProcessesAsNotActive($processIds);
        $this->queueRepository->unsetProcessScheduledAndProcessIdForQueueEntries($processIds);

        return true;
    }
1749
1750
    /**
     * Create a unique Id for the current process
     *
     * Thin wrapper that delegates to ProcessService::createProcessId() with
     * this controller's $processID.
     *
     * @return string the ID
     * @deprecated
     */
    public function CLI_buildProcessId(): string
    {
        $processId = ProcessService::createProcessId($this->processID);

        return $processId;
    }
1760
1761
    /**
     * Prints a message to the stdout (only if debug-mode is enabled)
     *
     * Debug mode is the "processDebug" extension setting; any value that
     * casts to a non-zero integer enables the output.
     *
     * @param string $msg the message
     */
    public function CLI_debug($msg): void
    {
        $debugEnabled = (int) $this->extensionSettings['processDebug'] !== 0;
        if (! $debugEnabled) {
            return;
        }

        echo $msg . "\n";
        flush();
    }
1773
1774
    /**
     * Cleans up entries that stayed for too long in the queue.
     *
     * Removes entries that were executed longer ago than the
     * "cleanUpProcessedAge" setting and entries scheduled longer ago than the
     * "cleanUpScheduledAge" setting; both settings are multiplied by 86400,
     * i.e. they are given in days.
     *
     * @deprecated
     */
    public function cleanUpOldQueueEntries(): void
    {
        // 24*60*60 Seconds in 24 hours
        $secondsPerDay = 86400;

        $processedMaxAge = $this->extensionSettings['cleanUpProcessedAge'] * $secondsPerDay;
        $scheduledMaxAge = $this->extensionSettings['cleanUpScheduledAge'] * $secondsPerDay;

        $now = time();
        $processedBefore = $now - $processedMaxAge;
        $scheduledBefore = $now - $scheduledMaxAge;

        $this->flushQueue(
            '(exec_time<>0 AND exec_time<' . $processedBefore . ') OR scheduled<=' . $scheduledBefore
        );
    }
1791
1792
    /**
     * Removes queue entries
     *
     * Before the rows are deleted, one SIGNAL_QUEUE_ENTRY_FLUSH signal is
     * emitted per distinct set_id, carrying the matching rows (qid, set_id)
     * as "subSet".
     *
     * @param string $where SQL related filter for the entries which should be removed;
     *                      it is used verbatim as a WHERE condition, so it must never contain untrusted input.
     *                      An empty value removes all entries.
     *
     * @deprecated
     */
    protected function flushQueue($where = ''): void
    {
        $realWhere = strlen((string) $where) > 0 ? $where : '1=1';

        // Every statement gets its own builder so that no query parts or
        // bound parameters leak from one statement into the next.
        $groups = $this->getQueryBuilder($this->tableName)
            ->selectLiteral('DISTINCT set_id')
            ->from($this->tableName)
            ->where($realWhere)
            ->execute()
            ->fetchAll();

        foreach ($groups as $group) {
            $subSetQuery = $this->getQueryBuilder($this->tableName);
            $subSet = $subSetQuery
                ->select('qid', 'set_id')
                ->from($this->tableName)
                ->where(
                    $realWhere,
                    // bind the value instead of concatenating it into the SQL
                    $subSetQuery->expr()->eq(
                        'set_id',
                        $subSetQuery->createNamedParameter($group['set_id'], \PDO::PARAM_INT)
                    )
                )
                ->execute()
                ->fetchAll();

            $payLoad = ['subSet' => $subSet];
            SignalSlotUtility::emitSignal(
                self::class,
                SignalSlotUtility::SIGNAL_QUEUE_ENTRY_FLUSH,
                $payLoad
            );
        }

        $this->getQueryBuilder($this->tableName)
            ->delete($this->tableName)
            ->where($realWhere)
            ->execute();
    }
1837
1838
    /**
     * This method determines duplicates for a queue entry with the same parameters and this timestamp.
     * If the timestamp is in the past, it will check if there is any unprocessed queue entry in the past.
     * If the timestamp is in the future it will check, if the queued entry has exactly the same timestamp
     *
     * Only entries that have not been executed (exec_time = 0) and are not
     * assigned to a process (process_id = '') are considered.
     *
     * @param int $tstamp
     * @param array $fieldArray Must contain the keys "page_id" and "parameters_hash"
     *
     * @return array The qid values of all matching queue entries
     */
    protected function getDuplicateRowsIfExist($tstamp, $fieldArray)
    {
        $tstamp = (int) $tstamp;
        $currentTime = $this->getCurrentTime();

        $queryBuilder = $this->getQueryBuilder('tx_crawler_queue');
        $queryBuilder
            ->select('qid')
            ->from('tx_crawler_queue');

        if ($tstamp > $currentTime) {
            //entry with a timestamp in the future need to have the same schedule time
            $queryBuilder->where(
                $queryBuilder->expr()->eq('scheduled', $queryBuilder->createNamedParameter($tstamp, \PDO::PARAM_INT))
            );
        } elseif ($this->extensionSettings['enableTimeslot']) {
            //if this entry is scheduled with "now": also accept entries within +/- 100 seconds
            $timeBegin = $currentTime - 100;
            $timeEnd = $currentTime + 100;
            $queryBuilder
                ->where(
                    'scheduled BETWEEN ' . $timeBegin . ' AND ' . $timeEnd . ''
                )
                ->orWhere(
                    $queryBuilder->expr()->lte('scheduled', $currentTime)
                );
        } else {
            //if this entry is scheduled with "now"
            $queryBuilder->where(
                $queryBuilder->expr()->lte('scheduled', $currentTime)
            );
        }

        // Explicit comparisons instead of "NOT <column>": negating the string
        // column process_id relies on an implicit string-to-number cast, which
        // is not portable and misjudges ids that do not start with a digit.
        $queryBuilder
            ->andWhere($queryBuilder->expr()->eq('exec_time', 0))
            ->andWhere($queryBuilder->expr()->eq('process_id', $queryBuilder->createNamedParameter('', \PDO::PARAM_STR)))
            ->andWhere($queryBuilder->expr()->eq('page_id', $queryBuilder->createNamedParameter($fieldArray['page_id'], \PDO::PARAM_INT)))
            ->andWhere($queryBuilder->expr()->eq('parameters_hash', $queryBuilder->createNamedParameter($fieldArray['parameters_hash'], \PDO::PARAM_STR)));

        $statement = $queryBuilder->execute();

        $rows = [];
        while ($row = $statement->fetch()) {
            $rows[] = $row['qid'];
        }

        return $rows;
    }
1898
1899
    /**
     * Returns a md5 hash generated from a serialized configuration array.
     *
     * The keys "paramExpanded" and "URLs" are removed before hashing, so two
     * configurations that differ only in those keys get the same hash.
     *
     * @return string
     */
    protected function getConfigurationHash(array $configuration)
    {
        unset($configuration['paramExpanded'], $configuration['URLs']);

        $serialized = serialize($configuration);

        return md5($serialized);
    }
1910
1911
    /**
     * Build a URL from a Page and the Query String. If the page has a Site configuration, it can be built by using
     * the Site instance.
     *
     * All arguments are forwarded unchanged to a fresh UrlService instance.
     *
     * @param int $httpsOrHttp see tx_crawler_configuration.force_ssl
     * @throws \TYPO3\CMS\Core\Exception\SiteNotFoundException
     * @throws \TYPO3\CMS\Core\Routing\InvalidRouteArgumentsException
     *
     * @deprecated Using CrawlerController::getUrlFromPageAndQueryParameters() is deprecated since 9.1.1 and will be removed in v11.x, please use UrlService->getUrlFromPageAndQueryParameters() instead.
     */
    protected function getUrlFromPageAndQueryParameters(int $pageId, string $queryString, ?string $alternativeBaseUrl, int $httpsOrHttp): UriInterface
    {
        return (new UrlService())->getUrlFromPageAndQueryParameters(
            $pageId,
            $queryString,
            $alternativeBaseUrl,
            $httpsOrHttp
        );
    }
1926
1927 1
    /**
     * Puts the values at index 1 and 2 into ascending order.
     *
     * @param array $reg Array with (at least) the indexes 1 and 2
     * @return array The array with index 1 and 2 exchanged if the value at 1 was larger than the one at 2
     */
    protected function swapIfFirstIsLargerThanSecond(array $reg): array
    {
        if ($reg[1] <= $reg[2]) {
            // Already in order
            return $reg;
        }

        [$reg[1], $reg[2]] = [$reg[2], $reg[1]];

        return $reg;
    }
1938
1939
    /**
     * Returns the backend user, taking it from $GLOBALS['BE_USER'] on first use.
     *
     * @return BackendUserAuthentication
     */
    private function getBackendUser()
    {
        // Make sure the _cli_ user is loaded
        Bootstrap::initializeBackendAuthentication();

        if ($this->backendUser !== null) {
            return $this->backendUser;
        }

        $this->backendUser = $GLOBALS['BE_USER'];

        return $this->backendUser;
    }
1951
1952
    /**
     * Get querybuilder for given table
     *
     * @param string $table Table name handed to the connection pool
     * @return \TYPO3\CMS\Core\Database\Query\QueryBuilder
     */
    private function getQueryBuilder(string $table)
    {
        $connectionPool = GeneralUtility::makeInstance(ConnectionPool::class);

        return $connectionPool->getQueryBuilderForTable($table);
    }
1961
}
1962