Passed
Push — wip/remove-deprecations-for-v1... ( 107830...7e6bff )
by Tomas Norre
05:46
created

CrawlerController   F

Complexity

Total Complexity 160

Size/Duplication

Total Lines 1455
Duplicated Lines 0 %

Test Coverage

Coverage 65.39%

Importance

Changes 15
Bugs 0 Features 0
Metric Value
wmc 160
eloc 621
c 15
b 0
f 0
dl 0
loc 1455
ccs 376
cts 575
cp 0.6539
rs 1.979

29 Methods

Rating   Name   Duplication   Size   Complexity  
A swapIfFirstIsLargerThanSecond() 0 10 2
A getMaximumUrlsToCompile() 0 3 1
B CLI_run() 0 57 8
A getCurrentTime() 0 3 1
A CLI_buildProcessId() 0 6 2
A readUrlFromArray() 0 22 1
F expandParameters() 0 129 25
B urlListFromUrlArray() 0 68 8
A getConfigurationHash() 0 5 1
B checkIfPageShouldBeSkipped() 0 33 9
A CLI_releaseProcesses() 0 43 3
A getQueryBuilder() 0 3 1
B drawURLs_addRowsForPage() 0 98 9
A addQueueEntry_callBack() 0 17 3
A setMaximumUrlsToCompile() 0 3 1
A __construct() 0 24 3
A getUrlsForPageRow() 0 17 3
C getUrlsForPageId() 0 93 16
A compileUrls() 0 18 6
A getPageTSconfigForId() 0 21 4
A getBackendUser() 0 8 2
B addUrl() 0 70 6
A drawURLs_PIfilter() 0 12 4
A CLI_checkAndAcquireNewProcess() 0 43 5
B readUrl() 0 77 11
B getPageTreeAndUrls() 0 80 7
A setExtensionSettings() 0 3 1
B getConfigurationsForBranch() 0 38 8
B expandExcludeString() 0 44 9

How to fix   Complexity   

Complex Class

Complex classes like CrawlerController often do a lot of different things. To break such a class down, we need to identify a cohesive component within that class. A common approach to find such a component is to look for fields/methods that share the same prefixes, or suffixes.

Once you have determined the fields that belong together, you can apply the Extract Class refactoring. If the component makes sense as a sub-class, Extract Subclass is also a candidate, and is often faster.

While breaking up the class, it is a good idea to analyze how other classes use CrawlerController, and based on these observations, apply Extract Interface, too.

1
<?php
2
3
declare(strict_types=1);
4
5
namespace AOE\Crawler\Controller;
6
7
/***************************************************************
8
 *  Copyright notice
9
 *
10
 *  (c) 2020 AOE GmbH <[email protected]>
11
 *
12
 *  All rights reserved
13
 *
14
 *  This script is part of the TYPO3 project. The TYPO3 project is
15
 *  free software; you can redistribute it and/or modify
16
 *  it under the terms of the GNU General Public License as published by
17
 *  the Free Software Foundation; either version 3 of the License, or
18
 *  (at your option) any later version.
19
 *
20
 *  The GNU General Public License can be found at
21
 *  http://www.gnu.org/copyleft/gpl.html.
22
 *
23
 *  This script is distributed in the hope that it will be useful,
24
 *  but WITHOUT ANY WARRANTY; without even the implied warranty of
25
 *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
26
 *  GNU General Public License for more details.
27
 *
28
 *  This copyright notice MUST APPEAR in all copies of the script!
29
 ***************************************************************/
30
31
use AOE\Crawler\Configuration\ExtensionConfigurationProvider;
32
use AOE\Crawler\Converter\JsonCompatibilityConverter;
33
use AOE\Crawler\Crawler;
34
use AOE\Crawler\CrawlStrategy\CrawlStrategyFactory;
35
use AOE\Crawler\Domain\Model\Process;
36
use AOE\Crawler\Domain\Repository\ConfigurationRepository;
37
use AOE\Crawler\Domain\Repository\ProcessRepository;
38
use AOE\Crawler\Domain\Repository\QueueRepository;
39
use AOE\Crawler\QueueExecutor;
40
use AOE\Crawler\Service\ConfigurationService;
41
use AOE\Crawler\Service\UrlService;
42
use AOE\Crawler\Service\UserService;
43
use PDO;
44
use Psr\Log\LoggerAwareInterface;
45
use Psr\Log\LoggerAwareTrait;
46
use TYPO3\CMS\Backend\Tree\View\PageTreeView;
47
use TYPO3\CMS\Backend\Utility\BackendUtility;
48
use TYPO3\CMS\Core\Authentication\BackendUserAuthentication;
49
use TYPO3\CMS\Core\Compatibility\PublicMethodDeprecationTrait;
50
use TYPO3\CMS\Core\Compatibility\PublicPropertyDeprecationTrait;
51
use TYPO3\CMS\Core\Core\Bootstrap;
52
use TYPO3\CMS\Core\Database\Connection;
53
use TYPO3\CMS\Core\Database\ConnectionPool;
54
use TYPO3\CMS\Core\Database\Query\QueryBuilder;
55
use TYPO3\CMS\Core\Database\Query\Restriction\DeletedRestriction;
56
use TYPO3\CMS\Core\Database\QueryGenerator;
57
use TYPO3\CMS\Core\Domain\Repository\PageRepository;
58
use TYPO3\CMS\Core\Imaging\Icon;
59
use TYPO3\CMS\Core\Imaging\IconFactory;
60
use TYPO3\CMS\Core\Type\Bitmask\Permission;
61
use TYPO3\CMS\Core\TypoScript\Parser\TypoScriptParser;
62
use TYPO3\CMS\Core\Utility\DebugUtility;
63
use TYPO3\CMS\Core\Utility\GeneralUtility;
64
use TYPO3\CMS\Core\Utility\MathUtility;
65
use TYPO3\CMS\Extbase\Object\ObjectManager;
66
67
/**
68
 * Class CrawlerController
69
 *
70
 * @package AOE\Crawler\Controller
71
 */
72
class CrawlerController implements LoggerAwareInterface
73
{
74
    use LoggerAwareTrait;
    use PublicMethodDeprecationTrait;
    use PublicPropertyDeprecationTrait;

    public const CLI_STATUS_NOTHING_PROCCESSED = 0;

    // queue not empty
    public const CLI_STATUS_REMAIN = 1;

    // (some) queue items were processed
    public const CLI_STATUS_PROCESSED = 2;

    // instance didn't finish
    public const CLI_STATUS_ABORTED = 4;

    public const CLI_STATUS_POLLABLE_PROCESSED = 8;

    /**
     * @var int
     */
    public $setID = 0;

    /**
     * @var string
     */
    public $processID = '';

    /**
     * @var array
     */
    public $duplicateTrack = [];

    /**
     * @var array
     */
    public $downloadUrls = [];

    /**
     * @var array
     */
    public $incomingProcInstructions = [];

    /**
     * @var array
     */
    public $incomingConfigurationSelection = [];

    /**
     * @var bool
     */
    public $registerQueueEntriesInternallyOnly = false;

    /**
     * @var array
     */
    public $queueEntries = [];

    /**
     * @var array
     */
    public $urlList = [];

    /**
     * @var array
     */
    public $extensionSettings = [];

    /**
     * Mount Point
     *
     * Defaults to false (no mount point). When set, the value is handled as a
     * string: getPageTSconfigForId() splits it on "-" and getUrlsForPageId()
     * appends it to the URL as "&MP=<value>".
     *
     * @var string|bool
     */
    public $MP = false;

    /**
     * @var QueueRepository
     */
    protected $queueRepository;

    /**
     * @var ProcessRepository
     */
    protected $processRepository;

    /**
     * @var ConfigurationRepository
     */
    protected $configurationRepository;

    /**
     * @var string
     */
    protected $tableName = 'tx_crawler_queue';

    /**
     * @var QueueExecutor
     */
    protected $queueExecutor;

    /**
     * @var int
     */
    protected $maximumUrlsToCompile = 10000;

    /**
     * @var IconFactory
     */
    protected $iconFactory;

    /**
     * Required by PublicMethodDeprecationTrait; empty because no public
     * method of this class is registered as deprecated.
     *
     * @var array
     */
    private $deprecatedPublicMethods = [];

    /**
     * Required by PublicPropertyDeprecationTrait; empty because no public
     * property of this class is registered as deprecated.
     *
     * @var array
     */
    private $deprecatedPublicProperties = [];

    /**
     * @var BackendUserAuthentication|null
     */
    private $backendUser;

    /**
     * @var int
     */
    private $scheduledTime = 0;

    /**
     * @var int
     */
    private $reqMinute = 0;

    /**
     * @var bool
     */
    private $submitCrawlUrls = false;

    /**
     * @var bool
     */
    private $downloadCrawlUrls = false;

    /**
     * @var PageRepository
     */
    private $pageRepository;

    /**
     * @var Crawler
     */
    private $crawler;
218
219
    /************************************
220
     *
221
     * Getting URLs based on Page TSconfig
222
     *
223
     ************************************/
224
225 19
    /**
     * Instantiates the repositories and services used by the controller and
     * loads the extension settings, applying defaults for "countInARun",
     * "processLimit" and "maxCompileUrls".
     */
    public function __construct()
    {
        $objectManager = GeneralUtility::makeInstance(ObjectManager::class);
        $crawlStrategyFactory = GeneralUtility::makeInstance(CrawlStrategyFactory::class);
        $this->queueRepository = $objectManager->get(QueueRepository::class);
        $this->processRepository = $objectManager->get(ProcessRepository::class);
        $this->configurationRepository = $objectManager->get(ConfigurationRepository::class);
        $this->pageRepository = GeneralUtility::makeInstance(PageRepository::class);
        $this->queueExecutor = GeneralUtility::makeInstance(QueueExecutor::class, $crawlStrategyFactory);
        $this->iconFactory = GeneralUtility::makeInstance(IconFactory::class);
        $this->crawler = GeneralUtility::makeInstance(Crawler::class);

        /** @var ExtensionConfigurationProvider $configurationProvider */
        $configurationProvider = GeneralUtility::makeInstance(ExtensionConfigurationProvider::class);
        $settings = $configurationProvider->getExtensionConfiguration();
        $this->extensionSettings = is_array($settings) ? $settings : [];

        // Set defaults. The "?? null" reads keep a missing key from raising an
        // undefined-index notice; a missing key is treated like an empty value.
        if (MathUtility::convertToPositiveInteger($this->extensionSettings['countInARun'] ?? null) === 0) {
            $this->extensionSettings['countInARun'] = 100;
        }

        $this->extensionSettings['processLimit'] = MathUtility::forceIntegerInRange(
            $this->extensionSettings['processLimit'] ?? null,
            1,
            99,
            1
        );
        $this->setMaximumUrlsToCompile(
            MathUtility::forceIntegerInRange(
                $this->extensionSettings['maxCompileUrls'] ?? null,
                1,
                1000000000,
                10000
            )
        );
    }
250
251 23
    /**
     * Overrides the stored maximum number of URLs to compile.
     */
    public function setMaximumUrlsToCompile(int $maximumUrlsToCompile): void
    {
        // Plain assignment: the value is stored exactly as given.
        $this->maximumUrlsToCompile = $maximumUrlsToCompile;
    }
255
256
    /**
257
     * Sets the extensions settings (unserialized pendant of $TYPO3_CONF_VARS['EXT']['extConf']['crawler']).
258
     */
259 8
    public function setExtensionSettings(array $extensionSettings): void
    {
        // Replaces the settings wholesale; the defaults applied in the
        // constructor are not re-applied to the new array.
        $this->extensionSettings = $extensionSettings;
    }
263
264
    /**
265
     * Check if the given page should be crawled
266
     *
267
     * @return false|string false if the page should be crawled (not excluded), otherwise a message explaining why it is skipped
268
     */
269 11
    public function checkIfPageShouldBeSkipped(array $pageRow)
    {
        // Hidden pages are skipped unless the "crawlHiddenPages" setting is enabled.
        // empty() keeps a missing setting or column from raising a notice.
        if (empty($this->extensionSettings['crawlHiddenPages']) && ! empty($pageRow['hidden'])) {
            return 'Because page is hidden';
        }

        $doktype = $pageRow['doktype'] ?? '';

        // Doktypes that are never crawled.
        if (GeneralUtility::inList('3,4,199,254,255', $doktype)) {
            return 'Because doktype is not allowed';
        }

        // Additional doktype exclusions registered by other extensions.
        foreach ($GLOBALS['TYPO3_CONF_VARS']['EXTCONF']['crawler']['excludeDoktype'] ?? [] as $key => $doktypeList) {
            if (GeneralUtility::inList($doktypeList, $doktype)) {
                return 'Doktype was excluded by "' . $key . '"';
            }
        }

        // veto hook
        foreach ($GLOBALS['TYPO3_CONF_VARS']['EXTCONF']['crawler']['pageVeto'] ?? [] as $key => $func) {
            $params = [
                'pageRow' => $pageRow,
            ];
            // expects "false" if page is ok and "true" or a skipMessage if this page should _not_ be crawled
            $veto = GeneralUtility::callUserFunction($func, $params, $this);
            if ($veto === false) {
                continue;
            }
            if (is_string($veto)) {
                return $veto;
            }
            // Hooks registered with "[] =" have integer keys; under strict_types
            // htmlspecialchars() would throw a TypeError without the cast.
            return 'Veto from hook "' . htmlspecialchars((string) $key) . '"';
        }

        return false;
    }
303
304
    /**
305
     * Wrapper method for getUrlsForPageId()
306
     * It returns an array of configurations and no urls!
307
     *
308
     * @param array $pageRow Page record with at least dok-type and uid columns.
309
     * @param string $skipMessage
310
     * @return array
311
     * @see getUrlsForPageId()
312
     */
313 5
    public function getUrlsForPageRow(array $pageRow, &$skipMessage = '')
    {
        $pageUid = $pageRow['uid'];

        // Only real integers are accepted as page uid.
        if (! is_int($pageUid)) {
            $skipMessage = 'PageUid ' . $pageUid . ' was not an integer';
            return [];
        }

        $skipReason = $this->checkIfPageShouldBeSkipped($pageRow);
        if ($skipReason !== false) {
            // The page is excluded: report the reason and return no configurations.
            $skipMessage = $skipReason;
            return [];
        }

        $configurations = $this->getUrlsForPageId($pageUid);
        $skipMessage = '';

        return $configurations;
    }
331
332
    /**
333
     * Creates a list of URLs from input array (and submits them to queue if asked for)
334
     * See Web > Info module script + "indexed_search"'s crawler hook-client using this!
335
     *
336
     * @param array $vv Information about URLs from pageRow to crawl.
337
     * @param array $pageRow Page row
338
     * @param int $scheduledTime Unix time to schedule indexing to, typically time()
339
     * @param int $reqMinute Number of requests per minute (creates the interleave between requests)
340
     * @param bool $submitCrawlUrls If set, submits the URLs to queue
341
     * @param bool $downloadCrawlUrls If set (and $submitCrawlUrls is false), $downloadUrls will be filled with entries
342
     * @param array $duplicateTrack Array which is passed by reference and contains a key per URL to ensure we will not crawl duplicates
343
     * @param array $downloadUrls Array which will be filled with URLS for download if flag is set.
344
     * @param array $incomingProcInstructions Array of processing instructions
345
     * @return string List of URLs (meant for display in backend module)
346
     */
347 3
    public function urlListFromUrlArray(
        array $vv,
        array $pageRow,
        $scheduledTime,
        $reqMinute,
        $submitCrawlUrls,
        $downloadCrawlUrls,
        array &$duplicateTrack,
        array &$downloadUrls,
        array $incomingProcInstructions
    ) {
        if (! is_array($vv['URLs'] ?? null)) {
            return 'ERROR - no URL generated';
        }

        $subCfg = (array) ($vv['subCfg'] ?? []);
        $procInstrFilter = $subCfg['procInstrFilter'] ?? '';
        $userGroups = $subCfg['userGroups'] ?? '';

        $urlLog = [];
        $pageId = (int) $pageRow['uid'];
        $configurationHash = $this->getConfigurationHash($vv);
        $skipInnerCheck = $this->queueRepository->noUnprocessedQueueEntriesForPageWithConfigurationHashExist($pageId, $configurationHash);

        $urlService = new UrlService();

        // The filter depends only on the configuration and the incoming
        // instructions, so it is evaluated once instead of once per URL.
        // A filtered-out configuration yields an empty list.
        if (! $this->drawURLs_PIfilter($procInstrFilter, $incomingProcInstructions)) {
            return '';
        }

        // Seconds between two scheduled requests. A non-positive rate would
        // otherwise cause a division by zero, so no interleave is applied then.
        $secondsPerRequest = $reqMinute > 0 ? 60 / $reqMinute : 0;

        foreach ($vv['URLs'] as $urlQuery) {
            $url = (string) $urlService->getUrlFromPageAndQueryParameters(
                $pageId,
                $urlQuery,
                $subCfg['baseUrl'] ?? null,
                $subCfg['force_ssl'] ?? 0
            );

            // Create key by which to determine unique-ness:
            $uKey = $url . '|' . $userGroups . '|' . $procInstrFilter;

            if (isset($duplicateTrack[$uKey])) {
                // If the url key is already registered, just display it and do not resubmit it
                $urlLog[] = '<em><span class="text-muted">' . htmlspecialchars($url) . '</span></em>';
            } else {
                // Scheduled time, rounded down to the full minute (used for display only):
                $schTime = $scheduledTime + round(count($duplicateTrack) * $secondsPerRequest);
                $schTime = intval($schTime / 60) * 60;
                $formattedDate = BackendUtility::datetime($schTime);
                $this->urlList[] = '[' . $formattedDate . '] ' . $url;
                $logEntry = '[' . $formattedDate . '] ' . htmlspecialchars($url);

                // Submit for crawling!
                if ($submitCrawlUrls) {
                    $added = $this->addUrl(
                        $pageId,
                        $url,
                        $subCfg,
                        $scheduledTime,
                        $configurationHash,
                        $skipInnerCheck
                    );
                    if ($added === false) {
                        $logEntry .= ' (URL already existed)';
                    }
                } elseif ($downloadCrawlUrls) {
                    $downloadUrls[$url] = $url;
                }
                $urlLog[] = $logEntry;
            }
            $duplicateTrack[$uKey] = true;
        }

        return implode('<br>', $urlLog);
    }
416
417
    /**
418
     * Returns true if input processing instruction is among registered ones.
419
     *
420
     * @param string $piString PI to test
421
     * @param array $incomingProcInstructions Processing instructions
422
     * @return boolean
423
     */
424 8
    public function drawURLs_PIfilter($piString, array $incomingProcInstructions)
    {
        // Without incoming instructions nothing is filtered out.
        if ($incomingProcInstructions === []) {
            return true;
        }

        // Accept as soon as one incoming instruction appears in the comma list.
        foreach ($incomingProcInstructions as $instruction) {
            if (GeneralUtility::inList($piString, $instruction)) {
                return true;
            }
        }

        return false;
    }
437
438 5
    /**
     * Returns the page TSconfig for the given page id, or for the mount point
     * page when $this->MP is set, after passing it through the registered
     * "getPageTSconfigForId" hooks.
     *
     * @param int $id Page id
     */
    public function getPageTSconfigForId($id): array
    {
        if (! $this->MP) {
            $pageTSconfig = BackendUtility::getPagesTSconfig($id);
        } else {
            // The page id is taken from the part after the first "-" of the MP
            // value. The casts make the string and integer types explicit.
            $mountPointParts = explode('-', (string) $this->MP);
            $mountPointId = (int) ($mountPointParts[1] ?? 0);
            $pageTSconfig = BackendUtility::getPagesTSconfig($mountPointId);
        }

        // Call a hook to alter configuration. The "??" guard avoids a notice when
        // no hook is registered, matching the other hook lookups in this class.
        $hooks = $GLOBALS['TYPO3_CONF_VARS']['EXTCONF']['crawler']['getPageTSconfigForId'] ?? [];
        if (is_array($hooks)) {
            $params = [
                'pageId' => $id,
                // Passed by reference so the hooks can modify the configuration.
                'pageTSConfig' => &$pageTSconfig,
            ];
            foreach ($hooks as $userFunc) {
                GeneralUtility::callUserFunction($userFunc, $params, $this);
            }
        }

        return $pageTSconfig;
    }
460
461
    /**
462
     * This method returns an array of configurations.
463
     * Adds no urls!
464
     */
465 3
    public function getUrlsForPageId(int $pageId): array
    {
        // Get page TSconfig for page ID
        $pageTSconfig = $this->getPageTSconfigForId($pageId);

        $res = [];

        // Fetch Crawler Configuration from pageTSconfig
        $crawlerCfg = $pageTSconfig['tx_crawler.']['crawlerCfg.']['paramSets.'] ?? [];
        foreach ($crawlerCfg as $key => $values) {
            if (! is_array($values)) {
                continue;
            }
            $key = str_replace('.', '', $key);

            // Sub configuration for a single configuration string:
            $subCfg = (array) ($crawlerCfg[$key . '.'] ?? []);
            $subCfg['key'] = $key;

            if ((string) ($subCfg['procInstrFilter'] ?? '') !== '') {
                $subCfg['procInstrFilter'] = implode(',', GeneralUtility::trimExplode(',', $subCfg['procInstrFilter']));
            }

            // "pidsOnly" is optional; a missing value means "not page-specific".
            $pidsOnly = (string) ($subCfg['pidsOnly'] ?? '');
            $pidOnlyList = implode(',', GeneralUtility::trimExplode(',', $pidsOnly, true));

            // process configuration if it is not page-specific or if the specific page is the current page:
            // TODO: Check if $pidOnlyList can be kept as Array instead of imploded
            if ($pidsOnly !== '' && ! GeneralUtility::inList($pidOnlyList, (string) $pageId)) {
                continue;
            }

            // Explode, process etc.:
            $paramParsed = GeneralUtility::explodeUrl2Array($crawlerCfg[$key] ?? '');
            $paramExpanded = $this->expandParameters($paramParsed, $pageId);

            // recognize MP value
            $baseQuery = '?id=' . $pageId;
            if ($this->MP) {
                $baseQuery .= '&MP=' . $this->MP;
            }

            // Key order is kept as before, in case consumers hash or serialise the entry.
            $res[$key] = [
                'subCfg' => $subCfg,
                'paramParsed' => $paramParsed,
                'paramExpanded' => $paramExpanded,
                'origin' => 'pagets',
                'URLs' => $this->compileUrls($paramExpanded, [$baseQuery]),
            ];
        }

        // Get configuration from tx_crawler_configuration records up the rootline
        $crawlerConfigurations = $this->configurationRepository->getCrawlerConfigurationRecordsFromRootLine($pageId);
        foreach ($crawlerConfigurations as $configurationRecord) {
            // check access to the configuration record
            $hasAccess = empty($configurationRecord['begroups'])
                || $this->getBackendUser()->isAdmin()
                || UserService::hasGroupAccess(
                    $this->getBackendUser()->user['usergroup_cached_list'],
                    $configurationRecord['begroups']
                );
            if (! $hasAccess) {
                continue;
            }

            // The cast keeps a null column value from raising a TypeError under strict_types.
            $pidsOnly = (string) ($configurationRecord['pidsonly'] ?? '');
            $pidOnlyList = implode(',', GeneralUtility::trimExplode(',', $pidsOnly, true));

            // process configuration if it is not page-specific or if the specific page is the current page:
            // TODO: Check if $pidOnlyList can be kept as Array instead of imploded
            if ($pidsOnly !== '' && ! GeneralUtility::inList($pidOnlyList, (string) $pageId)) {
                continue;
            }

            $key = $configurationRecord['name'];

            // don't overwrite previously defined paramSets
            if (isset($res[$key])) {
                continue;
            }

            /* @var $TSparserObject TypoScriptParser */
            $TSparserObject = GeneralUtility::makeInstance(TypoScriptParser::class);
            $TSparserObject->parse($configurationRecord['processing_instruction_parameters_ts']);

            $subCfg = [
                'procInstrFilter' => $configurationRecord['processing_instruction_filter'],
                'procInstrParams.' => $TSparserObject->setup,
                'baseUrl' => $configurationRecord['base_url'],
                'force_ssl' => (int) $configurationRecord['force_ssl'],
                'userGroups' => $configurationRecord['fegroups'],
                'exclude' => $configurationRecord['exclude'],
                'key' => $key,
            ];

            // Skip the record when the current page is on its exclude list.
            if (in_array($pageId, $this->expandExcludeString($subCfg['exclude']), true)) {
                continue;
            }

            $paramParsed = GeneralUtility::explodeUrl2Array($configurationRecord['configuration']);
            $paramExpanded = $this->expandParameters($paramParsed, $pageId);

            $res[$key] = [
                'subCfg' => $subCfg,
                'paramParsed' => $paramParsed,
                'paramExpanded' => $paramExpanded,
                'URLs' => $this->compileUrls($paramExpanded, ['?id=' . $pageId]),
                'origin' => 'tx_crawler_configuration_' . $configurationRecord['uid'],
            ];
        }

        // Let registered hooks post-process the result (passed by reference).
        foreach ($GLOBALS['TYPO3_CONF_VARS']['EXTCONF']['crawler']['processUrls'] ?? [] as $func) {
            $params = [
                'res' => &$res,
            ];
            GeneralUtility::callUserFunction($func, $params, $this);
        }

        return $res;
    }
559
560
    /**
561
     * Find all configurations of subpages of a page
562
     * TODO: Write Functional Tests
563
     */
564 2
    public function getConfigurationsForBranch(int $rootid, int $depth): array
    {
        $configurationNames = [];

        // Names of the paramSets defined in page TSconfig (one trailing dot stripped).
        $tsConfig = $this->getPageTSconfigForId($rootid);
        foreach ($tsConfig['tx_crawler.']['crawlerCfg.']['paramSets.'] ?? [] as $setKey => $setValue) {
            if (is_array($setValue)) {
                $configurationNames[] = substr($setKey, -1) === '.' ? substr($setKey, 0, -1) : $setKey;
            }
        }

        // Collect the page ids of the rootline ...
        $pageIds = [];
        foreach (BackendUtility::BEgetRootLine($rootid) as $rootLinePage) {
            $pageIds[] = $rootLinePage['uid'];
        }

        // ... and of the page tree below the root page, limited by the user's show permission.
        /* @var PageTreeView $pageTree */
        $pageTree = GeneralUtility::makeInstance(PageTreeView::class);
        $permissionClause = $this->getBackendUser()->getPagePermsClause(Permission::PAGE_SHOW);
        if (empty($permissionClause)) {
            $pageTree->init('');
        } else {
            $pageTree->init('AND ' . $permissionClause);
        }
        $pageTree->getTree($rootid, $depth, '');
        foreach ($pageTree->tree as $treeEntry) {
            $pageIds[] = $treeEntry['row']['uid'];
        }

        // Add the names of all configuration records stored on those pages.
        $queryBuilder = $this->getQueryBuilder('tx_crawler_configuration');
        $pidConstraint = $queryBuilder->expr()->in(
            'pid',
            $queryBuilder->createNamedParameter($pageIds, Connection::PARAM_INT_ARRAY)
        );
        $result = $queryBuilder
            ->select('name')
            ->from('tx_crawler_configuration')
            ->where($pidConstraint)
            ->execute();

        while ($record = $result->fetch()) {
            $configurationNames[] = $record['name'];
        }

        return $configurationNames;
    }
603
604
    /**
605
     * Will expand the parameters configuration to individual values. This follows a certain syntax of the value of each parameter.
606
     * Syntax of values:
607
     * - Basically: If the value is wrapped in [...] it will be expanded according to the following syntax, otherwise the value is taken literally
608
     * - Configuration is split by "|" and the parts are processed individually and finally added together
609
     * - For each configuration part:
610
     *         - "[int]-[int]" = Integer range, will be expanded to all values in between, values included, starting from low to high (max. 1000). Example "1-34" or "-40--30"
611
     *         - "_TABLE:[TCA table name];[_PID:[optional page id, default is current page]];[_ENABLELANG:1]" = Look up of table records from PID, filtering out deleted records. Example "_TABLE:tt_content; _PID:123"
612
     *        _ENABLELANG:1 picks only original records without their language overlays
613
     *         - Default: Literal value
614
     *
615
     * @param array $paramArray Array with key (GET var name) and values (value of GET var which is configuration for expansion)
616
     * @param integer $pid Current page ID
617
     * @return array
618
     *
619
     * TODO: Write Functional Tests
620
     */
621 10
    public function expandParameters($paramArray, $pid)
622
    {
623
        // Traverse parameter names:
624 10
        foreach ($paramArray as $p => $v) {
625 10
            $v = trim($v);
626
627
            // If value is encapsulated in square brackets it means there are some ranges of values to find, otherwise the value is literal
628 10
            if (strpos($v, '[') === 0 && substr($v, -1) === ']') {
629
                // So, find the value inside brackets and reset the paramArray value as an array.
630 10
                $v = substr($v, 1, -1);
631 10
                $paramArray[$p] = [];
632
633
                // Explode parts and traverse them:
634 10
                $parts = explode('|', $v);
635 10
                foreach ($parts as $pV) {
636
637
                    // Look for integer range: (fx. 1-34 or -40--30 // reads minus 40 to minus 30)
638 10
                    if (preg_match('/^(-?[0-9]+)\s*-\s*(-?[0-9]+)$/', trim($pV), $reg)) {
639 1
                        $reg = $this->swapIfFirstIsLargerThanSecond($reg);
640
641
                        // Traverse range, add values:
642
                        // Limit to size of range!
643 1
                        $runAwayBrake = 1000;
644 1
                        for ($a = $reg[1]; $a <= $reg[2]; $a++) {
645 1
                            $paramArray[$p][] = $a;
646 1
                            $runAwayBrake--;
647 1
                            if ($runAwayBrake <= 0) {
648
                                break;
649
                            }
650
                        }
651 9
                    } elseif (strpos(trim($pV), '_TABLE:') === 0) {
652
653
                        // Parse parameters:
654 6
                        $subparts = GeneralUtility::trimExplode(';', $pV);
655 6
                        $subpartParams = [];
656 6
                        foreach ($subparts as $spV) {
657 6
                            [$pKey, $pVal] = GeneralUtility::trimExplode(':', $spV);
658 6
                            $subpartParams[$pKey] = $pVal;
659
                        }
660
661
                        // Table exists:
662 6
                        if (isset($GLOBALS['TCA'][$subpartParams['_TABLE']])) {
663 6
                            $lookUpPid = isset($subpartParams['_PID']) ? intval($subpartParams['_PID']) : intval($pid);
664 6
                            $recursiveDepth = isset($subpartParams['_RECURSIVE']) ? intval($subpartParams['_RECURSIVE']) : 0;
665 6
                            $pidField = isset($subpartParams['_PIDFIELD']) ? trim($subpartParams['_PIDFIELD']) : 'pid';
666 6
                            $where = $subpartParams['_WHERE'] ?? '';
667 6
                            $addTable = $subpartParams['_ADDTABLE'] ?? '';
668
669 6
                            $fieldName = $subpartParams['_FIELD'] ? $subpartParams['_FIELD'] : 'uid';
670 6
                            if ($fieldName === 'uid' || $GLOBALS['TCA'][$subpartParams['_TABLE']]['columns'][$fieldName]) {
671 6
                                $queryBuilder = $this->getQueryBuilder($subpartParams['_TABLE']);
672
673 6
                                if ($recursiveDepth > 0) {
674
                                    /** @var QueryGenerator $queryGenerator */
675 2
                                    $queryGenerator = GeneralUtility::makeInstance(QueryGenerator::class);
676 2
                                    $pidList = $queryGenerator->getTreeList($lookUpPid, $recursiveDepth, 0, 1);
677 2
                                    $pidArray = GeneralUtility::intExplode(',', $pidList);
678
                                } else {
679 4
                                    $pidArray = [(string) $lookUpPid];
680
                                }
681
682 6
                                $queryBuilder->getRestrictions()
683 6
                                    ->removeAll()
684 6
                                    ->add(GeneralUtility::makeInstance(DeletedRestriction::class));
685
686
                                $queryBuilder
687 6
                                    ->select($fieldName)
688 6
                                    ->from($subpartParams['_TABLE'])
689 6
                                    ->where(
690 6
                                        $queryBuilder->expr()->in($pidField, $queryBuilder->createNamedParameter($pidArray, Connection::PARAM_INT_ARRAY)),
691
                                        $where
692
                                    );
693
694 6
                                if (! empty($addTable)) {
695
                                    // TODO: Check if this works as intended!
696
                                    $queryBuilder->add('from', $addTable);
697
                                }
698 6
                                $transOrigPointerField = $GLOBALS['TCA'][$subpartParams['_TABLE']]['ctrl']['transOrigPointerField'];
699
700 6
                                if ($subpartParams['_ENABLELANG'] && $transOrigPointerField) {
701
                                    $queryBuilder->andWhere(
702
                                        $queryBuilder->expr()->lte(
703
                                            $transOrigPointerField,
704
                                            0
705
                                        )
706
                                    );
707
                                }
708
709 6
                                $statement = $queryBuilder->execute();
710
711 6
                                $rows = [];
712 6
                                while ($row = $statement->fetch()) {
713 6
                                    $rows[$row[$fieldName]] = $row;
714
                                }
715
716 6
                                if (is_array($rows)) {
717 6
                                    $paramArray[$p] = array_merge($paramArray[$p], array_keys($rows));
718
                                }
719
                            }
720
                        }
721
                    } else {
722
                        // Just add value:
723 3
                        $paramArray[$p][] = $pV;
724
                    }
725
                    // Hook for processing own expandParameters place holder
726 10
                    if (is_array($GLOBALS['TYPO3_CONF_VARS']['SC_OPTIONS']['crawler/class.tx_crawler_lib.php']['expandParameters'])) {
727
                        $_params = [
728
                            'pObj' => &$this,
729
                            'paramArray' => &$paramArray,
730
                            'currentKey' => $p,
731
                            'currentValue' => $pV,
732
                            'pid' => $pid,
733
                        ];
734
                        foreach ($GLOBALS['TYPO3_CONF_VARS']['SC_OPTIONS']['crawler/class.tx_crawler_lib.php']['expandParameters'] as $_funcRef) {
735
                            GeneralUtility::callUserFunction($_funcRef, $_params, $this);
736
                        }
737
                    }
738
                }
739
740
                // Make unique set of values and sort array by key:
741 10
                $paramArray[$p] = array_unique($paramArray[$p]);
742 10
                ksort($paramArray);
743
            } else {
744
                // Set the literal value as only value in array:
745 3
                $paramArray[$p] = [$v];
746
            }
747
        }
748
749 10
        return $paramArray;
750
    }
751
752
    /**
     * Compiles URLs from a parameter array (the output of expandParameters()).
     *
     * Every value of a parameter is appended to every URL accumulated so far, so the result
     * is the combination of all value sets. Each recursion level produces at most
     * getMaximumUrlsToCompile() URLs. A value that is an empty string adds nothing to the
     * URL, i.e. the URL is kept without that parameter.
     *
     * @param array $paramArray Output of expandParameters(): Array with keys (GET var names) and for each an array of values
     * @param array $urls URLs accumulated in this array (for recursion)
     * @return array
     */
    public function compileUrls($paramArray, array $urls)
    {
        if (empty($paramArray)) {
            return $urls;
        }

        // Take the first parameter off the array. unset() is used instead of array_shift()
        // because array_shift() renumbers integer keys, which would replace numeric GET var
        // names by 0, 1, 2, ... on the following recursion levels.
        reset($paramArray);
        $varName = key($paramArray);
        $valueSet = $paramArray[$varName];
        unset($paramArray[$varName]);

        // Both values are constant for this recursion level, so compute them only once.
        $maximumUrls = $this->getMaximumUrlsToCompile();
        $encodedName = rawurlencode((string) $varName);

        $newUrls = [];
        foreach ($urls as $url) {
            foreach ($valueSet as $val) {
                // Nothing more can be added once the limit is reached: leave both loops.
                if (count($newUrls) >= $maximumUrls) {
                    break 2;
                }

                $value = (string) $val;
                $newUrls[] = $url . ($value !== '' ? '&' . $encodedName . '=' . rawurlencode($value) : '');
            }
        }

        return $this->compileUrls($paramArray, $newUrls);
    }
779
780
    /************************************
781
     *
782
     * Crawler log
783
     *
784
     ************************************/
785
786
    /**
     * Adds a call back entry to the crawler queue table.
     * Typically called from hooks, see the indexed search class "class.crawler.php".
     *
     * @param integer $setId Set ID
     * @param array $params Parameters to pass to call back function
     * @param string $callBack Call back object reference, eg. 'EXT:indexed_search/class.crawler.php:&tx_indexedsearch_crawler'
     * @param integer $page_id Page ID to attach it to
     * @param integer $schedule Time at which to activate; the current time is used when empty
     */
    public function addQueueEntry_callBack($setId, $params, $callBack, $page_id = 0, $schedule = 0): void
    {
        // Anything that is not an array is replaced by an empty parameter set.
        $callBackParameters = is_array($params) ? $params : [];
        $callBackParameters['_CALLBACKOBJ'] = $callBack;

        $scheduledTime = (int) $schedule;
        if (! $scheduledTime) {
            $scheduledTime = $this->getCurrentTime();
        }

        $queueRecord = [
            'page_id' => (int) $page_id,
            'parameters' => json_encode($callBackParameters),
            'scheduled' => $scheduledTime,
            'exec_time' => 0,
            'set_id' => (int) $setId,
            'result_data' => '',
        ];

        GeneralUtility::makeInstance(ConnectionPool::class)
            ->getConnectionForTable('tx_crawler_queue')
            ->insert('tx_crawler_queue', $queueRecord);
    }
815
816
    /************************************
817
     *
818
     * URL setting
819
     *
820
     ************************************/
821
822
    /**
     * Sets a URL for crawling by building a queue entry for it.
     *
     * When $this->registerQueueEntriesInternallyOnly is set, the entry is only collected in
     * $this->queueEntries and nothing is written to the database. Otherwise the entry is
     * inserted into tx_crawler_queue, unless the duplication check finds an equal entry.
     *
     * @param integer $id Page ID
     * @param string $url Complete URL
     * @param array $subCfg Sub configuration array (from TS config)
     * @param integer $tstamp Scheduled-time
     * @param string $configurationHash (optional) configuration hash
     * @param bool $skipInnerDuplicationCheck (optional) skip inner duplication check
     * @return bool TRUE if a new row was inserted into the queue table, FALSE otherwise
     */
    public function addUrl(
        $id,
        $url,
        array $subCfg,
        $tstamp,
        $configurationHash = '',
        $skipInnerDuplicationCheck = false
    ) {
        // Creating parameters:
        $parameters = [
            'url' => $url,
        ];

        // fe user group simulation (a missing option is treated like an empty list):
        $uGs = implode(',', array_unique(GeneralUtility::intExplode(',', (string) ($subCfg['userGroups'] ?? ''), true)));
        if ($uGs) {
            $parameters['feUserGroupList'] = $uGs;
        }

        // Setting processing instructions
        $parameters['procInstructions'] = GeneralUtility::trimExplode(',', (string) ($subCfg['procInstrFilter'] ?? ''));
        if (is_array($subCfg['procInstrParams.'] ?? null)) {
            $parameters['procInstrParams'] = $subCfg['procInstrParams.'];
        }

        // Compile value array:
        $parameters_serialized = json_encode($parameters);
        $fieldArray = [
            'page_id' => (int) $id,
            'parameters' => $parameters_serialized,
            'parameters_hash' => GeneralUtility::shortMD5($parameters_serialized),
            'configuration_hash' => $configurationHash,
            'scheduled' => $tstamp,
            'exec_time' => 0,
            'set_id' => (int) $this->setID,
            'result_data' => '',
            'configuration' => $subCfg['key'] ?? null,
        ];

        if ($this->registerQueueEntriesInternallyOnly) {
            // the entries will only be registered and not stored to the database
            $this->queueEntries[] = $fieldArray;

            return false;
        }

        $duplicates = [];
        if (! $skipInnerDuplicationCheck) {
            // check if there is already an equal entry
            $duplicates = $this->queueRepository->getDuplicateQueueItemsIfExists(
                (bool) $this->extensionSettings['enableTimeslot'],
                $tstamp,
                $this->getCurrentTime(),
                $fieldArray['page_id'],
                $fieldArray['parameters_hash']
            );
        }

        if (! empty($duplicates)) {
            return false;
        }

        GeneralUtility::makeInstance(ConnectionPool::class)
            ->getConnectionForTable('tx_crawler_queue')
            ->insert('tx_crawler_queue', $fieldArray);

        return true;
    }
904
905
    /**
     * Provides the current system time as reported by time().
     *
     * @return int
     */
    public function getCurrentTime()
    {
        $now = time();

        return $now;
    }
914
915
    /************************************
916
     *
917
     * URL reading
918
     *
919
     ************************************/
920
921
    /**
     * Reads the URL of a single queue entry: executes the queue item and stores its result.
     *
     * The entry is locked by writing exec_time before the execution. Writing result_data
     * afterwards also denotes the end of the processing of this entry.
     *
     * @param integer $queueId
     * @param boolean $force If set, will process even if exec_time has been set!
     * @return integer Bit mask: CLI_STATUS_POLLABLE_PROCESSED is set when a registered pollSuccess
     *                 instruction reported success; 0 otherwise, also when no matching queue entry was found
     */
    public function readUrl($queueId, $force = false)
    {
        $queryBuilder = GeneralUtility::makeInstance(ConnectionPool::class)->getQueryBuilderForTable($this->tableName);
        $ret = 0;
        $this->logger->debug('crawler-readurl start ' . microtime(true));

        // Get entry:
        $queryBuilder
            ->select('*')
            ->from('tx_crawler_queue')
            ->where(
                $queryBuilder->expr()->eq('qid', $queryBuilder->createNamedParameter($queueId, PDO::PARAM_INT))
            );
        if (! $force) {
            $queryBuilder
                ->andWhere('exec_time = 0')
                ->andWhere('process_scheduled > 0');
        }
        $queueRec = $queryBuilder->execute()->fetch();

        if (! is_array($queueRec)) {
            // Always hand back an integer: callers combine the return value with bitwise OR.
            return $ret;
        }

        // Set exec_time to lock record:
        $field_array = ['exec_time' => $this->getCurrentTime()];

        if (isset($this->processID)) {
            // if multiprocessing is used we need to store the id of the process which has handled this entry
            $field_array['process_id_completed'] = $this->processID;
        }

        GeneralUtility::makeInstance(ConnectionPool::class)->getConnectionForTable('tx_crawler_queue')
            ->update(
                'tx_crawler_queue',
                $field_array,
                ['qid' => (int) $queueId]
            );

        $result = $this->queueExecutor->executeQueueItem($queueRec, $this);
        if (($result['content'] ?? null) === null) {
            $resultData = 'An errors happened';
        } else {
            /** @var JsonCompatibilityConverter $jsonCompatibilityConverter */
            $jsonCompatibilityConverter = GeneralUtility::makeInstance(JsonCompatibilityConverter::class);
            $resultData = $jsonCompatibilityConverter->convert($result['content']);
        }

        // atm there's no need to point to specific pollable extensions
        $pollables = $GLOBALS['TYPO3_CONF_VARS']['EXTCONF']['crawler']['pollSuccess'] ?? null;

        // $resultData is only inspected when it is an array; the string error marker above
        // must not be accessed with array offsets.
        if (is_array($pollables) && is_array($resultData)) {
            $procInstructions = $resultData['parameters']['procInstructions'] ?? null;

            if (is_array($procInstructions)) {
                foreach ($pollables as $pollable) {
                    // only check the success value if the instruction is running
                    // it is important to name the pollSuccess key same as the procInstructions key
                    if (in_array($pollable, $procInstructions, true) && ! empty($resultData['success'][$pollable])) {
                        $ret |= self::CLI_STATUS_POLLABLE_PROCESSED;
                    }
                }
            }
        }

        // Set result in log which also denotes the end of the processing of this entry.
        $field_array = ['result_data' => json_encode($result)];

        GeneralUtility::makeInstance(ConnectionPool::class)->getConnectionForTable('tx_crawler_queue')
            ->update(
                'tx_crawler_queue',
                $field_array,
                ['qid' => (int) $queueId]
            );

        $this->logger->debug('crawler-readurl stop ' . microtime(true));

        return $ret;
    }
1006
1007
    /**
     * Reads the URL for a log entry that has not been inserted yet.
     *
     * The entry is inserted with exec_time set, executed, and the JSON encoded result is
     * written to its result_data field afterwards.
     *
     * @param array $field_array Queue field array,
     *
     * @return array|bool|mixed|string The result of the queue item execution
     */
    public function readUrlFromArray($field_array)
    {
        // Set exec_time to lock record:
        $field_array['exec_time'] = $this->getCurrentTime();

        $queueConnection = GeneralUtility::makeInstance(ConnectionPool::class)->getConnectionForTable($this->tableName);
        $queueConnection->insert($this->tableName, $field_array);
        $insertedQueueId = $queueConnection->lastInsertId($this->tableName, 'qid');

        $executionResult = $this->queueExecutor->executeQueueItem($field_array, $this);

        // Set result in log which also denotes the end of the processing of this entry.
        $queueConnection->update(
            $this->tableName,
            ['result_data' => json_encode($executionResult)],
            ['qid' => $insertedQueueId]
        );

        return $executionResult;
    }
1037
1038
    /*****************************
1039
     *
1040
     * Compiling URLs to crawl - tools
1041
     *
1042
     *****************************/
1043
1044
    /**
     * Walks the page tree below a root page and collects the output of
     * drawURLs_addRowsForPage() for every page found.
     *
     * The incoming settings are stored on the controller first, because
     * drawURLs_addRowsForPage() reads them from there.
     *
     * @param integer $id Root page id to start from.
     * @param integer $depth Depth of tree, 0=only id-page, 1= on sublevel, 99 = infinite
     * @param integer $scheduledTime Unix Time when the URL is timed to be visited when put in queue
     * @param integer $reqMinute Number of requests per minute (creates the interleave between requests)
     * @param boolean $submitCrawlUrls If set, submits the URLs to queue in database (real crawling)
     * @param boolean $downloadCrawlUrls If set (and submitcrawlUrls is false) will fill $downloadUrls with entries)
     * @param array $incomingProcInstructions Array of processing instructions
     * @param array $configurationSelection Array of configuration keys
     * @return string
     */
    public function getPageTreeAndUrls(
        $id,
        $depth,
        $scheduledTime,
        $reqMinute,
        $submitCrawlUrls,
        $downloadCrawlUrls,
        array $incomingProcInstructions,
        array $configurationSelection
    ) {
        // Remember the request settings and start with empty tracking arrays.
        $this->scheduledTime = $scheduledTime;
        $this->reqMinute = $reqMinute;
        $this->submitCrawlUrls = $submitCrawlUrls;
        $this->downloadCrawlUrls = $downloadCrawlUrls;
        $this->incomingProcInstructions = $incomingProcInstructions;
        $this->incomingConfigurationSelection = $configurationSelection;
        $this->duplicateTrack = [];
        $this->downloadUrls = [];

        // Drawing tree:
        /* @var PageTreeView $pageTree */
        $pageTree = GeneralUtility::makeInstance(PageTreeView::class);
        $permsClause = $this->getBackendUser()->getPagePermsClause(Permission::PAGE_SHOW);
        $pageTree->init('AND ' . $permsClause);

        // Set root row:
        $rootPageInfo = BackendUtility::readPageAccess($id, $permsClause);
        if (is_array($rootPageInfo)) {
            $pageTree->tree[] = [
                'row' => $rootPageInfo,
                'HTML' => $this->iconFactory->getIconForRecord('pages', $rootPageInfo, Icon::SIZE_SMALL),
            ];
        }

        // Get branch beneath:
        if ($depth) {
            $pageTree->getTree($id, $depth, '');
        }

        // Traverse page tree:
        $output = '';
        foreach ($pageTree->tree as $treeItem) {
            $pageRow = $treeItem['row'];
            $this->MP = false;

            // recognize mount points
            // NOTE(review): the comparison is strict, so it only matches when doktype is an
            // integer in the row - confirm the type delivered by the tree rows.
            if ($pageRow['doktype'] === PageRepository::DOKTYPE_MOUNTPOINT) {
                // NOTE(review): the result is accessed as $mountPointPage[0][...] below -
                // confirm that getPage() returns a list of rows and not a single row.
                $mountPointPage = $this->pageRepository->getPage($pageRow['uid']);

                // fetch mounted pages
                $this->MP = $mountPointPage[0]['mount_pid'] . '-' . $pageRow['uid'];

                $mountedTree = GeneralUtility::makeInstance(PageTreeView::class);
                $mountedTree->init('AND ' . $permsClause);
                $mountedTree->getTree($mountPointPage[0]['mount_pid'], $depth);

                foreach ($mountedTree->tree as $mountedItem) {
                    $mountedTitle = $mountedItem['HTML'] . BackendUtility::getRecordTitle('pages', $mountedItem['row'], true);
                    $output .= $this->drawURLs_addRowsForPage($mountedItem['row'], $mountedTitle);
                }

                if ($mountPointPage[0]['mount_pid_ol']) {
                    // replace page when mount_pid_ol is enabled
                    $pageRow['uid'] = $mountPointPage[0]['mount_pid'];
                } else {
                    // if the mount_pid_ol is not set the MP must not be used for the mountpoint page
                    $this->MP = false;
                }
            }

            $pageTitle = $treeItem['HTML'] . BackendUtility::getRecordTitle('pages', $pageRow, true);
            $output .= $this->drawURLs_addRowsForPage($pageRow, $pageTitle);
        }

        return $output;
    }
1136
1137
    /**
     * Expands an exclude string into a list of page ids.
     *
     * The string is a comma separated list of parts of the form "pid" or "pid+depth".
     * A part without "+" excludes only that page; a "+" without a depth (or with depth 0)
     * excludes the page and its subtree down to depth 99. Results are cached per exclude
     * string in static variables, page trees per pid and depth.
     *
     * @param string $excludeString Exclude string
     * @return array List of unique page ids (integers)
     */
    public function expandExcludeString($excludeString)
    {
        // internal static caches;
        static $expandedExcludeStringCache = [];
        static $treeCache = [];

        $excludeString = (string) $excludeString;

        // isset() instead of empty(): an empty result is a valid cache entry, too.
        if (isset($expandedExcludeStringCache[$excludeString])) {
            return $expandedExcludeStringCache[$excludeString];
        }

        $pidList = [];

        if (! empty($excludeString)) {
            /** @var PageTreeView $tree */
            $tree = GeneralUtility::makeInstance(PageTreeView::class);
            $tree->init('AND ' . $this->getBackendUser()->getPagePermsClause(Permission::PAGE_SHOW));

            foreach (GeneralUtility::trimExplode(',', $excludeString) as $excludePart) {
                // A part without "+" has no second element, so do not destructure blindly.
                $segments = GeneralUtility::trimExplode('+', $excludePart);
                $pid = (int) $segments[0];
                $rawDepth = $segments[1] ?? '';

                if (empty($rawDepth)) {
                    // default is "page only" = "depth=0"; a bare "+" means the whole subtree
                    $depth = strpos($excludePart, '+') !== false ? 99 : 0;
                } else {
                    $depth = (int) $rawDepth;
                }

                $pidList[] = $pid;

                if ($depth > 0) {
                    if (! isset($treeCache[$pid][$depth])) {
                        $tree->reset();
                        $tree->getTree($pid, $depth);
                        $treeCache[$pid][$depth] = $tree->tree;
                    }

                    foreach ($treeCache[$pid][$depth] as $data) {
                        $pidList[] = (int) $data['row']['uid'];
                    }
                }
            }
        }

        $expandedExcludeStringCache[$excludeString] = array_unique($pidList);

        return $expandedExcludeStringCache[$excludeString];
    }
1188
1189
    /**
     * Creates the table rows for the display of one page of the page tree.
     *
     * For each configuration of the page one row is rendered, showing the GET variable
     * configuration, the expanded parameters, the URL list and the options. A page that is
     * listed in the "exclude" option of a configuration gets a "No entries" row for that
     * configuration instead; a page without any configuration gets a single "No entries" row.
     *
     * @param array $pageRow Page record
     * @param string $pageTitle Title column content, inserted as is (not escaped here)
     * @return string HTML table rows
     */
    public function drawURLs_addRowsForPage(array $pageRow, string $pageTitle): string
    {
        $skipMessage = '';

        // Get list of configurations
        $configurations = $this->getUrlsForPageRow($pageRow, $skipMessage);
        $configurations = ConfigurationService::removeDisallowedConfigurations($this->incomingConfigurationSelection, $configurations);

        // Traverse parameter combinations:
        $c = 0;
        $content = '';
        if (! empty($configurations)) {
            foreach ($configurations as $confKey => $confArray) {

                // Title column:
                if (! $c) {
                    $titleClm = '<td rowspan="' . count($configurations) . '">' . $pageTitle . '</td>';
                } else {
                    $titleClm = '';
                }

                // expandExcludeString() returns integers and the comparison is strict, so the
                // uid has to be an integer as well; a string uid would never be found.
                $excludedPageIds = $this->expandExcludeString($confArray['subCfg']['exclude'] ?? '');
                if (! in_array((int) $pageRow['uid'], $excludedPageIds, true)) {

                    // URL list:
                    $urlList = $this->urlListFromUrlArray(
                        $confArray,
                        $pageRow,
                        $this->scheduledTime,
                        $this->reqMinute,
                        $this->submitCrawlUrls,
                        $this->downloadCrawlUrls,
                        $this->duplicateTrack,
                        $this->downloadUrls,
                        // if empty the urls won't be filtered by processing instructions
                        $this->incomingProcInstructions
                    );

                    // Expanded parameters:
                    $paramExpanded = '';
                    $calcAccu = [];
                    $calcRes = 1;
                    foreach ($confArray['paramExpanded'] as $gVar => $gVal) {
                        $paramExpanded .= '
                            <tr>
                                <td>' . htmlspecialchars('&' . $gVar . '=') . '<br/>' .
                            '(' . count($gVal) . ')' .
                            '</td>
                                <td nowrap="nowrap">' . nl2br(htmlspecialchars(implode(chr(10), $gVal))) . '</td>
                            </tr>
                        ';
                        $calcRes *= count($gVal);
                        $calcAccu[] = count($gVal);
                    }
                    $paramExpanded = '<table>' . $paramExpanded . '</table>';
                    $paramExpanded .= 'Comb: ' . implode('*', $calcAccu) . '=' . $calcRes;

                    // Options (values come from the configuration and are escaped for HTML output)
                    $optionValues = '';
                    if (! empty($confArray['subCfg']['userGroups'])) {
                        $optionValues .= 'User Groups: ' . htmlspecialchars((string) $confArray['subCfg']['userGroups']) . '<br/>';
                    }
                    if (! empty($confArray['subCfg']['procInstrFilter'])) {
                        $optionValues .= 'ProcInstr: ' . htmlspecialchars((string) $confArray['subCfg']['procInstrFilter']) . '<br/>';
                    }

                    // Compile row:
                    $content .= '
                        <tr>
                            ' . $titleClm . '
                            <td>' . htmlspecialchars((string) $confKey) . '</td>
                            <td>' . nl2br(htmlspecialchars(rawurldecode(trim(str_replace('&', chr(10) . '&', GeneralUtility::implodeArrayForUrl('', $confArray['paramParsed'])))))) . '</td>
                            <td>' . $paramExpanded . '</td>
                            <td nowrap="nowrap">' . $urlList . '</td>
                            <td nowrap="nowrap">' . $optionValues . '</td>
                            <td nowrap="nowrap">' . DebugUtility::viewArray($confArray['subCfg']['procInstrParams.']) . '</td>
                        </tr>';
                } else {
                    $content .= '<tr>
                            ' . $titleClm . '
                            <td>' . htmlspecialchars((string) $confKey) . '</td>
                            <td colspan="5"><em>No entries</em> (Page is excluded in this configuration)</td>
                        </tr>';
                }

                $c++;
            }
        } else {
            $message = ! empty($skipMessage) ? ' (' . $skipMessage . ')' : '';

            // Compile row:
            $content .= '
                <tr>
                    <td>' . $pageTitle . '</td>
                    <td colspan="6"><em>No entries</em>' . $message . '</td>
                </tr>';
        }

        return $content;
    }
1292
1293
    /*****************************
1294
     *
1295
     * CLI functions
1296
     *
1297
     *****************************/
1298
1299
    /**
     * Runs the functionality of the CLI (crawling URLs from queue).
     *
     * Cleans up the queue, fetches up to $countInARun queue records, assigns them to the
     * current process and reads their URLs one after another.
     *
     * @param int $countInARun Number of queue records to fetch for this run
     * @param int $sleepTime Value passed to usleep() after each processed URL
     * @param int $sleepAfterFinish Value passed to sleep() after the records have been processed
     * @return int Bit mask of the readUrl() results combined with CLI_STATUS_PROCESSED (at least
     *             one URL was read) and CLI_STATUS_ABORTED (the run was stopped early)
     */
    public function CLI_run(int $countInARun, int $sleepTime, int $sleepAfterFinish): int
    {
        $result = 0;
        $counter = 0;

        // Clean up the queue
        $this->queueRepository->cleanupQueue();

        // Select entries:
        $rows = $this->queueRepository->fetchRecordsToBeCrawled($countInARun);

        if (! empty($rows)) {
            $quidList = [];

            foreach ($rows as $r) {
                $quidList[] = $r['qid'];
            }

            $processId = $this->CLI_buildProcessId();

            // save the number of assigned queue entries to determine how many have been processed later
            $numberOfAffectedRows = $this->queueRepository->updateProcessIdAndSchedulerForQueueIds($quidList, $processId);
            $this->processRepository->updateProcessAssignItemsCount($numberOfAffectedRows, $processId);

            // Not all fetched entries could be assigned to this process: abort the run.
            if ($numberOfAffectedRows !== count($quidList)) {
                return $result | self::CLI_STATUS_ABORTED;
            }

            foreach ($rows as $r) {
                $result |= $this->readUrl($r['qid']);

                $counter++;
                // Just to relax the system
                usleep($sleepTime);

                // if during the start and the current read url the cli has been disable we need to return from the function
                // mark the process NOT as ended.
                if ($this->crawler->isDisabled()) {
                    return $result | self::CLI_STATUS_ABORTED;
                }

                if (! $this->processRepository->isProcessActive($this->CLI_buildProcessId())) {
                    // Logged through the logger that readUrl() uses as well.
                    $this->logger->debug('conflict / timeout (' . $this->CLI_buildProcessId() . ')');
                    $result |= self::CLI_STATUS_ABORTED;
                    // possible timeout
                    break;
                }
            }

            sleep($sleepAfterFinish);
        }

        if ($counter > 0) {
            $result |= self::CLI_STATUS_PROCESSED;
        }

        return $result;
    }
1360
1361
    /**
     * Registers a new crawler process under the given id, provided the limit
     * of concurrently running processes has not been reached.
     *
     * Active processes whose TTL is already in the past are collected as orphans:
     * they are not counted against the limit and are handed to
     * CLI_releaseProcesses() at the end of the call, whether or not a new
     * process could be registered.
     *
     * @param string $id identification string for the process
     * @return boolean true if the process row was inserted; false if no system
     *                 process id is available or the process limit is reached
     * @todo preemption might not be the most elegant way to clean up
     */
    public function CLI_checkAndAcquireNewProcess($id)
    {
        $systemProcessId = getmypid();
        if (! $systemProcessId) {
            return false;
        }

        $activeProcesses = $this->processRepository->findAllActive();
        $now = $this->getCurrentTime();

        $runningCount = 0;
        $expiredProcessIds = [];

        /** @var Process $process */
        foreach ($activeProcesses as $process) {
            if ($process->getTtl() >= $now) {
                // still within its time to live, so it counts against the limit
                $runningCount++;
                continue;
            }
            $expiredProcessIds[] = $process->getProcessId();
        }

        // only register a new process while fewer than the allowed number are running
        $mayAcquire = $runningCount < (int) $this->extensionSettings['processLimit'];
        if ($mayAcquire) {
            $connection = GeneralUtility::makeInstance(ConnectionPool::class)
                ->getConnectionForTable('tx_crawler_process');
            $connection->insert(
                'tx_crawler_process',
                [
                    'process_id' => $id,
                    'active' => 1,
                    'ttl' => $now + (int) $this->extensionSettings['processMaxRunTime'],
                    'system_process_id' => $systemProcessId,
                ]
            );
        }

        // the clean-up runs in both cases
        $this->processRepository->deleteProcessesMarkedAsDeleted();
        $this->CLI_releaseProcesses($expiredProcessIds);

        return $mayAcquire;
    }
1412
1413
    /**
     * Release a process and the required resources
     *
     * Runs two housekeeping statements for processes flagged as inactive, then
     * marks the requested processes as not active and detaches their queue
     * entries through the repositories.
     *
     * @param mixed $releaseIds string with a single process-id or array with multiple process-ids
     * @return boolean false if there was nothing to release, true otherwise
     */
    public function CLI_releaseProcesses($releaseIds)
    {
        if (! is_array($releaseIds)) {
            $releaseIds = [$releaseIds];
        }

        if (empty($releaseIds)) {
            //nothing to release
            return false;
        }

        // some kind of 2nd chance algo - this way you need at least 2 processes to have a real cleanup
        // this ensures that a single process can't mess up the entire process table

        // Detach all queue entries that are still assigned to a process which is no longer active.
        $this->getQueryBuilder($this->tableName)
            ->update($this->tableName, 'q')
            ->where(
                'q.process_id IN(SELECT p.process_id FROM tx_crawler_process as p WHERE p.active = 0)'
            )
            ->set('q.process_scheduled', 0)
            ->set('q.process_id', '')
            ->execute();

        // Reset the system process id of inactive processes that are referenced by queue
        // entries with exec_time = 0. A dedicated builder is used here so that no query
        // parts or bound parameters of the queue statement above leak into this one.
        $processQueryBuilder = $this->getQueryBuilder('tx_crawler_process');
        $processQueryBuilder
            ->update('tx_crawler_process')
            ->where(
                $processQueryBuilder->expr()->eq('active', 0),
                'process_id IN(SELECT q.process_id FROM tx_crawler_queue as q WHERE q.exec_time = 0)'
            )
            ->set('system_process_id', 0)
            ->execute();

        $this->processRepository->markRequestedProcessesAsNotActive($releaseIds);
        $this->queueRepository->unsetProcessScheduledAndProcessIdForQueueEntries($releaseIds);

        return true;
    }
1463
1464
    /**
     * Returns the id of the current process, generating it on first use.
     *
     * The id is derived once from the current microtime and then cached on the
     * instance, so repeated calls on the same object return the same value.
     *
     * @return string the ID
     */
    public function CLI_buildProcessId()
    {
        if ($this->processID) {
            return $this->processID;
        }

        $this->processID = GeneralUtility::shortMD5(microtime(true));

        return $this->processID;
    }
1476
1477
    /**
     * Builds an md5 fingerprint of a configuration array.
     *
     * The 'paramExpanded' and 'URLs' entries are left out before the array is
     * serialized, so they do not influence the resulting hash.
     *
     * @return string
     */
    protected function getConfigurationHash(array $configuration)
    {
        $ignoredKeys = ['paramExpanded' => true, 'URLs' => true];
        $hashedPart = array_diff_key($configuration, $ignoredKeys);

        return md5(serialize($hashedPart));
    }
1488
1489 1
    /**
     * Makes sure the value at index 1 is not larger than the value at index 2.
     *
     * If $reg[1] is greater than $reg[2] the two entries are exchanged; all
     * other entries are returned untouched.
     *
     * @param array $reg array whose entries 1 and 2 are compared
     * @return array the array with entries 1 and 2 in ascending order
     */
    protected function swapIfFirstIsLargerThanSecond(array $reg): array
    {
        if ($reg[1] > $reg[2]) {
            [$reg[1], $reg[2]] = [$reg[2], $reg[1]];
        }

        return $reg;
    }
1500
1501 5
    /**
     * Returns the value currently stored in the maximumUrlsToCompile property.
     */
    private function getMaximumUrlsToCompile(): int
    {
        return $this->maximumUrlsToCompile;
    }
1505
1506
    /**
     * Returns the backend user, caching it on the instance after the first call.
     *
     * Backend authentication is initialized on every call; the user object is
     * taken from $GLOBALS['BE_USER'] only while none has been stored yet.
     *
     * @return BackendUserAuthentication
     */
    private function getBackendUser()
    {
        // Make sure the _cli_ user is loaded
        Bootstrap::initializeBackendAuthentication();

        if ($this->backendUser !== null) {
            return $this->backendUser;
        }

        $this->backendUser = $GLOBALS['BE_USER'];

        return $this->backendUser;
    }
1518
1519
    /**
     * Creates a query builder for the given table via the connection pool.
     *
     * @return QueryBuilder
     */
    private function getQueryBuilder(string $table)
    {
        $connectionPool = GeneralUtility::makeInstance(ConnectionPool::class);

        return $connectionPool->getQueryBuilderForTable($table);
    }
1528
}
1529