Passed
Push — wip/remove-deprecations-for-v1... ( f97f5b )
by Tomas Norre
05:15
created

CrawlerController::readUrl()   B

Complexity

Conditions 11
Paths 18

Size

Total Lines 77
Code Lines 45

Duplication

Lines 0
Ratio 0 %

Code Coverage

Tests 32
CRAP Score 12.2796

Importance

Changes 3
Bugs 0 Features 0
Metric Value
cc 11
eloc 45
c 3
b 0
f 0
nc 18
nop 2
dl 0
loc 77
ccs 32
cts 41
cp 0.7805
crap 12.2796
rs 7.3166

How to fix   Long Method    Complexity   

Long Method

Small methods make your code easier to understand, in particular if combined with a good name. Besides, if your method is small, finding a good name is usually much easier.

For example, if you find yourself adding comments to a method's body, this is usually a good sign to extract the commented part to a new method, and use the comment as a starting point when coming up with a good name for this new method.

Commonly applied refactorings include:

1
<?php
2
3
declare(strict_types=1);
4
5
namespace AOE\Crawler\Controller;
6
7
/***************************************************************
8
 *  Copyright notice
9
 *
10
 *  (c) 2020 AOE GmbH <[email protected]>
11
 *
12
 *  All rights reserved
13
 *
14
 *  This script is part of the TYPO3 project. The TYPO3 project is
15
 *  free software; you can redistribute it and/or modify
16
 *  it under the terms of the GNU General Public License as published by
17
 *  the Free Software Foundation; either version 3 of the License, or
18
 *  (at your option) any later version.
19
 *
20
 *  The GNU General Public License can be found at
21
 *  http://www.gnu.org/copyleft/gpl.html.
22
 *
23
 *  This script is distributed in the hope that it will be useful,
24
 *  but WITHOUT ANY WARRANTY; without even the implied warranty of
25
 *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
26
 *  GNU General Public License for more details.
27
 *
28
 *  This copyright notice MUST APPEAR in all copies of the script!
29
 ***************************************************************/
30
31
use AOE\Crawler\Configuration\ExtensionConfigurationProvider;
32
use AOE\Crawler\Converter\JsonCompatibilityConverter;
33
use AOE\Crawler\Crawler;
34
use AOE\Crawler\CrawlStrategy\CrawlStrategyFactory;
35
use AOE\Crawler\Domain\Model\Process;
36
use AOE\Crawler\Domain\Repository\ConfigurationRepository;
37
use AOE\Crawler\Domain\Repository\ProcessRepository;
38
use AOE\Crawler\Domain\Repository\QueueRepository;
39
use AOE\Crawler\QueueExecutor;
40
use AOE\Crawler\Service\ConfigurationService;
41
use AOE\Crawler\Service\UrlService;
42
use AOE\Crawler\Service\UserService;
43
use AOE\Crawler\Value\QueueFilter;
44
use PDO;
45
use Psr\Http\Message\UriInterface;
46
use Psr\Log\LoggerAwareInterface;
47
use Psr\Log\LoggerAwareTrait;
48
use TYPO3\CMS\Backend\Tree\View\PageTreeView;
49
use TYPO3\CMS\Backend\Utility\BackendUtility;
50
use TYPO3\CMS\Core\Authentication\BackendUserAuthentication;
51
use TYPO3\CMS\Core\Compatibility\PublicMethodDeprecationTrait;
52
use TYPO3\CMS\Core\Compatibility\PublicPropertyDeprecationTrait;
53
use TYPO3\CMS\Core\Core\Bootstrap;
54
use TYPO3\CMS\Core\Core\Environment;
55
use TYPO3\CMS\Core\Database\Connection;
56
use TYPO3\CMS\Core\Database\ConnectionPool;
57
use TYPO3\CMS\Core\Database\Query\QueryBuilder;
58
use TYPO3\CMS\Core\Database\Query\Restriction\DeletedRestriction;
59
use TYPO3\CMS\Core\Database\QueryGenerator;
60
use TYPO3\CMS\Core\Domain\Repository\PageRepository;
61
use TYPO3\CMS\Core\Exception\SiteNotFoundException;
62
use TYPO3\CMS\Core\Imaging\Icon;
63
use TYPO3\CMS\Core\Imaging\IconFactory;
64
use TYPO3\CMS\Core\Routing\InvalidRouteArgumentsException;
65
use TYPO3\CMS\Core\Site\Entity\Site;
66
use TYPO3\CMS\Core\Type\Bitmask\Permission;
67
use TYPO3\CMS\Core\TypoScript\Parser\TypoScriptParser;
68
use TYPO3\CMS\Core\Utility\DebugUtility;
69
use TYPO3\CMS\Core\Utility\GeneralUtility;
70
use TYPO3\CMS\Core\Utility\MathUtility;
71
use TYPO3\CMS\Extbase\Object\ObjectManager;
72
73
/**
74
 * Class CrawlerController
75
 *
76
 * @package AOE\Crawler\Controller
77
 */
78
class CrawlerController implements LoggerAwareInterface
79
{
80
    use LoggerAwareTrait;
81
    use PublicMethodDeprecationTrait;
0 ignored issues
show
Bug introduced by
The trait TYPO3\CMS\Core\Compatibi...cMethodDeprecationTrait requires the property $deprecatedPublicMethods which is not provided by AOE\Crawler\Controller\CrawlerController.
Loading history...
82
    use PublicPropertyDeprecationTrait;
0 ignored issues
show
Bug introduced by
The trait TYPO3\CMS\Core\Compatibi...ropertyDeprecationTrait requires the property $deprecatedPublicProperties which is not provided by AOE\Crawler\Controller\CrawlerController.
Loading history...
83
84
    public const CLI_STATUS_NOTHING_PROCCESSED = 0;
85
86
    //queue not empty
87
    public const CLI_STATUS_REMAIN = 1;
88
89
    // (some) queue items were processed
90
    public const CLI_STATUS_PROCESSED = 2;
91
92
    //instance didn't finish
93
    public const CLI_STATUS_ABORTED = 4;
94
95
    public const CLI_STATUS_POLLABLE_PROCESSED = 8;
96
97
    /**
98
     * @var integer
99
     */
100
    public $setID = 0;
101
102
    /**
103
     * @var string
104
     */
105
    public $processID = '';
106
107
    /**
108
     * @var array
109
     */
110
    public $duplicateTrack = [];
111
112
    /**
113
     * @var array
114
     */
115
    public $downloadUrls = [];
116
117
    /**
118
     * @var array
119
     */
120
    public $incomingProcInstructions = [];
121
122
    /**
123
     * @var array
124
     */
125
    public $incomingConfigurationSelection = [];
126
127
    /**
128
     * @var bool
129
     */
130
    public $registerQueueEntriesInternallyOnly = false;
131
132
    /**
133
     * @var array
134
     */
135
    public $queueEntries = [];
136
137
    /**
138
     * @var array
139
     */
140
    public $urlList = [];
141
142
    /**
143
     * @var array
144
     */
145
    public $extensionSettings = [];
146
147
    /**
148
     * Mount Point
149
     *
150
     * @var bool
151
     * Todo: Check what this is used for and adjust the type hint or code, as bool doesn't match the current code.
152
     */
153
    public $MP = false;
154
155
    /**
156
     * @var QueueRepository
157
     */
158
    protected $queueRepository;
159
160
    /**
161
     * @var ProcessRepository
162
     */
163
    protected $processRepository;
164
165
    /**
166
     * @var ConfigurationRepository
167
     */
168
    protected $configurationRepository;
169
170
    /**
171
     * @var string
172
     */
173
    protected $tableName = 'tx_crawler_queue';
174
175
    /**
176
     * @var QueueExecutor
177
     */
178
    protected $queueExecutor;
179
180
    /**
181
     * @var int
182
     */
183
    protected $maximumUrlsToCompile = 10000;
184
185
    /**
186
     * @var IconFactory
187
     */
188
    protected $iconFactory;
189
190
    /**
191
     * @var BackendUserAuthentication|null
192
     */
193
    private $backendUser;
194
195
    /**
196
     * @var integer
197
     */
198
    private $scheduledTime = 0;
199
200
    /**
201
     * @var integer
202
     */
203
    private $reqMinute = 0;
204
205
    /**
206
     * @var bool
207
     */
208
    private $submitCrawlUrls = false;
209
210
    /**
211
     * @var bool
212
     */
213
    private $downloadCrawlUrls = false;
214
215
    /**
216
     * @var PageRepository
217
     */
218
    private $pageRepository;
219
220
    /**
221
     * @var Crawler
222
     */
223
    private $crawler;
224
225
    /************************************
226
     *
227
     * Getting URLs based on Page TSconfig
228
     *
229
     ************************************/
230
231 19
    /**
     * Wires up the repositories and services used by the controller and
     * normalises the extension settings.
     *
     * Settings that are missing or evaluate to zero fall back to defaults:
     * "countInARun" becomes 100, "processLimit" becomes 1 (forced into 1..99) and
     * "maxCompileUrls" becomes 10000 (forced into 1..1000000000).
     */
    public function __construct()
    {
        $objectManager = GeneralUtility::makeInstance(ObjectManager::class);
        $crawlStrategyFactory = GeneralUtility::makeInstance(CrawlStrategyFactory::class);
        $this->queueRepository = $objectManager->get(QueueRepository::class);
        $this->processRepository = $objectManager->get(ProcessRepository::class);
        $this->configurationRepository = $objectManager->get(ConfigurationRepository::class);
        $this->pageRepository = GeneralUtility::makeInstance(PageRepository::class);
        $this->queueExecutor = GeneralUtility::makeInstance(QueueExecutor::class, $crawlStrategyFactory);
        $this->iconFactory = GeneralUtility::makeInstance(IconFactory::class);
        $this->crawler = GeneralUtility::makeInstance(Crawler::class);

        // NOTE(review): no "processFilename" property declaration is visible in this part of the class,
        // so this creates a dynamic property (deprecated as of PHP 8.2) - confirm and declare it explicitly.
        $this->processFilename = Environment::getVarPath() . '/lock/tx_crawler.proc';

        /** @var ExtensionConfigurationProvider $configurationProvider */
        $configurationProvider = GeneralUtility::makeInstance(ExtensionConfigurationProvider::class);
        $settings = $configurationProvider->getExtensionConfiguration();
        $this->extensionSettings = is_array($settings) ? $settings : [];

        // Apply defaults. The "?? 0" guards keep a missing key from raising an
        // "undefined array key" warning; 0 triggers the same fallback as an empty value.
        if (MathUtility::convertToPositiveInteger($this->extensionSettings['countInARun'] ?? 0) === 0) {
            $this->extensionSettings['countInARun'] = 100;
        }

        $this->extensionSettings['processLimit'] = MathUtility::forceIntegerInRange(
            $this->extensionSettings['processLimit'] ?? 0,
            1,
            99,
            1
        );
        $this->setMaximumUrlsToCompile(
            MathUtility::forceIntegerInRange(
                $this->extensionSettings['maxCompileUrls'] ?? 0,
                1,
                1000000000,
                10000
            )
        );
    }
258
259 23
    /**
     * Overrides the stored limit for the number of URLs to compile.
     *
     * The value is stored as given; no range check happens here.
     */
    public function setMaximumUrlsToCompile(int $maximumUrlsToCompile): void
    {
        $this->maximumUrlsToCompile = $maximumUrlsToCompile;
    }
263
264
    /**
     * Replaces the extension settings held by this controller
     * (the unserialized counterpart of $TYPO3_CONF_VARS['EXT']['extConf']['crawler']).
     *
     * The array is stored as given; the defaults applied in the constructor are not re-applied.
     */
    public function setExtensionSettings(array $extensionSettings): void
    {
        $this->extensionSettings = $extensionSettings;
    }
271
272
    /**
     * Decides whether the given page must be left out of the crawl.
     *
     * Checks run in this order:
     *  1. the page's "hidden" flag (ignored when the "crawlHiddenPages" setting is enabled),
     *  2. the hard-coded doktype block list (3, 4, 199, 254, 255),
     *  3. doktype lists registered in $GLOBALS['TYPO3_CONF_VARS']['EXTCONF']['crawler']['excludeDoktype'],
     *  4. veto hooks registered in $GLOBALS['TYPO3_CONF_VARS']['EXTCONF']['crawler']['pageVeto'].
     *
     * @param array $pageRow Page record; the "hidden" and "doktype" columns are read
     * @return false|string false if the page should be crawled, otherwise a message explaining why it is skipped
     */
    public function checkIfPageShouldBeSkipped(array $pageRow)
    {
        // empty() keeps the original truthiness semantics without warnings on missing keys.
        if (empty($this->extensionSettings['crawlHiddenPages']) && ! empty($pageRow['hidden'])) {
            return 'Because page is hidden';
        }

        $doktype = (string) ($pageRow['doktype'] ?? '');

        if (GeneralUtility::inList('3,4,199,254,255', $doktype)) {
            return 'Because doktype is not allowed';
        }

        foreach ($GLOBALS['TYPO3_CONF_VARS']['EXTCONF']['crawler']['excludeDoktype'] ?? [] as $key => $doktypeList) {
            if (GeneralUtility::inList($doktypeList, $doktype)) {
                return 'Doktype was excluded by "' . $key . '"';
            }
        }

        // Veto hooks: a hook returns false if the page is fine, anything else vetoes the page.
        foreach ($GLOBALS['TYPO3_CONF_VARS']['EXTCONF']['crawler']['pageVeto'] ?? [] as $key => $func) {
            $params = [
                'pageRow' => $pageRow,
            ];
            $veto = GeneralUtility::callUserFunction($func, $params, $this);
            if ($veto === false) {
                continue;
            }
            if (is_string($veto)) {
                // The hook supplied its own skip message.
                return $veto;
            }
            // Hooks may be registered with integer keys; under strict_types
            // htmlspecialchars() rejects an int, so cast explicitly.
            return 'Veto from hook "' . htmlspecialchars((string) $key) . '"';
        }

        return false;
    }
311
312
    /**
     * Wrapper around getUrlsForPageId() that first validates the page row.
     * It returns an array of configurations, not a list of URLs.
     *
     * @param array $pageRow Page record with at least the doktype and uid columns.
     * @param string $skipMessage Set by reference: the reason the page was skipped, or '' when it was not.
     * @return array Configurations for the page; empty when the page is skipped.
     * @see getUrlsForPageId()
     */
    public function getUrlsForPageRow(array $pageRow, &$skipMessage = '')
    {
        // Only a real integer uid is accepted (numeric strings are rejected as well).
        if (! is_int($pageRow['uid'])) {
            $skipMessage = 'PageUid ' . $pageRow['uid'] . ' was not an integer';
            return [];
        }

        $skipReason = $this->checkIfPageShouldBeSkipped($pageRow);
        if ($skipReason !== false) {
            $skipMessage = $skipReason;
            return [];
        }

        $configurations = $this->getUrlsForPageId($pageRow['uid']);
        $skipMessage = '';

        return $configurations;
    }
339
340
    /**
     * Creates a list of URLs from the input array (and submits them to the queue if asked for).
     * See Web > Info module script + "indexed_search"'s crawler hook-client using this!
     *
     * @param array $vv Information about URLs from pageRow to crawl; reads the "URLs" and "subCfg" keys.
     * @param array $pageRow Page row; only "uid" is read.
     * @param int $scheduledTime Unix time to schedule indexing to, typically time()
     * @param int $reqMinute Number of requests per minute (creates the interleave between requests);
     *                       a value of 0 or less disables the interleave instead of dividing by zero
     * @param bool $submitCrawlUrls If set, submits the URLs to the queue
     * @param bool $downloadCrawlUrls If set (and $submitCrawlUrls is false), fills $downloadUrls with entries
     * @param array $duplicateTrack Passed by reference; holds one key per URL/userGroups/procInstrFilter
     *                              combination so that duplicates are not submitted twice
     * @param array $downloadUrls Passed by reference; filled with URLs for download if the flag is set.
     * @param array $incomingProcInstructions Array of processing instructions
     * @return string List of URLs joined with "<br>" (meant for display in the backend module)
     */
    public function urlListFromUrlArray(
        array $vv,
        array $pageRow,
        $scheduledTime,
        $reqMinute,
        $submitCrawlUrls,
        $downloadCrawlUrls,
        array &$duplicateTrack,
        array &$downloadUrls,
        array $incomingProcInstructions
    ) {
        if (! is_array($vv['URLs'] ?? null)) {
            return 'ERROR - no URL generated';
        }
        $urlLog = [];
        $pageId = (int) $pageRow['uid'];
        $configurationHash = $this->getConfigurationHash($vv);
        $skipInnerCheck = $this->queueRepository->noUnprocessedQueueEntriesForPageWithConfigurationHashExist($pageId, $configurationHash);

        $urlService = new UrlService();

        // Page TSconfig based configurations do not necessarily define these keys.
        $procInstrFilter = $vv['subCfg']['procInstrFilter'] ?? '';
        $userGroups = $vv['subCfg']['userGroups'] ?? '';

        // The filter result does not depend on the individual URL, so evaluate it once:
        // when it rejects the configuration, no URL is listed at all.
        if (! $this->drawURLs_PIfilter($procInstrFilter, $incomingProcInstructions)) {
            return '';
        }

        // Seconds between two scheduled requests; guard against division by zero.
        $secondsPerRequest = $reqMinute > 0 ? 60 / $reqMinute : 0;

        foreach ($vv['URLs'] as $urlQuery) {
            $url = (string) $urlService->getUrlFromPageAndQueryParameters(
                $pageId,
                $urlQuery,
                $vv['subCfg']['baseUrl'] ?? null,
                $vv['subCfg']['force_ssl'] ?? 0
            );

            // Create key by which to determine uniqueness:
            $uKey = $url . '|' . $userGroups . '|' . $procInstrFilter;

            if (isset($duplicateTrack[$uKey])) {
                // The URL key is already registered: just display it and do not resubmit it.
                $urlLog[] = '<em><span class="text-muted">' . htmlspecialchars($url) . '</span></em>';
            } else {
                // Scheduled time, spread by the number of URLs tracked so far and rounded down to a full minute:
                $schTime = $scheduledTime + round(count($duplicateTrack) * $secondsPerRequest);
                $schTime = intval($schTime / 60) * 60;
                $formattedDate = BackendUtility::datetime($schTime);
                $this->urlList[] = '[' . $formattedDate . '] ' . $url;
                $urlList = '[' . $formattedDate . '] ' . htmlspecialchars($url);

                // Submit for crawling!
                if ($submitCrawlUrls) {
                    // NOTE(review): the unspread $scheduledTime is passed here, not $schTime - confirm this is intended.
                    $added = $this->addUrl(
                        $pageId,
                        $url,
                        $vv['subCfg'],
                        $scheduledTime,
                        $configurationHash,
                        $skipInnerCheck
                    );
                    if ($added === false) {
                        $urlList .= ' (URL already existed)';
                    }
                } elseif ($downloadCrawlUrls) {
                    $downloadUrls[$url] = $url;
                }
                $urlLog[] = $urlList;
            }
            $duplicateTrack[$uKey] = true;
        }

        return implode('<br>', $urlLog);
    }
424
425
    /**
     * Tells whether at least one incoming processing instruction is contained in the given list.
     *
     * @param string $piString Comma-separated list of processing instructions to test against
     * @param array $incomingProcInstructions Processing instructions; an empty array accepts everything
     * @return bool
     */
    public function drawURLs_PIfilter($piString, array $incomingProcInstructions)
    {
        // Nothing to filter on: accept.
        if ($incomingProcInstructions === []) {
            return true;
        }

        $matched = false;
        foreach ($incomingProcInstructions as $instruction) {
            if (GeneralUtility::inList($piString, $instruction)) {
                $matched = true;
                break;
            }
        }

        return $matched;
    }
445
446 5
    /**
     * Loads the page TSconfig for a page and lets registered hooks alter it.
     *
     * When a mount point value is set in $this->MP, the TSconfig is read for the id found
     * after the "-" in that value instead of for $id.
     *
     * @param int $id Page uid (handed to BackendUtility::getPagesTSconfig())
     * @return array Page TSconfig, possibly modified by the "getPageTSconfigForId" hooks
     */
    public function getPageTSconfigForId($id): array
    {
        if (! $this->MP) {
            $pageTSconfig = BackendUtility::getPagesTSconfig($id);
        } else {
            // $this->MP is declared as bool but used like a "<id>-<id>" string; cast explicitly
            // because explode() rejects non-strings under strict_types.
            // TODO: Please check the intended type of $this->MP.
            $mountPointParts = explode('-', (string) $this->MP);
            // Fall back to 0 when the value carries no "-<id>" part.
            $mountPointId = (int) ($mountPointParts[1] ?? 0);
            $pageTSconfig = BackendUtility::getPagesTSconfig($mountPointId);
        }

        // Call hooks to alter the configuration; they receive the TSconfig by reference.
        $hooks = $GLOBALS['TYPO3_CONF_VARS']['EXTCONF']['crawler']['getPageTSconfigForId'] ?? [];
        if (is_array($hooks)) {
            $params = [
                'pageId' => $id,
                'pageTSConfig' => &$pageTSconfig,
            ];
            foreach ($hooks as $userFunc) {
                GeneralUtility::callUserFunction($userFunc, $params, $this);
            }
        }

        return $pageTSconfig;
    }
468
469
    /**
     * Collects the crawler configurations that apply to a page.
     * Returns configurations only; no URLs are added to the queue.
     *
     * Sources, in order:
     *  1. "tx_crawler.crawlerCfg.paramSets" from the page TSconfig (origin "pagets"),
     *  2. tx_crawler_configuration records from the rootline (origin "tx_crawler_configuration_<uid>");
     *     these never overwrite a TSconfig set of the same name.
     * Finally the "processUrls" hooks may modify the result by reference.
     *
     * @param int $pageId Page uid
     * @return array Configurations keyed by name, each with the keys
     *               "subCfg", "paramParsed", "paramExpanded", "URLs" and "origin"
     */
    public function getUrlsForPageId(int $pageId): array
    {
        // Get page TSconfig for page ID
        $pageTSconfig = $this->getPageTSconfigForId($pageId);

        $res = [];

        // Fetch Crawler Configuration from pageTSconfig
        $crawlerCfg = $pageTSconfig['tx_crawler.']['crawlerCfg.']['paramSets.'] ?? [];
        foreach ($crawlerCfg as $key => $values) {
            // Only "name." entries (arrays) describe a set; "name" itself holds the parameter string.
            if (! is_array($values)) {
                continue;
            }
            $key = str_replace('.', '', (string) $key);
            // Sub configuration for a single configuration string:
            $subCfg = (array) ($crawlerCfg[$key . '.'] ?? []);
            $subCfg['key'] = $key;

            if ((string) ($subCfg['procInstrFilter'] ?? '') !== '') {
                $subCfg['procInstrFilter'] = implode(',', GeneralUtility::trimExplode(',', $subCfg['procInstrFilter']));
            }
            $pidsOnly = (string) ($subCfg['pidsOnly'] ?? '');
            $pidOnlyList = implode(',', GeneralUtility::trimExplode(',', $pidsOnly, true));

            // Process the configuration only if it is not page-specific or if the specific page is the current page:
            // TODO: Check if $pidOnlyList can be kept as Array instead of imploded
            if ($pidsOnly !== '' && ! GeneralUtility::inList($pidOnlyList, strval($pageId))) {
                continue;
            }

            // Explode, process etc.:
            $paramParsed = GeneralUtility::explodeUrl2Array((string) ($crawlerCfg[$key] ?? ''));
            $paramExpanded = $this->expandParameters($paramParsed, $pageId);

            // Recognize MP value
            $baseQuery = '?id=' . $pageId;
            if ($this->MP) {
                $baseQuery .= '&MP=' . $this->MP;
            }

            $res[$key] = [
                'subCfg' => $subCfg,
                'paramParsed' => $paramParsed,
                'paramExpanded' => $paramExpanded,
                'origin' => 'pagets',
                'URLs' => $this->compileUrls($paramExpanded, [$baseQuery]),
            ];
        }

        // Get configuration from tx_crawler_configuration records up the rootline
        $crawlerConfigurations = $this->configurationRepository->getCrawlerConfigurationRecordsFromRootLine($pageId);
        foreach ($crawlerConfigurations as $configurationRecord) {
            // Check access to the configuration record
            $hasAccess = empty($configurationRecord['begroups'])
                || $this->getBackendUser()->isAdmin()
                || UserService::hasGroupAccess(
                    $this->getBackendUser()->user['usergroup_cached_list'],
                    $configurationRecord['begroups']
                );
            if (! $hasAccess) {
                continue;
            }

            // Cast before comparing: a NULL column would make strcmp() throw under strict_types.
            $pidsOnly = (string) ($configurationRecord['pidsonly'] ?? '');
            $pidOnlyList = implode(',', GeneralUtility::trimExplode(',', $pidsOnly, true));

            // Process the configuration only if it is not page-specific or if the specific page is the current page:
            // TODO: Check if $pidOnlyList can be kept as Array instead of imploded
            if ($pidsOnly !== '' && ! GeneralUtility::inList($pidOnlyList, strval($pageId))) {
                continue;
            }

            $key = $configurationRecord['name'];

            // Don't overwrite previously defined paramSets
            if (isset($res[$key])) {
                continue;
            }

            /* @var $TSparserObject TypoScriptParser */
            $TSparserObject = GeneralUtility::makeInstance(TypoScriptParser::class);
            $TSparserObject->parse($configurationRecord['processing_instruction_parameters_ts']);

            $subCfg = [
                'procInstrFilter' => $configurationRecord['processing_instruction_filter'],
                'procInstrParams.' => $TSparserObject->setup,
                'baseUrl' => $configurationRecord['base_url'],
                'force_ssl' => (int) $configurationRecord['force_ssl'],
                'userGroups' => $configurationRecord['fegroups'],
                'exclude' => $configurationRecord['exclude'],
                'key' => $key,
            ];

            if (in_array($pageId, $this->expandExcludeString($subCfg['exclude']), true)) {
                continue;
            }

            $paramParsed = GeneralUtility::explodeUrl2Array($configurationRecord['configuration']);
            $paramExpanded = $this->expandParameters($paramParsed, $pageId);

            // Unlike the TSconfig branch, no MP value is appended to the base query here.
            $res[$key] = [
                'subCfg' => $subCfg,
                'paramParsed' => $paramParsed,
                'paramExpanded' => $paramExpanded,
                'URLs' => $this->compileUrls($paramExpanded, ['?id=' . $pageId]),
                'origin' => 'tx_crawler_configuration_' . $configurationRecord['uid'],
            ];
        }

        // Let hooks post-process the collected configurations (passed by reference).
        foreach ($GLOBALS['TYPO3_CONF_VARS']['EXTCONF']['crawler']['processUrls'] ?? [] as $func) {
            $params = [
                'res' => &$res,
            ];
            GeneralUtility::callUserFunction($func, $params, $this);
        }

        return $res;
    }
567
568
    /**
     * Lists the names of all crawler configurations that apply to a page branch.
     *
     * Names come from the "paramSets" in the page TSconfig of $rootid, followed by the names of
     * tx_crawler_configuration records stored on any page of the rootline of $rootid or of
     * the page tree below it (down to $depth levels).
     *
     * TODO: Write Functional Tests
     *
     * @param int $rootid Uid of the branch's root page
     * @param int $depth Number of tree levels handed to PageTreeView::getTree()
     * @return array Configuration names; duplicates are not removed
     */
    public function getConfigurationsForBranch(int $rootid, int $depth): array
    {
        $names = [];

        // Parameter sets from page TSconfig: only array entries describe a set; strip one trailing dot.
        $tsConfig = $this->getPageTSconfigForId($rootid);
        foreach ($tsConfig['tx_crawler.']['crawlerCfg.']['paramSets.'] ?? [] as $setKey => $setValue) {
            if (is_array($setValue)) {
                $names[] = substr($setKey, -1) === '.' ? substr($setKey, 0, -1) : $setKey;
            }
        }

        // Collect the page ids of the rootline ...
        $pageIds = [];
        foreach (BackendUtility::BEgetRootLine($rootid) as $rootLineEntry) {
            $pageIds[] = $rootLineEntry['uid'];
        }

        // ... and of the subtree the backend user is allowed to see.
        /* @var PageTreeView $pageTree */
        $pageTree = GeneralUtility::makeInstance(PageTreeView::class);
        $permissionClause = $this->getBackendUser()->getPagePermsClause(Permission::PAGE_SHOW);
        $pageTree->init(empty($permissionClause) ? '' : ('AND ' . $permissionClause));
        $pageTree->getTree($rootid, $depth, '');
        foreach ($pageTree->tree as $treeEntry) {
            $pageIds[] = $treeEntry['row']['uid'];
        }

        // Configuration records stored on any of the collected pages.
        $queryBuilder = $this->getQueryBuilder('tx_crawler_configuration');
        $pidConstraint = $queryBuilder->expr()->in(
            'pid',
            $queryBuilder->createNamedParameter($pageIds, Connection::PARAM_INT_ARRAY)
        );
        $result = $queryBuilder
            ->select('name')
            ->from('tx_crawler_configuration')
            ->where($pidConstraint)
            ->execute();

        while ($record = $result->fetch()) {
            $names[] = $record['name'];
        }

        return $names;
    }
611
612
    /**
613
     * Will expand the parameters configuration to individual values. This follows a certain syntax of the value of each parameter.
614
     * Syntax of values:
615
     * - Basically: If the value is wrapped in [...] it will be expanded according to the following syntax, otherwise the value is taken literally
616
     * - Configuration is split by "|" and the parts are processed individually and finally added together
617
     * - For each configuration part:
618
     *         - "[int]-[int]" = Integer range, will be expanded to all values in between, values included, starting from low to high (max. 1000). Example "1-34" or "-40--30"
619
     *         - "_TABLE:[TCA table name];[_PID:[optional page id, default is current page]];[_ENABLELANG:1]" = Look up of table records from PID, filtering out deleted records. Example "_TABLE:tt_content; _PID:123"
620
     *        _ENABLELANG:1 picks only original records without their language overlays
621
     *         - Default: Literal value
622
     *
623
     * @param array $paramArray Array with key (GET var name) and values (value of GET var which is configuration for expansion)
624
     * @param integer $pid Current page ID
625
     * @return array
626
     *
627
     * TODO: Write Functional Tests
628
     */
629 10
    public function expandParameters($paramArray, $pid)
630
    {
631
        // Traverse parameter names:
632 10
        foreach ($paramArray as $p => $v) {
633 10
            $v = trim($v);
634
635
            // If value is encapsulated in square brackets it means there are some ranges of values to find, otherwise the value is literal
636 10
            if (strpos($v, '[') === 0 && substr($v, -1) === ']') {
637
                // So, find the value inside brackets and reset the paramArray value as an array.
638 10
                $v = substr($v, 1, -1);
639 10
                $paramArray[$p] = [];
640
641
                // Explode parts and traverse them:
642 10
                $parts = explode('|', $v);
643 10
                foreach ($parts as $pV) {
644
645
                    // Look for integer range: (fx. 1-34 or -40--30 // reads minus 40 to minus 30)
646 10
                    if (preg_match('/^(-?[0-9]+)\s*-\s*(-?[0-9]+)$/', trim($pV), $reg)) {
647 1
                        $reg = $this->swapIfFirstIsLargerThanSecond($reg);
648
649
                        // Traverse range, add values:
650
                        // Limit to size of range!
651 1
                        $runAwayBrake = 1000;
652 1
                        for ($a = $reg[1]; $a <= $reg[2]; $a++) {
653 1
                            $paramArray[$p][] = $a;
654 1
                            $runAwayBrake--;
655 1
                            if ($runAwayBrake <= 0) {
656
                                break;
657
                            }
658
                        }
659 9
                    } elseif (strpos(trim($pV), '_TABLE:') === 0) {
660
661
                        // Parse parameters:
662 6
                        $subparts = GeneralUtility::trimExplode(';', $pV);
663 6
                        $subpartParams = [];
664 6
                        foreach ($subparts as $spV) {
665 6
                            [$pKey, $pVal] = GeneralUtility::trimExplode(':', $spV);
666 6
                            $subpartParams[$pKey] = $pVal;
667
                        }
668
669
                        // Table exists:
670 6
                        if (isset($GLOBALS['TCA'][$subpartParams['_TABLE']])) {
671 6
                            $lookUpPid = isset($subpartParams['_PID']) ? intval($subpartParams['_PID']) : intval($pid);
672 6
                            $recursiveDepth = isset($subpartParams['_RECURSIVE']) ? intval($subpartParams['_RECURSIVE']) : 0;
673 6
                            $pidField = isset($subpartParams['_PIDFIELD']) ? trim($subpartParams['_PIDFIELD']) : 'pid';
674 6
                            $where = $subpartParams['_WHERE'] ?? '';
675 6
                            $addTable = $subpartParams['_ADDTABLE'] ?? '';
676
677 6
                            $fieldName = $subpartParams['_FIELD'] ? $subpartParams['_FIELD'] : 'uid';
678 6
                            if ($fieldName === 'uid' || $GLOBALS['TCA'][$subpartParams['_TABLE']]['columns'][$fieldName]) {
679 6
                                $queryBuilder = $this->getQueryBuilder($subpartParams['_TABLE']);
680
681 6
                                if ($recursiveDepth > 0) {
682
                                    /** @var QueryGenerator $queryGenerator */
683 2
                                    $queryGenerator = GeneralUtility::makeInstance(QueryGenerator::class);
684 2
                                    $pidList = $queryGenerator->getTreeList($lookUpPid, $recursiveDepth, 0, 1);
685 2
                                    $pidArray = GeneralUtility::intExplode(',', $pidList);
686
                                } else {
687 4
                                    $pidArray = [(string) $lookUpPid];
688
                                }
689
690 6
                                $queryBuilder->getRestrictions()
691 6
                                    ->removeAll()
692 6
                                    ->add(GeneralUtility::makeInstance(DeletedRestriction::class));
693
694
                                $queryBuilder
695 6
                                    ->select($fieldName)
696 6
                                    ->from($subpartParams['_TABLE'])
697 6
                                    ->where(
698 6
                                        $queryBuilder->expr()->in($pidField, $queryBuilder->createNamedParameter($pidArray, Connection::PARAM_INT_ARRAY)),
699
                                        $where
700
                                    );
701
702 6
                                if (! empty($addTable)) {
703
                                    // TODO: Check if this works as intended!
704
                                    $queryBuilder->add('from', $addTable);
705
                                }
706 6
                                $transOrigPointerField = $GLOBALS['TCA'][$subpartParams['_TABLE']]['ctrl']['transOrigPointerField'];
707
708 6
                                if ($subpartParams['_ENABLELANG'] && $transOrigPointerField) {
709
                                    $queryBuilder->andWhere(
710
                                        $queryBuilder->expr()->lte(
711
                                            $transOrigPointerField,
712
                                            0
713
                                        )
714
                                    );
715
                                }
716
717 6
                                $statement = $queryBuilder->execute();
718
719 6
                                $rows = [];
720 6
                                while ($row = $statement->fetch()) {
721 6
                                    $rows[$row[$fieldName]] = $row;
722
                                }
723
724 6
                                if (is_array($rows)) {
725 6
                                    $paramArray[$p] = array_merge($paramArray[$p], array_keys($rows));
726
                                }
727
                            }
728
                        }
729
                    } else {
730
                        // Just add value:
731 3
                        $paramArray[$p][] = $pV;
732
                    }
733
                    // Hook for processing own expandParameters place holder
734 10
                    if (is_array($GLOBALS['TYPO3_CONF_VARS']['SC_OPTIONS']['crawler/class.tx_crawler_lib.php']['expandParameters'])) {
735
                        $_params = [
736
                            'pObj' => &$this,
737
                            'paramArray' => &$paramArray,
738
                            'currentKey' => $p,
739
                            'currentValue' => $pV,
740
                            'pid' => $pid,
741
                        ];
742
                        foreach ($GLOBALS['TYPO3_CONF_VARS']['SC_OPTIONS']['crawler/class.tx_crawler_lib.php']['expandParameters'] as $_funcRef) {
743
                            GeneralUtility::callUserFunction($_funcRef, $_params, $this);
744
                        }
745
                    }
746
                }
747
748
                // Make unique set of values and sort array by key:
749 10
                $paramArray[$p] = array_unique($paramArray[$p]);
750 10
                ksort($paramArray);
751
            } else {
752
                // Set the literal value as only value in array:
753 3
                $paramArray[$p] = [$v];
754
            }
755
        }
756
757 10
        return $paramArray;
758
    }
759
760
    /**
     * Compiles URLs from a parameter array (output of expandParameters()).
     *
     * Every URL accumulated so far is combined with every value of the next GET
     * variable, so the number of URLs is the product of the value counts of all keys.
     * Values that are empty strings leave the URL untouched. While expanding one key,
     * no more than getMaximumUrlsToCompile() URLs are collected.
     *
     * @param array $paramArray Output of expandParameters(): Array with keys (GET var names) and for each an array of values
     * @param array $urls URLs accumulated in this array (the base URLs that get extended)
     * @return array
     */
    public function compileUrls($paramArray, array $urls)
    {
        // One pass per GET variable; the remaining variables shrink by one each round.
        while (! empty($paramArray)) {
            // key() has to be read before array_shift() removes the first element.
            $getVarName = key($paramArray);
            $values = array_shift($paramArray);

            $expandedUrls = [];
            foreach ($urls as $baseUrl) {
                foreach ($values as $value) {
                    if (count($expandedUrls) >= $this->getMaximumUrlsToCompile()) {
                        continue;
                    }

                    $valueAsString = (string) $value;
                    $suffix = '';
                    if ($valueAsString !== '') {
                        $suffix = '&' . rawurlencode($getVarName) . '=' . rawurlencode($valueAsString);
                    }
                    $expandedUrls[] = $baseUrl . $suffix;
                }
            }

            $urls = $expandedUrls;
        }

        return $urls;
    }
787
788
    /************************************
789
     *
790
     * Crawler log
791
     *
792
     ************************************/
793
794
    /**
     * Adds a call back entry to the queue table (typically called from hooks, see
     * indexed search class "class.crawler.php").
     *
     * The call back reference is stored inside the JSON-encoded parameters under the
     * key "_CALLBACKOBJ". When no schedule time is given, the current time is used.
     *
     * @param integer $setId Set ID
     * @param array $params Parameters to pass to call back function
     * @param string $callBack Call back object reference, eg. 'EXT:indexed_search/class.crawler.php:&tx_indexedsearch_crawler'
     * @param integer $page_id Page ID to attach it to
     * @param integer $schedule Time at which to activate
     */
    public function addQueueEntry_callBack($setId, $params, $callBack, $page_id = 0, $schedule = 0): void
    {
        // Anything that is not an array is discarded and replaced by an empty parameter set.
        $callBackParameters = is_array($params) ? $params : [];
        $callBackParameters['_CALLBACKOBJ'] = $callBack;

        $queueConnection = GeneralUtility::makeInstance(ConnectionPool::class)
            ->getConnectionForTable('tx_crawler_queue');

        $scheduledTime = (int) $schedule;
        if (! $scheduledTime) {
            $scheduledTime = $this->getCurrentTime();
        }

        $queueRecord = [
            'page_id' => (int) $page_id,
            'parameters' => json_encode($callBackParameters),
            'scheduled' => $scheduledTime,
            'exec_time' => 0,
            'set_id' => (int) $setId,
            'result_data' => '',
        ];

        $queueConnection->insert('tx_crawler_queue', $queueRecord);
    }
823
824
    /************************************
825
     *
826
     * URL setting
827
     *
828
     ************************************/
829
830
    /**
     * Setting a URL for crawling:
     *
     * Builds the queue record for the URL and either registers it in
     * $this->queueEntries (when entries are registered internally only) or inserts
     * it into tx_crawler_queue, unless an equal entry is reported by the queue
     * repository.
     *
     * @param integer $id Page ID
     * @param string $url Complete URL
     * @param array $subCfg Sub configuration array (from TS config); the keys "userGroups", "procInstrFilter", "procInstrParams." and "key" are optional
     * @param integer $tstamp Scheduled-time
     * @param string $configurationHash (optional) configuration hash
     * @param bool $skipInnerDuplicationCheck (optional) skip inner duplication check
     * @return bool TRUE only when a new row was inserted into tx_crawler_queue
     */
    public function addUrl(
        $id,
        $url,
        array $subCfg,
        $tstamp,
        $configurationHash = '',
        $skipInnerDuplicationCheck = false
    ) {
        // Creating parameters:
        $parameters = [
            'url' => $url,
        ];

        // fe user group simulation:
        // The sub configuration keys are optional; default them instead of reading undefined indexes.
        $uGs = implode(',', array_unique(GeneralUtility::intExplode(',', (string) ($subCfg['userGroups'] ?? ''), true)));
        if ($uGs) {
            $parameters['feUserGroupList'] = $uGs;
        }

        // Setting processing instructions
        $parameters['procInstructions'] = GeneralUtility::trimExplode(',', (string) ($subCfg['procInstrFilter'] ?? ''));
        if (is_array($subCfg['procInstrParams.'] ?? null)) {
            $parameters['procInstrParams'] = $subCfg['procInstrParams.'];
        }

        // Compile value array:
        $parameters_serialized = json_encode($parameters);
        $fieldArray = [
            'page_id' => (int) $id,
            'parameters' => $parameters_serialized,
            'parameters_hash' => GeneralUtility::shortMD5($parameters_serialized),
            'configuration_hash' => $configurationHash,
            'scheduled' => $tstamp,
            'exec_time' => 0,
            'set_id' => (int) $this->setID,
            'result_data' => '',
            'configuration' => $subCfg['key'] ?? null,
        ];

        if ($this->registerQueueEntriesInternallyOnly) {
            // the entries will only be registered and not stored to the database
            $this->queueEntries[] = $fieldArray;

            return false;
        }

        $duplicates = [];
        if (! $skipInnerDuplicationCheck) {
            // check if there is already an equal entry
            $duplicates = $this->queueRepository->getDuplicateQueueItemsIfExists(
                (bool) $this->extensionSettings['enableTimeslot'],
                $tstamp,
                $this->getCurrentTime(),
                $fieldArray['page_id'],
                $fieldArray['parameters_hash']
            );
        }

        if (! empty($duplicates)) {
            return false;
        }

        GeneralUtility::makeInstance(ConnectionPool::class)
            ->getConnectionForTable('tx_crawler_queue')
            ->insert('tx_crawler_queue', $fieldArray);

        return true;
    }
912
913
    /**
     * Returns the current system time as a Unix timestamp (seconds), as reported by time().
     *
     * @return int
     */
    public function getCurrentTime()
    {
        $now = time();

        return $now;
    }
922
923
    /************************************
924
     *
925
     * URL reading
926
     *
927
     ************************************/
928
929
    /**
     * Read URL for single queue entry
     *
     * Loads the queue record, stamps exec_time (and the id of the handling process,
     * when one is set) on it, passes it to the queue executor and finally stores the
     * JSON-encoded result in the record's result_data field.
     *
     * @param integer $queueId
     * @param boolean $force If set, will process even if exec_time has been set!
     * @return integer Status bitmask: CLI_STATUS_POLLABLE_PROCESSED is set when a configured pollable
     *                 processing instruction reported success; 0 otherwise, including the case that
     *                 no matching queue record was found
     */
    public function readUrl($queueId, $force = false)
    {
        $queryBuilder = GeneralUtility::makeInstance(ConnectionPool::class)->getQueryBuilderForTable($this->tableName);
        $ret = 0;
        $this->logger->debug('crawler-readurl start ' . microtime(true));

        // Get entry:
        $queryBuilder
            ->select('*')
            ->from('tx_crawler_queue')
            ->where(
                $queryBuilder->expr()->eq('qid', $queryBuilder->createNamedParameter($queueId, PDO::PARAM_INT))
            );
        if (! $force) {
            $queryBuilder
                ->andWhere('exec_time = 0')
                ->andWhere('process_scheduled > 0');
        }
        $queueRec = $queryBuilder->execute()->fetch();

        if (! is_array($queueRec)) {
            // Nothing to process. Return the empty status instead of null, since callers
            // OR the return value into a bitmask.
            return $ret;
        }

        // Set exec_time to lock record:
        $field_array = ['exec_time' => $this->getCurrentTime()];

        if (isset($this->processID)) {
            // if multiprocessing is used we need to store the id of the process which has handled this entry
            $field_array['process_id_completed'] = $this->processID;
        }

        GeneralUtility::makeInstance(ConnectionPool::class)->getConnectionForTable('tx_crawler_queue')
            ->update(
                'tx_crawler_queue',
                $field_array,
                ['qid' => (int) $queueId]
            );

        $result = $this->queueExecutor->executeQueueItem($queueRec, $this);
        // "??" also covers a result without a "content" key (or a non-array result).
        if (($result['content'] ?? null) === null) {
            $resultData = 'An errors happened';
        } else {
            /** @var JsonCompatibilityConverter $jsonCompatibilityConverter */
            $jsonCompatibilityConverter = GeneralUtility::makeInstance(JsonCompatibilityConverter::class);
            $resultData = $jsonCompatibilityConverter->convert($result['content']);
        }

        // atm there's no need to point to specific pollable extensions
        $pollables = $GLOBALS['TYPO3_CONF_VARS']['EXTCONF']['crawler']['pollSuccess'] ?? null;
        // $resultData is a plain string on the error path above, so only inspect it when it is an array.
        if (is_array($pollables) && is_array($resultData)) {
            $procInstructions = $resultData['parameters']['procInstructions'] ?? null;
            if (is_array($procInstructions)) {
                foreach ($pollables as $pollable) {
                    // only check the success value if the instruction is running
                    // it is important to name the pollSuccess key same as the procInstructions key
                    if (in_array($pollable, $procInstructions, true) && ! empty($resultData['success'][$pollable])) {
                        $ret |= self::CLI_STATUS_POLLABLE_PROCESSED;
                    }
                }
            }
        }

        // Set result in log which also denotes the end of the processing of this entry.
        $field_array = ['result_data' => json_encode($result)];

        GeneralUtility::makeInstance(ConnectionPool::class)->getConnectionForTable('tx_crawler_queue')
            ->update(
                'tx_crawler_queue',
                $field_array,
                ['qid' => (int) $queueId]
            );

        $this->logger->debug('crawler-readurl stop ' . microtime(true));

        return $ret;
    }
1014
1015
    /**
     * Read URL for not-yet-inserted log-entry
     *
     * Inserts the given record (with exec_time set to the current time) into the
     * queue table, hands it to the queue executor and writes the JSON-encoded
     * result back into the inserted row.
     *
     * @param array $field_array Queue field array,
     *
     * @return array|bool|mixed|string The value returned by the queue executor
     */
    public function readUrlFromArray($field_array)
    {
        // Set exec_time to lock record:
        $field_array['exec_time'] = $this->getCurrentTime();

        $queueConnection = GeneralUtility::makeInstance(ConnectionPool::class)
            ->getConnectionForTable($this->tableName);
        $queueConnection->insert($this->tableName, $field_array);
        $insertedQueueId = $queueConnection->lastInsertId($this->tableName, 'qid');

        $executionResult = $this->queueExecutor->executeQueueItem($field_array, $this);

        // Set result in log which also denotes the end of the processing of this entry.
        $queueConnection->update(
            $this->tableName,
            ['result_data' => json_encode($executionResult)],
            ['qid' => $insertedQueueId]
        );

        return $executionResult;
    }
1045
1046
    /*****************************
1047
     *
1048
     * Compiling URLs to crawl - tools
1049
     *
1050
     *****************************/
1051
1052
    /**
     * Walks the page tree below the given root page and collects the rows produced
     * by drawURLs_addRowsForPage() for every page (and for pages mounted via mount points).
     *
     * The request settings are stored on the object first, because
     * drawURLs_addRowsForPage() reads them from there.
     *
     * @param integer $id Root page id to start from.
     * @param integer $depth Depth of tree, 0=only id-page, 1= on sublevel, 99 = infinite
     * @param integer $scheduledTime Unix Time when the URL is timed to be visited when put in queue
     * @param integer $reqMinute Number of requests per minute (creates the interleave between requests)
     * @param boolean $submitCrawlUrls If set, submits the URLs to queue in database (real crawling)
     * @param boolean $downloadCrawlUrls If set (and submitcrawlUrls is false) will fill $downloadUrls with entries)
     * @param array $incomingProcInstructions Array of processing instructions
     * @param array $configurationSelection Array of configuration keys
     * @return string Concatenated table rows for all traversed pages
     */
    public function getPageTreeAndUrls(
        $id,
        $depth,
        $scheduledTime,
        $reqMinute,
        $submitCrawlUrls,
        $downloadCrawlUrls,
        array $incomingProcInstructions,
        array $configurationSelection
    ) {
        $this->scheduledTime = $scheduledTime;
        $this->reqMinute = $reqMinute;
        $this->submitCrawlUrls = $submitCrawlUrls;
        $this->downloadCrawlUrls = $downloadCrawlUrls;
        $this->incomingProcInstructions = $incomingProcInstructions;
        $this->incomingConfigurationSelection = $configurationSelection;

        $this->duplicateTrack = [];
        $this->downloadUrls = [];

        // Drawing tree:
        /* @var PageTreeView $pageTree */
        $pageTree = GeneralUtility::makeInstance(PageTreeView::class);
        $permsClause = $this->getBackendUser()->getPagePermsClause(Permission::PAGE_SHOW);
        $pageTree->init('AND ' . $permsClause);

        // Set root row (only when the page is readable with the permission clause):
        $rootPageInfo = BackendUtility::readPageAccess($id, $permsClause);
        if (is_array($rootPageInfo)) {
            $pageTree->tree[] = [
                'row' => $rootPageInfo,
                'HTML' => $this->iconFactory->getIconForRecord('pages', $rootPageInfo, Icon::SIZE_SMALL),
            ];
        }

        // Get branch beneath:
        if ($depth) {
            $pageTree->getTree($id, $depth, '');
        }

        // Traverse page tree:
        $rowsHtml = '';

        foreach ($pageTree->tree as $treeItem) {
            $currentRow = $treeItem['row'];
            $this->MP = false;

            // recognize mount points
            if ($currentRow['doktype'] === PageRepository::DOKTYPE_MOUNTPOINT) {
                // NOTE(review): the result is indexed with [0] below - confirm that getPage() returns a list of rows here.
                $mountpage = $this->pageRepository->getPage($currentRow['uid']);
                $mountPid = $mountpage[0]['mount_pid'];

                // fetch mounted pages
                $this->MP = $mountPid . '-' . $currentRow['uid'];

                $mountTree = GeneralUtility::makeInstance(PageTreeView::class);
                $mountTree->init('AND ' . $permsClause);
                $mountTree->getTree($mountPid, $depth);

                foreach ($mountTree->tree as $mountItem) {
                    $mountedTitle = $mountItem['HTML'] . BackendUtility::getRecordTitle('pages', $mountItem['row'], true);
                    $rowsHtml .= $this->drawURLs_addRowsForPage($mountItem['row'], $mountedTitle);
                }

                if ($mountpage[0]['mount_pid_ol']) {
                    // replace page when mount_pid_ol is enabled
                    $currentRow['uid'] = $mountPid;
                } else {
                    // if the mount_pid_ol is not set the MP must not be used for the mountpoint page
                    $this->MP = false;
                }
            }

            $pageTitle = $treeItem['HTML'] . BackendUtility::getRecordTitle('pages', $currentRow, true);
            $rowsHtml .= $this->drawURLs_addRowsForPage($currentRow, $pageTitle);
        }

        return $rowsHtml;
    }
1144
1145
    /**
     * Expands exclude string
     *
     * The string is a comma separated list of "pid" or "pid+depth" entries. A plain
     * "pid" excludes that page only; "pid+depth" additionally excludes the pages the
     * page tree returns below it for that depth. A "+" without a usable depth
     * ("pid+" or "pid+0") is treated as depth 99.
     *
     * Results are memoised per exclude string, and fetched trees per pid and depth,
     * in static caches.
     *
     * @param string $excludeString Exclude string
     * @return array List of unique page ids (integers) that are excluded
     */
    public function expandExcludeString($excludeString)
    {
        // internal static caches;
        static $expandedExcludeStringCache;
        static $treeCache;

        // A missing exclude setting arrives as null; use a proper string as cache key.
        $cacheKey = (string) $excludeString;

        if (empty($expandedExcludeStringCache[$cacheKey])) {
            $pidList = [];

            if (! empty($excludeString)) {
                /** @var PageTreeView $tree */
                $tree = GeneralUtility::makeInstance(PageTreeView::class);
                $tree->init('AND ' . $this->getBackendUser()->getPagePermsClause(Permission::PAGE_SHOW));

                $excludeParts = GeneralUtility::trimExplode(',', $excludeString);

                foreach ($excludeParts as $excludePart) {
                    // "pid" entries have no second part, so do not destructure a fixed-size list.
                    $pidAndDepth = GeneralUtility::trimExplode('+', $excludePart);
                    $pid = (int) ($pidAndDepth[0] ?? 0);
                    $depth = $pidAndDepth[1] ?? '';

                    // default is "page only" = "depth=0"
                    if (empty($depth)) {
                        $depth = strpos($excludePart, '+') !== false ? 99 : 0;
                    }
                    // Work with integers from here on: the tree API expects them and a
                    // string/int comparison would behave differently across PHP versions.
                    $depth = (int) $depth;

                    $pidList[] = $pid;

                    if ($depth > 0) {
                        if (empty($treeCache[$pid][$depth])) {
                            $tree->reset();
                            $tree->getTree($pid, $depth);
                            $treeCache[$pid][$depth] = $tree->tree;
                        }

                        foreach ($treeCache[$pid][$depth] as $data) {
                            $pidList[] = (int) $data['row']['uid'];
                        }
                    }
                }
            }

            $expandedExcludeStringCache[$cacheKey] = array_unique($pidList);
        }

        return $expandedExcludeStringCache[$cacheKey];
    }
1196
1197
    /**
     * Create the rows for display of the page tree
     * For each page a number of rows are shown displaying GET variable configuration
     *
     * One table row is produced per crawler configuration that applies to the page.
     * The page title cell is only emitted in the first row and spans all rows. When
     * no configuration applies, a single "No entries" row is returned.
     *
     * @param array $pageRow Page record the rows are built for
     * @param string $pageTitle Markup for the title cell; it is inserted without further escaping
     * @return string Table rows for the page
     */
    public function drawURLs_addRowsForPage(array $pageRow, string $pageTitle): string
    {
        $skipMessage = '';

        // Get list of configurations and drop those not part of the incoming selection
        $configurations = ConfigurationService::removeDisallowedConfigurations(
            $this->incomingConfigurationSelection,
            $this->getUrlsForPageRow($pageRow, $skipMessage)
        );

        $content = '';
        if (! empty($configurations)) {
            // Traverse parameter combinations:
            $rowNumber = 0;
            foreach ($configurations as $confKey => $confArray) {
                // Title column: only in the first row, spanning all configuration rows
                $titleClm = $rowNumber === 0
                    ? '<td rowspan="' . count($configurations) . '">' . $pageTitle . '</td>'
                    : '';

                $excludedPids = $this->expandExcludeString($confArray['subCfg']['exclude']);
                if (in_array($pageRow['uid'], $excludedPids, true)) {
                    $content .= '<tr>
                            ' . $titleClm . '
                            <td>' . htmlspecialchars($confKey) . '</td>
                            <td colspan="5"><em>No entries</em> (Page is excluded in this configuration)</td>
                        </tr>';
                } else {
                    // URL list:
                    $urlList = $this->urlListFromUrlArray(
                        $confArray,
                        $pageRow,
                        $this->scheduledTime,
                        $this->reqMinute,
                        $this->submitCrawlUrls,
                        $this->downloadCrawlUrls,
                        $this->duplicateTrack,
                        $this->downloadUrls,
                        // if empty the urls won't be filtered by processing instructions
                        $this->incomingProcInstructions
                    );

                    // Expanded parameters:
                    $paramRows = '';
                    $valueCounts = [];
                    $combinations = 1;
                    foreach ($confArray['paramExpanded'] as $gVar => $gVal) {
                        $valueCount = count($gVal);
                        $paramRows .= '
                            <tr>
                                <td>' . htmlspecialchars('&' . $gVar . '=') . '<br/>' .
                            '(' . $valueCount . ')' .
                            '</td>
                                <td nowrap="nowrap">' . nl2br(htmlspecialchars(implode(chr(10), $gVal))) . '</td>
                            </tr>
                        ';
                        $combinations *= $valueCount;
                        $valueCounts[] = $valueCount;
                    }
                    $paramExpanded = '<table>' . $paramRows . '</table>'
                        . 'Comb: ' . implode('*', $valueCounts) . '=' . $combinations;

                    // Options
                    $optionValues = '';
                    if ($confArray['subCfg']['userGroups']) {
                        $optionValues .= 'User Groups: ' . $confArray['subCfg']['userGroups'] . '<br/>';
                    }
                    if ($confArray['subCfg']['procInstrFilter']) {
                        $optionValues .= 'ProcInstr: ' . $confArray['subCfg']['procInstrFilter'] . '<br/>';
                    }

                    // Parsed parameters, one "&name=value" pair per line
                    $parsedParameters = GeneralUtility::implodeArrayForUrl('', $confArray['paramParsed']);
                    $parsedParameters = trim(str_replace('&', chr(10) . '&', $parsedParameters));

                    // Compile row:
                    $content .= '
                        <tr>
                            ' . $titleClm . '
                            <td>' . htmlspecialchars($confKey) . '</td>
                            <td>' . nl2br(htmlspecialchars(rawurldecode($parsedParameters))) . '</td>
                            <td>' . $paramExpanded . '</td>
                            <td nowrap="nowrap">' . $urlList . '</td>
                            <td nowrap="nowrap">' . $optionValues . '</td>
                            <td nowrap="nowrap">' . DebugUtility::viewArray($confArray['subCfg']['procInstrParams.']) . '</td>
                        </tr>';
                }

                $rowNumber++;
            }
        } else {
            $message = '';
            if (! empty($skipMessage)) {
                $message = ' (' . $skipMessage . ')';
            }

            // Compile row:
            $content .= '
                <tr>
                    <td>' . $pageTitle . '</td>
                    <td colspan="6"><em>No entries</em>' . $message . '</td>
                </tr>';
        }

        return $content;
    }
1300
1301
    /*****************************
1302
     *
1303
     * CLI functions
1304
     *
1305
     *****************************/
1306
1307
    /**
     * Running the functionality of the CLI (crawling URLs from queue)
     *
     * Cleans up the queue, fetches up to $countInARun records, assigns them to the
     * current process and reads them one by one via readUrl().
     *
     * @param int $countInARun Maximum number of queue records to fetch for this run
     * @param int $sleepTime Pause after each processed record, in microseconds (passed to usleep())
     * @param int $sleepAfterFinish Pause after the batch, in seconds (passed to sleep())
     * @return int Bitmask of the readUrl() results combined with CLI_STATUS_PROCESSED and/or CLI_STATUS_ABORTED
     */
    public function CLI_run(int $countInARun, int $sleepTime, int $sleepAfterFinish): int
    {
        $status = 0;
        $processedItems = 0;

        // Clean up the queue
        $this->queueRepository->cleanupQueue();

        // Select entries:
        $queueItems = $this->queueRepository->fetchRecordsToBeCrawled($countInARun);
        if (empty($queueItems)) {
            return $status;
        }

        $queueIds = [];
        foreach ($queueItems as $queueItem) {
            $queueIds[] = $queueItem['qid'];
        }

        $processId = $this->CLI_buildProcessId();

        // save the number of assigned queue entries to determine how many have been processed later
        $assignedItems = $this->queueRepository->updateProcessIdAndSchedulerForQueueIds($queueIds, $processId);
        $this->processRepository->updateProcessAssignItemsCount($assignedItems, $processId);

        // Abort when not every fetched entry could be assigned to this process.
        if ($assignedItems !== count($queueIds)) {
            return $status | self::CLI_STATUS_ABORTED;
        }

        foreach ($queueItems as $queueItem) {
            $status |= $this->readUrl($queueItem['qid']);
            $processedItems++;

            // Just to relax the system
            usleep($sleepTime);

            // if during the start and the current read url the cli has been disabled we need to return
            // from the function and mark the process NOT as ended.
            if ($this->crawler->isDisabled()) {
                return $status | self::CLI_STATUS_ABORTED;
            }

            if (! $this->processRepository->isProcessActive($this->CLI_buildProcessId())) {
                // possible timeout
                $this->CLI_debug('conflict / timeout (' . $this->CLI_buildProcessId() . ')');
                $status |= self::CLI_STATUS_ABORTED;
                break;
            }
        }

        sleep($sleepAfterFinish);

        if ($processedItems > 0) {
            $status |= self::CLI_STATUS_PROCESSED;
        }

        return $status;
    }
1368
1369
    /**
     * Try to acquire a new process with the given id
     * also performs some auto-cleanup for orphan processes
     *
     * Active processes whose ttl lies in the past count as orphans and are released;
     * all others count against the configured process limit. A new process row is
     * only inserted while the limit has not been reached.
     *
     * @param string $id identification string for the process
     * @return boolean FALSE when no system process id is available or the process limit is reached
     * @todo preemption might not be the most elegant way to clean up
     */
    public function CLI_checkAndAcquireNewProcess($id)
    {
        $systemProcessId = getmypid();
        if (! $systemProcessId) {
            return false;
        }

        $runningProcesses = 0;
        $orphanProcessIds = [];

        $activeProcesses = $this->processRepository->findAllActive();
        $now = $this->getCurrentTime();

        /** @var Process $activeProcess */
        foreach ($activeProcesses as $activeProcess) {
            if ($activeProcess->getTtl() >= $now) {
                $runningProcesses++;
                continue;
            }

            $orphanProcessIds[] = $activeProcess->getProcessId();
        }

        // if there are less than allowed active processes then add a new one
        $acquired = $runningProcesses < (int) $this->extensionSettings['processLimit'];
        if ($acquired) {
            $processRecord = [
                'process_id' => $id,
                'active' => 1,
                'ttl' => $now + (int) $this->extensionSettings['processMaxRunTime'],
                'system_process_id' => $systemProcessId,
            ];
            GeneralUtility::makeInstance(ConnectionPool::class)
                ->getConnectionForTable('tx_crawler_process')
                ->insert('tx_crawler_process', $processRecord);
        }

        // Clean-up runs regardless of whether a new process could be acquired.
        $this->processRepository->deleteProcessesMarkedAsDeleted();
        $this->CLI_releaseProcesses($orphanProcessIds);

        return $acquired;
    }
1420
1421
    /**
     * Release one or more processes and the resources bound to them.
     *
     * A non-array argument is wrapped into a single-element array. When the resulting
     * list is empty nothing is touched and false is returned.
     *
     * @param mixed $releaseIds string with a single process-id or array with multiple process-ids
     * @return bool false when there was nothing to release, true otherwise
     */
    public function CLI_releaseProcesses($releaseIds)
    {
        $queryBuilder = GeneralUtility::makeInstance(ConnectionPool::class)
            ->getQueryBuilderForTable($this->tableName);

        $releaseIds = is_array($releaseIds) ? $releaseIds : [$releaseIds];

        if ($releaseIds === []) {
            // nothing to release
            return false;
        }

        // Some kind of 2nd chance algorithm - this way you need at least 2 processes to have a real cleanup.
        // This ensures that a single process can't mess up the entire process table.

        // Step 1: detach queue entries that still point at a process which is no longer active.
        $queryBuilder
            ->update($this->tableName, 'q')
            ->where(
                'q.process_id IN(SELECT p.process_id FROM tx_crawler_process as p WHERE p.active = 0)'
            )
            ->set('q.process_scheduled', 0)
            ->set('q.process_id', '')
            ->execute();

        // FIXME: Not entirely sure that this is equivalent to the previous version
        // The same builder is reused for the next statement, so the SET part of step 1 is dropped first.
        $queryBuilder->resetQueryPart('set');

        // Step 2: clear the system process id of inactive processes that are referenced by
        // queue entries with exec_time = 0.
        // NOTE(review): the former comment described this step as "mark all processes as deleted
        // which have no waiting queue-entries and which are not active" - that does not match
        // the query below; confirm the intended behaviour.
        $queryBuilder
            ->update('tx_crawler_process')
            ->where(
                $queryBuilder->expr()->eq('active', 0),
                'process_id IN(SELECT q.process_id FROM tx_crawler_queue as q WHERE q.exec_time = 0)'
            )
            ->set('system_process_id', 0)
            ->execute();

        // Step 3: flag the requested processes as inactive and free their queue entries.
        $this->processRepository->markRequestedProcessesAsNotActive($releaseIds);
        $this->queueRepository->unsetProcessScheduledAndProcessIdForQueueEntries($releaseIds);

        return true;
    }
1471
1472
    /**
     * Returns the id of the current process, creating it on first use.
     *
     * The id is derived from the current microtime and stored in $this->processID,
     * so subsequent calls on the same instance return the same value.
     *
     * @return string the ID
     */
    public function CLI_buildProcessId()
    {
        if (! $this->processID) {
            // microtime(true) returns a float. This file runs with strict_types=1, so the value is
            // cast explicitly instead of relying on the hash helper to coerce it to a string.
            $this->processID = GeneralUtility::shortMD5((string) microtime(true));
        }
        return $this->processID;
    }
1484
1485
    /**
     * Builds an md5 hash of the serialized configuration array.
     *
     * The 'paramExpanded' and 'URLs' entries are removed before hashing and therefore
     * do not influence the result.
     *
     * @return string
     */
    protected function getConfigurationHash(array $configuration)
    {
        unset($configuration['paramExpanded'], $configuration['URLs']);

        $serializedConfiguration = serialize($configuration);

        return md5($serializedConfiguration);
    }
1496
1497 1
    /**
     * Puts the values stored at index 1 and index 2 into ascending order.
     *
     * @param array $reg array whose entries at index 1 and 2 are compared
     * @return array the array with entries 1 and 2 exchanged when the first was larger, otherwise unchanged
     */
    protected function swapIfFirstIsLargerThanSecond(array $reg): array
    {
        // Exchange both entries in place when the first is larger than the second
        if ($reg[1] > $reg[2]) {
            [$reg[1], $reg[2]] = [$reg[2], $reg[1]];
        }

        return $reg;
    }
1508
1509 5
    /**
     * Returns the value currently stored in $this->maximumUrlsToCompile.
     */
    private function getMaximumUrlsToCompile(): int
    {
        return $this->maximumUrlsToCompile;
    }
1513
1514
    /**
     * Returns the backend user held by this controller.
     *
     * Backend authentication is initialized on every call; the property is filled from
     * $GLOBALS['BE_USER'] only while it is still null.
     *
     * @return BackendUserAuthentication
     */
    private function getBackendUser()
    {
        // Make sure the _cli_ user is loaded
        Bootstrap::initializeBackendAuthentication();

        if (null === $this->backendUser) {
            $this->backendUser = $GLOBALS['BE_USER'];
        }

        return $this->backendUser;
    }
1526
1527
    /**
     * Obtains a query builder for the given table from the connection pool.
     *
     * @return QueryBuilder
     */
    private function getQueryBuilder(string $table)
    {
        $connectionPool = GeneralUtility::makeInstance(ConnectionPool::class);

        return $connectionPool->getQueryBuilderForTable($table);
    }
1536
}
1537