Passed
Push — wip/remove-deprecations-for-v1... ( 107830...7e6bff )
by Tomas Norre
05:46
created

CrawlerController::expandParameters()   F

Complexity

Conditions 25
Paths 831

Size

Total Lines 129
Code Lines 74

Duplication

Lines 0
Ratio 0 %

Code Coverage

Tests 57
CRAP Score 28.2868

Importance

Changes 1
Bugs 0 Features 0
Metric Value
cc 25
eloc 74
c 1
b 0
f 0
nc 831
nop 2
dl 0
loc 129
ccs 57
cts 69
cp 0.8261
crap 28.2868
rs 0.2347

How to fix   Long Method    Complexity   

Long Method

Small methods make your code easier to understand, in particular if combined with a good name. Besides, if your method is small, finding a good name is usually much easier.

For example, if you find yourself adding comments to a method's body, this is usually a good sign to extract the commented part to a new method, and use the comment as a starting point when coming up with a good name for this new method.

Commonly applied refactorings include:

1
<?php
2
3
declare(strict_types=1);
4
5
namespace AOE\Crawler\Controller;
6
7
/***************************************************************
8
 *  Copyright notice
9
 *
10
 *  (c) 2020 AOE GmbH <[email protected]>
11
 *
12
 *  All rights reserved
13
 *
14
 *  This script is part of the TYPO3 project. The TYPO3 project is
15
 *  free software; you can redistribute it and/or modify
16
 *  it under the terms of the GNU General Public License as published by
17
 *  the Free Software Foundation; either version 3 of the License, or
18
 *  (at your option) any later version.
19
 *
20
 *  The GNU General Public License can be found at
21
 *  http://www.gnu.org/copyleft/gpl.html.
22
 *
23
 *  This script is distributed in the hope that it will be useful,
24
 *  but WITHOUT ANY WARRANTY; without even the implied warranty of
25
 *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
26
 *  GNU General Public License for more details.
27
 *
28
 *  This copyright notice MUST APPEAR in all copies of the script!
29
 ***************************************************************/
30
31
use AOE\Crawler\Configuration\ExtensionConfigurationProvider;
32
use AOE\Crawler\Converter\JsonCompatibilityConverter;
33
use AOE\Crawler\Crawler;
34
use AOE\Crawler\CrawlStrategy\CrawlStrategyFactory;
35
use AOE\Crawler\Domain\Model\Process;
36
use AOE\Crawler\Domain\Repository\ConfigurationRepository;
37
use AOE\Crawler\Domain\Repository\ProcessRepository;
38
use AOE\Crawler\Domain\Repository\QueueRepository;
39
use AOE\Crawler\QueueExecutor;
40
use AOE\Crawler\Service\ConfigurationService;
41
use AOE\Crawler\Service\UrlService;
42
use AOE\Crawler\Service\UserService;
43
use PDO;
44
use Psr\Log\LoggerAwareInterface;
45
use Psr\Log\LoggerAwareTrait;
46
use TYPO3\CMS\Backend\Tree\View\PageTreeView;
47
use TYPO3\CMS\Backend\Utility\BackendUtility;
48
use TYPO3\CMS\Core\Authentication\BackendUserAuthentication;
49
use TYPO3\CMS\Core\Compatibility\PublicMethodDeprecationTrait;
50
use TYPO3\CMS\Core\Compatibility\PublicPropertyDeprecationTrait;
51
use TYPO3\CMS\Core\Core\Bootstrap;
52
use TYPO3\CMS\Core\Database\Connection;
53
use TYPO3\CMS\Core\Database\ConnectionPool;
54
use TYPO3\CMS\Core\Database\Query\QueryBuilder;
55
use TYPO3\CMS\Core\Database\Query\Restriction\DeletedRestriction;
56
use TYPO3\CMS\Core\Database\QueryGenerator;
57
use TYPO3\CMS\Core\Domain\Repository\PageRepository;
58
use TYPO3\CMS\Core\Imaging\Icon;
59
use TYPO3\CMS\Core\Imaging\IconFactory;
60
use TYPO3\CMS\Core\Type\Bitmask\Permission;
61
use TYPO3\CMS\Core\TypoScript\Parser\TypoScriptParser;
62
use TYPO3\CMS\Core\Utility\DebugUtility;
63
use TYPO3\CMS\Core\Utility\GeneralUtility;
64
use TYPO3\CMS\Core\Utility\MathUtility;
65
use TYPO3\CMS\Extbase\Object\ObjectManager;
66
67
/**
68
 * Class CrawlerController
69
 *
70
 * @package AOE\Crawler\Controller
71
 */
72
class CrawlerController implements LoggerAwareInterface
73
{
74
    use LoggerAwareTrait;
75
    use PublicMethodDeprecationTrait;
0 ignored issues
show
Bug introduced by
The trait TYPO3\CMS\Core\Compatibi...cMethodDeprecationTrait requires the property $deprecatedPublicMethods which is not provided by AOE\Crawler\Controller\CrawlerController.
Loading history...
76
    use PublicPropertyDeprecationTrait;
0 ignored issues
show
Bug introduced by
The trait TYPO3\CMS\Core\Compatibi...ropertyDeprecationTrait requires the property $deprecatedPublicProperties which is not provided by AOE\Crawler\Controller\CrawlerController.
Loading history...
77
78
    public const CLI_STATUS_NOTHING_PROCCESSED = 0;

    // Queue is not empty
    public const CLI_STATUS_REMAIN = 1;

    // (Some) queue items were processed
    public const CLI_STATUS_PROCESSED = 2;

    // Instance didn't finish
    public const CLI_STATUS_ABORTED = 4;

    public const CLI_STATUS_POLLABLE_PROCESSED = 8;

    /**
     * @var int
     */
    public $setID = 0;

    /**
     * @var string
     */
    public $processID = '';

    /**
     * @var array
     */
    public $duplicateTrack = [];

    /**
     * @var array
     */
    public $downloadUrls = [];

    /**
     * @var array
     */
    public $incomingProcInstructions = [];

    /**
     * @var array
     */
    public $incomingConfigurationSelection = [];

    /**
     * @var bool
     */
    public $registerQueueEntriesInternallyOnly = false;

    /**
     * @var array
     */
    public $queueEntries = [];

    /**
     * @var array
     */
    public $urlList = [];

    /**
     * @var array
     */
    public $extensionSettings = [];

    /**
     * Mount Point
     *
     * NOTE(review): defaults to false, but this class also splits the value on "-"
     * and concatenates it into an "&MP=" query string, so it is presumably a string
     * when set - confirm and tighten the type.
     *
     * @var bool|string
     */
    public $MP = false;

    /**
     * @var QueueRepository
     */
    protected $queueRepository;

    /**
     * @var ProcessRepository
     */
    protected $processRepository;

    /**
     * @var ConfigurationRepository
     */
    protected $configurationRepository;

    /**
     * @var string
     */
    protected $tableName = 'tx_crawler_queue';

    /**
     * @var QueueExecutor
     */
    protected $queueExecutor;

    /**
     * @var int
     */
    protected $maximumUrlsToCompile = 10000;

    /**
     * @var IconFactory
     */
    protected $iconFactory;

    /**
     * Expected by PublicMethodDeprecationTrait; empty because no method is
     * currently routed through the deprecation layer.
     *
     * @var array
     */
    private $deprecatedPublicMethods = [];

    /**
     * Expected by PublicPropertyDeprecationTrait; empty because no property is
     * currently routed through the deprecation layer.
     *
     * @var array
     */
    private $deprecatedPublicProperties = [];

    /**
     * @var BackendUserAuthentication|null
     */
    private $backendUser;

    /**
     * @var int
     */
    private $scheduledTime = 0;

    /**
     * @var int
     */
    private $reqMinute = 0;

    /**
     * @var bool
     */
    private $submitCrawlUrls = false;

    /**
     * @var bool
     */
    private $downloadCrawlUrls = false;

    /**
     * @var PageRepository
     */
    private $pageRepository;

    /**
     * @var Crawler
     */
    private $crawler;
218
219
    /************************************
220
     *
221
     * Getting URLs based on Page TSconfig
222
     *
223
     ************************************/
224
225 19
    /**
     * Instantiates the repositories and services used by the controller and
     * normalises the extension settings.
     *
     * Settings that are absent fall back to their defaults instead of raising
     * "undefined index" warnings: countInARun becomes 100, processLimit becomes 1
     * and the URL compile limit becomes 10000.
     */
    public function __construct()
    {
        $objectManager = GeneralUtility::makeInstance(ObjectManager::class);
        $crawlStrategyFactory = GeneralUtility::makeInstance(CrawlStrategyFactory::class);
        $this->queueRepository = $objectManager->get(QueueRepository::class);
        $this->processRepository = $objectManager->get(ProcessRepository::class);
        $this->configurationRepository = $objectManager->get(ConfigurationRepository::class);
        $this->pageRepository = GeneralUtility::makeInstance(PageRepository::class);
        $this->queueExecutor = GeneralUtility::makeInstance(QueueExecutor::class, $crawlStrategyFactory);
        $this->iconFactory = GeneralUtility::makeInstance(IconFactory::class);
        $this->crawler = GeneralUtility::makeInstance(Crawler::class);

        /** @var ExtensionConfigurationProvider $configurationProvider */
        $configurationProvider = GeneralUtility::makeInstance(ExtensionConfigurationProvider::class);
        $settings = $configurationProvider->getExtensionConfiguration();
        // Defensive: keep the property an array even if the provider misbehaves.
        $this->extensionSettings = is_array($settings) ? $settings : [];

        // A missing, zero or non-numeric countInARun is replaced by the default of 100.
        if (MathUtility::convertToPositiveInteger($this->extensionSettings['countInARun'] ?? 0) === 0) {
            $this->extensionSettings['countInARun'] = 100;
        }

        $this->extensionSettings['processLimit'] = MathUtility::forceIntegerInRange(
            $this->extensionSettings['processLimit'] ?? 1,
            1,
            99,
            1
        );
        $this->setMaximumUrlsToCompile(
            MathUtility::forceIntegerInRange(
                $this->extensionSettings['maxCompileUrls'] ?? 10000,
                1,
                1000000000,
                10000
            )
        );
    }
250
251 23
    /**
     * Stores the maximum number of URLs to compile.
     *
     * @param int $maximumUrlsToCompile New limit
     */
    public function setMaximumUrlsToCompile(int $maximumUrlsToCompile): void
    {
        $this->maximumUrlsToCompile = $maximumUrlsToCompile;
    }
255
256
    /**
     * Replaces the extension settings wholesale
     * (unserialized pendant of $TYPO3_CONF_VARS['EXT']['extConf']['crawler']).
     *
     * The given array is stored as-is; the defaults applied by the constructor
     * are not re-applied here.
     *
     * @param array $extensionSettings Complete set of settings
     */
    public function setExtensionSettings(array $extensionSettings): void
    {
        $this->extensionSettings = $extensionSettings;
    }
263
264
    /**
     * Check if the given page should be skipped by the crawler.
     *
     * A page is skipped when it is hidden (unless the "crawlHiddenPages" setting
     * is enabled), when its doktype is 3, 4, 199, 254 or 255, when its doktype is
     * listed in an "excludeDoktype" registration, or when a "pageVeto" hook
     * returns anything other than false.
     *
     * @param array $pageRow Page record; the "hidden" and "doktype" columns are read
     * @return false|string false if the page should be crawled (not excluded), otherwise the skip message
     */
    public function checkIfPageShouldBeSkipped(array $pageRow)
    {
        $crawlHiddenPages = $this->extensionSettings['crawlHiddenPages'] ?? false;
        $doktype = $pageRow['doktype'] ?? '';

        // if page is hidden
        if (! $crawlHiddenPages && ($pageRow['hidden'] ?? false)) {
            return 'Because page is hidden';
        }

        if (GeneralUtility::inList('3,4,199,254,255', $doktype)) {
            return 'Because doktype is not allowed';
        }

        foreach ($GLOBALS['TYPO3_CONF_VARS']['EXTCONF']['crawler']['excludeDoktype'] ?? [] as $key => $doktypeList) {
            if (GeneralUtility::inList($doktypeList, $doktype)) {
                return 'Doktype was excluded by "' . $key . '"';
            }
        }

        // veto hook
        foreach ($GLOBALS['TYPO3_CONF_VARS']['EXTCONF']['crawler']['pageVeto'] ?? [] as $key => $func) {
            $params = [
                'pageRow' => $pageRow,
            ];
            // expects "false" if page is ok and "true" or a skipMessage if this page should _not_ be crawled
            $veto = GeneralUtility::callUserFunction($func, $params, $this);
            if ($veto === false) {
                continue;
            }
            if (is_string($veto)) {
                return $veto;
            }
            // Hooks registered with "[] =" have integer keys; under strict_types
            // htmlspecialchars() would throw a TypeError without the cast.
            return 'Veto from hook "' . htmlspecialchars((string) $key) . '"';
        }

        return false;
    }
303
304
    /**
     * Wrapper around getUrlsForPageId() that first validates the page row.
     * It returns an array of configurations and no urls!
     *
     * @param array $pageRow Page record with at least dok-type and uid columns.
     * @param string $skipMessage Set by reference: the reason the page was skipped, or '' if it was not
     * @return array Configurations for the page, or an empty array when the page is skipped
     * @see getUrlsForPageId()
     */
    public function getUrlsForPageRow(array $pageRow, &$skipMessage = '')
    {
        // Only a real integer uid is accepted.
        if (! is_int($pageRow['uid'])) {
            $skipMessage = 'PageUid ' . $pageRow['uid'] . ' was not an integer';

            return [];
        }

        $skipReason = $this->checkIfPageShouldBeSkipped($pageRow);
        if ($skipReason !== false) {
            $skipMessage = $skipReason;

            return [];
        }

        $configurations = $this->getUrlsForPageId($pageRow['uid']);
        $skipMessage = '';

        return $configurations;
    }
331
332
    /**
     * Creates a list of URLs from input array (and submits them to queue if asked for)
     * See Web > Info module script + "indexed_search"'s crawler hook-client using this!
     *
     * @param array $vv Information about URLs from pageRow to crawl.
     * @param array $pageRow Page row
     * @param int $scheduledTime Unix time to schedule indexing to, typically time()
     * @param int $reqMinute Number of requests per minute (creates the interleave between requests); values <= 0 disable the interleave
     * @param bool $submitCrawlUrls If set, submits the URLs to queue
     * @param bool $downloadCrawlUrls If set (and $submitCrawlUrls is false) will fill $downloadUrls with entries
     * @param array $duplicateTrack Array which is passed by reference and contains an id per url to ensure we will not crawl duplicates
     * @param array $downloadUrls Array which will be filled with URLs for download if flag is set.
     * @param array $incomingProcInstructions Array of processing instructions
     * @return string List of URLs (meant for display in backend module)
     */
    public function urlListFromUrlArray(
        array $vv,
        array $pageRow,
        $scheduledTime,
        $reqMinute,
        $submitCrawlUrls,
        $downloadCrawlUrls,
        array &$duplicateTrack,
        array &$downloadUrls,
        array $incomingProcInstructions
    ) {
        if (! is_array($vv['URLs'] ?? null)) {
            return 'ERROR - no URL generated';
        }
        $urlLog = [];
        $pageId = (int) $pageRow['uid'];
        $configurationHash = $this->getConfigurationHash($vv);
        $skipInnerCheck = $this->queueRepository->noUnprocessedQueueEntriesForPageWithConfigurationHashExist($pageId, $configurationHash);

        $urlService = new UrlService();

        // Both values may be absent from the sub configuration; treat them as empty.
        $procInstrFilter = $vv['subCfg']['procInstrFilter'] ?? '';
        $userGroups = $vv['subCfg']['userGroups'] ?? '';

        // The filter depends only on the configuration, not on the individual URL,
        // so it is evaluated once: a rejected configuration yields no URLs at all.
        if (! $this->drawURLs_PIfilter($procInstrFilter, $incomingProcInstructions)) {
            return implode('<br>', $urlLog);
        }

        // Seconds between two requests; guarded so that $reqMinute = 0 cannot
        // cause a division by zero.
        $secondsPerRequest = $reqMinute > 0 ? 60 / $reqMinute : 0;

        foreach ($vv['URLs'] as $urlQuery) {
            $url = (string) $urlService->getUrlFromPageAndQueryParameters(
                $pageId,
                $urlQuery,
                $vv['subCfg']['baseUrl'] ?? null,
                $vv['subCfg']['force_ssl'] ?? 0
            );

            // Create key by which to determine unique-ness:
            $uKey = $url . '|' . $userGroups . '|' . $procInstrFilter;

            if (isset($duplicateTrack[$uKey])) {
                // If the url key is already registered just display it and do not resubmit it
                $urlLog[] = '<em><span class="text-muted">' . htmlspecialchars($url) . '</span></em>';
            } else {
                // Scheduled time, rounded down to the full minute:
                $schTime = $scheduledTime + round(count($duplicateTrack) * $secondsPerRequest);
                $schTime = intval($schTime / 60) * 60;
                $formattedDate = BackendUtility::datetime($schTime);
                $this->urlList[] = '[' . $formattedDate . '] ' . $url;
                $urlList = '[' . $formattedDate . '] ' . htmlspecialchars($url);

                // Submit for crawling!
                if ($submitCrawlUrls) {
                    // NOTE(review): the entry is submitted with $scheduledTime, not the
                    // interleaved $schTime shown in the log - confirm this is intended.
                    $added = $this->addUrl(
                        $pageId,
                        $url,
                        $vv['subCfg'],
                        $scheduledTime,
                        $configurationHash,
                        $skipInnerCheck
                    );
                    if ($added === false) {
                        $urlList .= ' (URL already existed)';
                    }
                } elseif ($downloadCrawlUrls) {
                    $downloadUrls[$url] = $url;
                }
                $urlLog[] = $urlList;
            }
            $duplicateTrack[$uKey] = true;
        }

        return implode('<br>', $urlLog);
    }
416
417
    /**
     * Tells whether a processing-instruction list passes the incoming filter.
     *
     * With no incoming instructions everything passes. Otherwise at least one
     * incoming instruction has to be an item of the comma-separated $piString.
     *
     * @param string $piString PI to test
     * @param array $incomingProcInstructions Processing instructions
     * @return boolean
     */
    public function drawURLs_PIfilter($piString, array $incomingProcInstructions)
    {
        // An empty filter accepts everything.
        $accepted = $incomingProcInstructions === [];

        foreach ($incomingProcInstructions as $instruction) {
            if (GeneralUtility::inList($piString, $instruction)) {
                $accepted = true;
                break;
            }
        }

        return $accepted;
    }
437
438 5
    /**
     * Returns the page TSconfig for a page, after the "getPageTSconfigForId"
     * hooks had the chance to alter it.
     *
     * When $this->MP is set, the TSconfig is read for the id found after the
     * first "-" of the MP value instead of for $id.
     *
     * @param int $id Page id
     * @return array Page TSconfig
     */
    public function getPageTSconfigForId($id): array
    {
        if (! $this->MP) {
            $pageTSconfig = BackendUtility::getPagesTSconfig($id);
        } else {
            // $MP is declared as bool, so cast before splitting (required under
            // strict_types); a value without "-" falls back to page id 0 instead
            // of raising an undefined offset.
            $mountPointParts = explode('-', (string) $this->MP);
            $mountPointId = (int) ($mountPointParts[1] ?? 0);
            $pageTSconfig = BackendUtility::getPagesTSconfig($mountPointId);
        }

        // Call a hook to alter configuration
        $hooks = $GLOBALS['TYPO3_CONF_VARS']['EXTCONF']['crawler']['getPageTSconfigForId'] ?? null;
        if (is_array($hooks)) {
            $params = [
                'pageId' => $id,
                // Passed by reference so hooks can modify the configuration in place.
                'pageTSConfig' => &$pageTSconfig,
            ];
            foreach ($hooks as $userFunc) {
                GeneralUtility::callUserFunction($userFunc, $params, $this);
            }
        }

        return $pageTSconfig;
    }
460
461
    /**
     * Collects the crawler configurations that apply to a page.
     * Returns configurations only; nothing is added to the queue.
     *
     * Sources, in this order:
     *  1. Page TSconfig "tx_crawler.crawlerCfg.paramSets."
     *  2. tx_crawler_configuration records from the rootline; a record never
     *     overwrites a TSconfig set of the same name.
     * Afterwards the "processUrls" hooks receive the result by reference.
     *
     * @param int $pageId Page to resolve the configurations for
     * @return array Configurations keyed by name, each with the keys subCfg, paramParsed, paramExpanded, URLs and origin
     */
    public function getUrlsForPageId(int $pageId): array
    {
        // Get page TSconfig for page ID
        $pageTSconfig = $this->getPageTSconfigForId($pageId);

        $res = [];

        // Fetch Crawler Configuration from pageTSconfig
        $crawlerCfg = $pageTSconfig['tx_crawler.']['crawlerCfg.']['paramSets.'] ?? [];
        foreach ($crawlerCfg as $key => $values) {
            if (! is_array($values)) {
                continue;
            }
            $key = str_replace('.', '', $key);
            // Sub configuration for a single configuration string:
            $subCfg = (array) ($crawlerCfg[$key . '.'] ?? []);
            $subCfg['key'] = $key;

            if (strcmp($subCfg['procInstrFilter'] ?? '', '')) {
                $subCfg['procInstrFilter'] = implode(',', GeneralUtility::trimExplode(',', $subCfg['procInstrFilter']));
            }

            // "pidsOnly" is optional; an absent value means "valid for every page".
            $pidsOnly = (string) ($subCfg['pidsOnly'] ?? '');
            // TODO: Check if $pidOnlyList can be kept as Array instead of imploded
            $pidOnlyList = implode(',', GeneralUtility::trimExplode(',', $pidsOnly, true));

            // Skip configurations that are restricted to other pages.
            if ($pidsOnly !== '' && ! GeneralUtility::inList($pidOnlyList, (string) $pageId)) {
                continue;
            }

            // Explode, process etc.:
            $paramParsed = GeneralUtility::explodeUrl2Array($crawlerCfg[$key] ?? '');
            $paramExpanded = $this->expandParameters($paramParsed, $pageId);

            // recognize MP value
            $baseQuery = '?id=' . $pageId;
            if ($this->MP) {
                $baseQuery .= '&MP=' . $this->MP;
            }

            $res[$key] = [
                'subCfg' => $subCfg,
                'paramParsed' => $paramParsed,
                'paramExpanded' => $paramExpanded,
                'origin' => 'pagets',
                'URLs' => $this->compileUrls($paramExpanded, [$baseQuery]),
            ];
        }

        // Get configuration from tx_crawler_configuration records up the rootline
        $crawlerConfigurations = $this->configurationRepository->getCrawlerConfigurationRecordsFromRootLine($pageId);
        foreach ($crawlerConfigurations as $configurationRecord) {
            // check access to the configuration record
            $hasAccess = empty($configurationRecord['begroups'])
                || $this->getBackendUser()->isAdmin()
                || UserService::hasGroupAccess($this->getBackendUser()->user['usergroup_cached_list'], $configurationRecord['begroups']);
            if (! $hasAccess) {
                continue;
            }

            // Cast like the TSconfig branch above: a NULL column would otherwise
            // break strcmp()-style comparisons under strict_types.
            $pidsOnly = (string) ($configurationRecord['pidsonly'] ?? '');
            // TODO: Check if $pidOnlyList can be kept as Array instead of imploded
            $pidOnlyList = implode(',', GeneralUtility::trimExplode(',', $pidsOnly, true));

            // process configuration if it is not page-specific or if the specific page is the current page:
            if ($pidsOnly !== '' && ! GeneralUtility::inList($pidOnlyList, (string) $pageId)) {
                continue;
            }

            $key = $configurationRecord['name'];

            // don't overwrite previously defined paramSets
            if (isset($res[$key])) {
                continue;
            }

            /* @var $TSparserObject TypoScriptParser */
            $TSparserObject = GeneralUtility::makeInstance(TypoScriptParser::class);
            $TSparserObject->parse($configurationRecord['processing_instruction_parameters_ts']);

            $subCfg = [
                'procInstrFilter' => $configurationRecord['processing_instruction_filter'],
                'procInstrParams.' => $TSparserObject->setup,
                'baseUrl' => $configurationRecord['base_url'],
                'force_ssl' => (int) $configurationRecord['force_ssl'],
                'userGroups' => $configurationRecord['fegroups'],
                'exclude' => $configurationRecord['exclude'],
                'key' => $key,
            ];

            // The record may explicitly exclude the current page.
            if (in_array($pageId, $this->expandExcludeString($subCfg['exclude']), true)) {
                continue;
            }

            $paramParsed = GeneralUtility::explodeUrl2Array($configurationRecord['configuration']);
            $paramExpanded = $this->expandParameters($paramParsed, $pageId);

            $res[$key] = [
                'subCfg' => $subCfg,
                'paramParsed' => $paramParsed,
                'paramExpanded' => $paramExpanded,
                'URLs' => $this->compileUrls($paramExpanded, ['?id=' . $pageId]),
                'origin' => 'tx_crawler_configuration_' . $configurationRecord['uid'],
            ];
        }

        foreach ($GLOBALS['TYPO3_CONF_VARS']['EXTCONF']['crawler']['processUrls'] ?? [] as $func) {
            $params = [
                // By reference so that hooks can rewrite the result.
                'res' => &$res,
            ];
            GeneralUtility::callUserFunction($func, $params, $this);
        }

        return $res;
    }
559
560
    /**
     * Lists the names of the crawler configurations found for a page branch.
     *
     * The names come from the page TSconfig paramSets of the root page and from
     * tx_crawler_configuration records stored on the rootline pages of the root
     * page or on pages of the tree below it (down to $depth).
     *
     * TODO: Write Functional Tests
     *
     * @param int $rootid Page id the branch starts at
     * @param int $depth Number of tree levels passed to the page tree
     * @return array Configuration names
     */
    public function getConfigurationsForBranch(int $rootid, int $depth): array
    {
        $names = [];

        // 1) Names of the paramSets from page TSconfig (stored with a trailing dot).
        $tsConfig = $this->getPageTSconfigForId($rootid);
        $paramSets = $tsConfig['tx_crawler.']['crawlerCfg.']['paramSets.'] ?? [];
        foreach ($paramSets as $setName => $setConfiguration) {
            if (is_array($setConfiguration)) {
                if (substr($setName, -1) === '.') {
                    $names[] = substr($setName, 0, -1);
                } else {
                    $names[] = $setName;
                }
            }
        }

        // 2) Page ids of the rootline ...
        $pageIds = [];
        foreach (BackendUtility::BEgetRootLine($rootid) as $rootLinePage) {
            $pageIds[] = $rootLinePage['uid'];
        }

        // ... and of the pages below the root page that the backend user may see.
        /* @var PageTreeView $pageTree */
        $pageTree = GeneralUtility::makeInstance(PageTreeView::class);
        $permissionClause = $this->getBackendUser()->getPagePermsClause(Permission::PAGE_SHOW);
        if (empty($permissionClause)) {
            $pageTree->init('');
        } else {
            $pageTree->init('AND ' . $permissionClause);
        }
        $pageTree->getTree($rootid, $depth, '');
        foreach ($pageTree->tree as $treeNode) {
            $pageIds[] = $treeNode['row']['uid'];
        }

        // 3) Names of the configuration records stored on any of those pages.
        $queryBuilder = $this->getQueryBuilder('tx_crawler_configuration');
        $pidConstraint = $queryBuilder->expr()->in(
            'pid',
            $queryBuilder->createNamedParameter($pageIds, Connection::PARAM_INT_ARRAY)
        );
        $statement = $queryBuilder
            ->select('name')
            ->from('tx_crawler_configuration')
            ->where($pidConstraint)
            ->execute();

        while ($record = $statement->fetch()) {
            $names[] = $record['name'];
        }

        return $names;
    }
603
604
    /**
605
     * Will expand the parameters configuration to individual values. This follows a certain syntax of the value of each parameter.
606
     * Syntax of values:
607
     * - Basically: If the value is wrapped in [...] it will be expanded according to the following syntax, otherwise the value is taken literally
608
     * - Configuration is split by "|" and the parts are processed individually and finally added together
609
     * - For each configuration part:
610
     *         - "[int]-[int]" = Integer range, will be expanded to all values in between, values included, starting from low to high (max. 1000). Example "1-34" or "-40--30"
611
     *         - "_TABLE:[TCA table name];[_PID:[optional page id, default is current page]];[_ENABLELANG:1]" = Look up of table records from PID, filtering out deleted records. Example "_TABLE:tt_content; _PID:123"
612
     *        _ENABLELANG:1 picks only original records without their language overlays
613
     *         - Default: Literal value
614
     *
615
     * @param array $paramArray Array with key (GET var name) and values (value of GET var which is configuration for expansion)
616
     * @param integer $pid Current page ID
617
     * @return array
618
     *
619
     * TODO: Write Functional Tests
620
     */
621 10
    public function expandParameters($paramArray, $pid)
622
    {
623
        // Traverse parameter names:
624 10
        foreach ($paramArray as $p => $v) {
625 10
            $v = trim($v);
626
627
            // If value is encapsulated in square brackets it means there are some ranges of values to find, otherwise the value is literal
628 10
            if (strpos($v, '[') === 0 && substr($v, -1) === ']') {
629
                // So, find the value inside brackets and reset the paramArray value as an array.
630 10
                $v = substr($v, 1, -1);
631 10
                $paramArray[$p] = [];
632
633
                // Explode parts and traverse them:
634 10
                $parts = explode('|', $v);
635 10
                foreach ($parts as $pV) {
636
637
                    // Look for integer range: (fx. 1-34 or -40--30 // reads minus 40 to minus 30)
638 10
                    if (preg_match('/^(-?[0-9]+)\s*-\s*(-?[0-9]+)$/', trim($pV), $reg)) {
639 1
                        $reg = $this->swapIfFirstIsLargerThanSecond($reg);
640
641
                        // Traverse range, add values:
642
                        // Limit to size of range!
643 1
                        $runAwayBrake = 1000;
644 1
                        for ($a = $reg[1]; $a <= $reg[2]; $a++) {
645 1
                            $paramArray[$p][] = $a;
646 1
                            $runAwayBrake--;
647 1
                            if ($runAwayBrake <= 0) {
648
                                break;
649
                            }
650
                        }
651 9
                    } elseif (strpos(trim($pV), '_TABLE:') === 0) {
652
653
                        // Parse parameters:
654 6
                        $subparts = GeneralUtility::trimExplode(';', $pV);
655 6
                        $subpartParams = [];
656 6
                        foreach ($subparts as $spV) {
657 6
                            [$pKey, $pVal] = GeneralUtility::trimExplode(':', $spV);
658 6
                            $subpartParams[$pKey] = $pVal;
659
                        }
660
661
                        // Table exists:
662 6
                        if (isset($GLOBALS['TCA'][$subpartParams['_TABLE']])) {
663 6
                            $lookUpPid = isset($subpartParams['_PID']) ? intval($subpartParams['_PID']) : intval($pid);
664 6
                            $recursiveDepth = isset($subpartParams['_RECURSIVE']) ? intval($subpartParams['_RECURSIVE']) : 0;
665 6
                            $pidField = isset($subpartParams['_PIDFIELD']) ? trim($subpartParams['_PIDFIELD']) : 'pid';
666 6
                            $where = $subpartParams['_WHERE'] ?? '';
667 6
                            $addTable = $subpartParams['_ADDTABLE'] ?? '';
668
669 6
                            $fieldName = $subpartParams['_FIELD'] ? $subpartParams['_FIELD'] : 'uid';
670 6
                            if ($fieldName === 'uid' || $GLOBALS['TCA'][$subpartParams['_TABLE']]['columns'][$fieldName]) {
671 6
                                $queryBuilder = $this->getQueryBuilder($subpartParams['_TABLE']);
672
673 6
                                if ($recursiveDepth > 0) {
674
                                    /** @var QueryGenerator $queryGenerator */
675 2
                                    $queryGenerator = GeneralUtility::makeInstance(QueryGenerator::class);
676 2
                                    $pidList = $queryGenerator->getTreeList($lookUpPid, $recursiveDepth, 0, 1);
677 2
                                    $pidArray = GeneralUtility::intExplode(',', $pidList);
678
                                } else {
679 4
                                    $pidArray = [(string) $lookUpPid];
680
                                }
681
682 6
                                $queryBuilder->getRestrictions()
683 6
                                    ->removeAll()
684 6
                                    ->add(GeneralUtility::makeInstance(DeletedRestriction::class));
685
686
                                $queryBuilder
687 6
                                    ->select($fieldName)
688 6
                                    ->from($subpartParams['_TABLE'])
689 6
                                    ->where(
690 6
                                        $queryBuilder->expr()->in($pidField, $queryBuilder->createNamedParameter($pidArray, Connection::PARAM_INT_ARRAY)),
691
                                        $where
692
                                    );
693
694 6
                                if (! empty($addTable)) {
695
                                    // TODO: Check if this works as intended!
696
                                    $queryBuilder->add('from', $addTable);
697
                                }
698 6
                                $transOrigPointerField = $GLOBALS['TCA'][$subpartParams['_TABLE']]['ctrl']['transOrigPointerField'];
699
700 6
                                if ($subpartParams['_ENABLELANG'] && $transOrigPointerField) {
701
                                    $queryBuilder->andWhere(
702
                                        $queryBuilder->expr()->lte(
703
                                            $transOrigPointerField,
704
                                            0
705
                                        )
706
                                    );
707
                                }
708
709 6
                                $statement = $queryBuilder->execute();
710
711 6
                                $rows = [];
712 6
                                while ($row = $statement->fetch()) {
713 6
                                    $rows[$row[$fieldName]] = $row;
714
                                }
715
716 6
                                if (is_array($rows)) {
717 6
                                    $paramArray[$p] = array_merge($paramArray[$p], array_keys($rows));
718
                                }
719
                            }
720
                        }
721
                    } else {
722
                        // Just add value:
723 3
                        $paramArray[$p][] = $pV;
724
                    }
725
                    // Hook for processing own expandParameters place holder
726 10
                    if (is_array($GLOBALS['TYPO3_CONF_VARS']['SC_OPTIONS']['crawler/class.tx_crawler_lib.php']['expandParameters'])) {
727
                        $_params = [
728
                            'pObj' => &$this,
729
                            'paramArray' => &$paramArray,
730
                            'currentKey' => $p,
731
                            'currentValue' => $pV,
732
                            'pid' => $pid,
733
                        ];
734
                        foreach ($GLOBALS['TYPO3_CONF_VARS']['SC_OPTIONS']['crawler/class.tx_crawler_lib.php']['expandParameters'] as $_funcRef) {
735
                            GeneralUtility::callUserFunction($_funcRef, $_params, $this);
736
                        }
737
                    }
738
                }
739
740
                // Make unique set of values and sort array by key:
741 10
                $paramArray[$p] = array_unique($paramArray[$p]);
742 10
                ksort($paramArray);
743
            } else {
744
                // Set the literal value as only value in array:
745 3
                $paramArray[$p] = [$v];
746
            }
747
        }
748
749 10
        return $paramArray;
750
    }
751
752
    /**
     * Compiles URLs from a parameter array (the output of expandParameters()).
     *
     * Every URL in $urls is combined with every value of the first parameter;
     * the remaining parameters are applied by recursion. The number of URLs is
     * therefore the product of the value counts of all keys, capped on each
     * level by getMaximumUrlsToCompile().
     *
     * @param array $paramArray Output of expandParameters(): Array with keys (GET var names) and for each an array of values
     * @param array $urls URLs accumulated in this array (for recursion)
     * @return array
     */
    public function compileUrls($paramArray, array $urls)
    {
        if (empty($paramArray)) {
            return $urls;
        }

        // array_shift() always removes the FIRST element, so make sure key() looks at it too.
        reset($paramArray);
        // PHP stores numeric-looking keys as integers; rawurlencode() needs a string under strict_types.
        $varName = (string) key($paramArray);
        $valueSet = array_shift($paramArray);

        $encodedVarName = rawurlencode($varName);
        // NOTE(review): assumes getMaximumUrlsToCompile() returns the same value on every call - confirm.
        $maximumUrls = $this->getMaximumUrlsToCompile();

        // Traverse value set:
        $newUrls = [];
        foreach ($urls as $url) {
            foreach ($valueSet as $val) {
                if (count($newUrls) >= $maximumUrls) {
                    // Limit reached: no further combination can be added on this level.
                    break 2;
                }

                $value = (string) $val;
                // An empty value leaves the URL untouched instead of appending "&name=".
                $newUrls[] = $value !== ''
                    ? $url . '&' . $encodedVarName . '=' . rawurlencode($value)
                    : $url;
            }
        }

        return $this->compileUrls($paramArray, $newUrls);
    }
779
780
    /************************************
781
     *
782
     * Crawler log
783
     *
784
     ************************************/
785
786
    /**
     * Adds a call back entry to the crawler queue (typically called from hooks,
     * see indexed search class "class.crawler.php").
     *
     * @param integer $setId Set ID
     * @param array $params Parameters to pass to call back function; anything that is not an array is replaced by an empty array
     * @param string $callBack Call back object reference, eg. 'EXT:indexed_search/class.crawler.php:&tx_indexedsearch_crawler'
     * @param integer $page_id Page ID to attach it to
     * @param integer $schedule Time at which to activate; a value casting to 0 falls back to the current time
     */
    public function addQueueEntry_callBack($setId, $params, $callBack, $page_id = 0, $schedule = 0): void
    {
        $callBackParameters = is_array($params) ? $params : [];
        $callBackParameters['_CALLBACKOBJ'] = $callBack;

        $scheduledAt = (int) $schedule;
        if (! $scheduledAt) {
            $scheduledAt = $this->getCurrentTime();
        }

        $queueRow = [
            'page_id' => (int) $page_id,
            'parameters' => json_encode($callBackParameters),
            'scheduled' => $scheduledAt,
            'exec_time' => 0,
            'set_id' => (int) $setId,
            'result_data' => '',
        ];

        $connection = GeneralUtility::makeInstance(ConnectionPool::class)->getConnectionForTable('tx_crawler_queue');
        $connection->insert('tx_crawler_queue', $queueRow);
    }
815
816
    /************************************
817
     *
818
     * URL setting
819
     *
820
     ************************************/
821
822
    /**
     * Setting a URL for crawling:
     *
     * Builds the queue record for the URL and either registers it internally
     * (when $this->registerQueueEntriesInternallyOnly is set) or inserts it
     * into tx_crawler_queue unless an equal entry is reported by the queue
     * repository.
     *
     * @param integer $id Page ID
     * @param string $url Complete URL
     * @param array $subCfg Sub configuration array (from TS config); 'userGroups', 'procInstrFilter' and 'procInstrParams.' are optional
     * @param integer $tstamp Scheduled-time
     * @param string $configurationHash (optional) configuration hash
     * @param bool $skipInnerDuplicationCheck (optional) skip inner duplication check
     * @return bool TRUE if a new row was inserted into the queue table, FALSE otherwise
     */
    public function addUrl(
        $id,
        $url,
        array $subCfg,
        $tstamp,
        $configurationHash = '',
        $skipInnerDuplicationCheck = false
    ) {
        // Creating parameters:
        $parameters = [
            'url' => $url,
        ];

        // fe user group simulation:
        $uGs = implode(',', array_unique(GeneralUtility::intExplode(',', $subCfg['userGroups'] ?? '', true)));
        if ($uGs) {
            $parameters['feUserGroupList'] = $uGs;
        }

        // Setting processing instructions
        $parameters['procInstructions'] = GeneralUtility::trimExplode(',', $subCfg['procInstrFilter'] ?? '');
        if (is_array($subCfg['procInstrParams.'] ?? null)) {
            $parameters['procInstrParams'] = $subCfg['procInstrParams.'];
        }

        // Compile value array:
        $parameters_serialized = json_encode($parameters);
        $fieldArray = [
            'page_id' => (int) $id,
            'parameters' => $parameters_serialized,
            'parameters_hash' => GeneralUtility::shortMD5($parameters_serialized),
            'configuration_hash' => $configurationHash,
            'scheduled' => $tstamp,
            'exec_time' => 0,
            'set_id' => (int) $this->setID,
            'result_data' => '',
            'configuration' => $subCfg['key'],
        ];

        if ($this->registerQueueEntriesInternallyOnly) {
            // the entries will only be registered and not stored to the database
            $this->queueEntries[] = $fieldArray;

            return false;
        }

        $duplicates = [];
        if (! $skipInnerDuplicationCheck) {
            // check if there is already an equal entry
            $duplicates = $this->queueRepository->getDuplicateQueueItemsIfExists(
                (bool) $this->extensionSettings['enableTimeslot'],
                $tstamp,
                $this->getCurrentTime(),
                $fieldArray['page_id'],
                $fieldArray['parameters_hash']
            );
        }

        if (! empty($duplicates)) {
            return false;
        }

        GeneralUtility::makeInstance(ConnectionPool::class)
            ->getConnectionForTable('tx_crawler_queue')
            ->insert('tx_crawler_queue', $fieldArray);

        return true;
    }
904
905
    /**
     * Returns the current system time as reported by time().
     *
     * @return int Unix timestamp
     */
    public function getCurrentTime()
    {
        $now = time();

        return $now;
    }
914
915
    /************************************
916
     *
917
     * URL reading
918
     *
919
     ************************************/
920
921
    /**
     * Read URL for single queue entry
     *
     * Loads the queue record, stamps exec_time on it, hands it to the queue
     * executor and finally stores the JSON encoded result in result_data.
     *
     * @param integer $queueId
     * @param boolean $force If set, will process even if exec_time has been set!
     * @return integer 0, or CLI_STATUS_POLLABLE_PROCESSED when a configured pollable instruction reported success; also 0 when no matching queue record was found
     */
    public function readUrl($queueId, $force = false)
    {
        $queryBuilder = GeneralUtility::makeInstance(ConnectionPool::class)->getQueryBuilderForTable($this->tableName);
        $ret = 0;
        $this->logger->debug('crawler-readurl start ' . microtime(true));

        // Get entry:
        $queryBuilder
            ->select('*')
            ->from('tx_crawler_queue')
            ->where(
                $queryBuilder->expr()->eq('qid', $queryBuilder->createNamedParameter($queueId, PDO::PARAM_INT))
            );
        if (! $force) {
            $queryBuilder
                ->andWhere('exec_time = 0')
                ->andWhere('process_scheduled > 0');
        }
        $queueRec = $queryBuilder->execute()->fetch();

        if (! is_array($queueRec)) {
            // Nothing to process: return the documented integer instead of NULL.
            return $ret;
        }

        // Set exec_time to lock record:
        $field_array = ['exec_time' => $this->getCurrentTime()];

        if (isset($this->processID)) {
            // if multiprocessing is used we need to store the id of the process which has handled this entry
            $field_array['process_id_completed'] = $this->processID;
        }

        GeneralUtility::makeInstance(ConnectionPool::class)->getConnectionForTable('tx_crawler_queue')
            ->update(
                'tx_crawler_queue',
                $field_array,
                ['qid' => (int) $queueId]
            );

        $result = $this->queueExecutor->executeQueueItem($queueRec, $this);
        if ($result['content'] === null) {
            $resultData = 'An errors happened';
        } else {
            /** @var JsonCompatibilityConverter $jsonCompatibilityConverter */
            $jsonCompatibilityConverter = GeneralUtility::makeInstance(JsonCompatibilityConverter::class);
            $resultData = $jsonCompatibilityConverter->convert($result['content']);
        }

        // atm there's no need to point to specific pollable extensions
        $pollables = $GLOBALS['TYPO3_CONF_VARS']['EXTCONF']['crawler']['pollSuccess'] ?? null;
        // $resultData is a plain string after an executor failure; only inspect it when it really is an array.
        if (is_array($pollables) && is_array($resultData)) {
            $procInstructions = $resultData['parameters']['procInstructions'] ?? null;

            if (is_array($procInstructions)) {
                foreach ($pollables as $pollable) {
                    // only check the success value if the instruction is running
                    // it is important to name the pollSuccess key same as the procInstructions key
                    if (in_array($pollable, $procInstructions, true) && ! empty($resultData['success'][$pollable])) {
                        $ret |= self::CLI_STATUS_POLLABLE_PROCESSED;
                    }
                }
            }
        }

        // Set result in log which also denotes the end of the processing of this entry.
        $field_array = ['result_data' => json_encode($result)];

        GeneralUtility::makeInstance(ConnectionPool::class)->getConnectionForTable('tx_crawler_queue')
            ->update(
                'tx_crawler_queue',
                $field_array,
                ['qid' => (int) $queueId]
            );

        $this->logger->debug('crawler-readurl stop ' . microtime(true));

        return $ret;
    }
1006
1007
    /**
     * Read URL for a not-yet-inserted log entry.
     *
     * The record is inserted with exec_time set, executed through the queue
     * executor, and afterwards updated with the JSON encoded result.
     *
     * @param array $field_array Queue field array,
     *
     * @return array|bool|mixed|string Whatever the queue executor returned for the record
     */
    public function readUrlFromArray($field_array)
    {
        $queueTable = $this->tableName;
        $queueConnection = GeneralUtility::makeInstance(ConnectionPool::class)->getConnectionForTable($queueTable);

        // Set exec_time to lock record:
        $field_array['exec_time'] = $this->getCurrentTime();
        $queueConnection->insert($queueTable, $field_array);
        $insertedQueueId = $queueConnection->lastInsertId($queueTable, 'qid');

        $executionResult = $this->queueExecutor->executeQueueItem($field_array, $this);

        // Set result in log which also denotes the end of the processing of this entry.
        $queueConnection->update(
            $queueTable,
            ['result_data' => json_encode($executionResult)],
            ['qid' => $insertedQueueId]
        );

        return $executionResult;
    }
1037
1038
    /*****************************
1039
     *
1040
     * Compiling URLs to crawl - tools
1041
     *
1042
     *****************************/
1043
1044
    /**
     * Walks the page tree below $id and collects the rows produced by
     * drawURLs_addRowsForPage() for every page (and for pages of mount points).
     *
     * The scheduling/submission arguments are stored on the controller first,
     * and the duplicate tracker and download URL list are reset.
     *
     * @param integer $id Root page id to start from.
     * @param integer $depth Depth of tree, 0=only id-page, 1= on sublevel, 99 = infinite
     * @param integer $scheduledTime Unix Time when the URL is timed to be visited when put in queue
     * @param integer $reqMinute Number of requests per minute (creates the interleave between requests)
     * @param boolean $submitCrawlUrls If set, submits the URLs to queue in database (real crawling)
     * @param boolean $downloadCrawlUrls If set (and submitcrawlUrls is false) will fill $downloadUrls with entries)
     * @param array $incomingProcInstructions Array of processing instructions
     * @param array $configurationSelection Array of configuration keys
     * @return string Concatenated output of drawURLs_addRowsForPage() for all visited pages
     */
    public function getPageTreeAndUrls(
        $id,
        $depth,
        $scheduledTime,
        $reqMinute,
        $submitCrawlUrls,
        $downloadCrawlUrls,
        array $incomingProcInstructions,
        array $configurationSelection
    ) {
        // Remember the request settings; drawURLs_addRowsForPage() reads them from the object.
        $this->scheduledTime = $scheduledTime;
        $this->reqMinute = $reqMinute;
        $this->submitCrawlUrls = $submitCrawlUrls;
        $this->downloadCrawlUrls = $downloadCrawlUrls;
        $this->incomingProcInstructions = $incomingProcInstructions;
        $this->incomingConfigurationSelection = $configurationSelection;

        $this->duplicateTrack = [];
        $this->downloadUrls = [];

        // Drawing tree:
        /* @var PageTreeView $pageTree */
        $pageTree = GeneralUtility::makeInstance(PageTreeView::class);
        $permsClause = $this->getBackendUser()->getPagePermsClause(Permission::PAGE_SHOW);
        $pageTree->init('AND ' . $permsClause);

        // Set root row:
        $rootPageInfo = BackendUtility::readPageAccess($id, $permsClause);
        if (is_array($rootPageInfo)) {
            $pageTree->tree[] = [
                'row' => $rootPageInfo,
                'HTML' => $this->iconFactory->getIconForRecord('pages', $rootPageInfo, Icon::SIZE_SMALL),
            ];
        }

        // Get branch beneath:
        if ($depth) {
            $pageTree->getTree($id, $depth, '');
        }

        // Traverse page tree:
        $rowsHtml = '';

        foreach ($pageTree->tree as $treeItem) {
            $this->MP = false;

            // recognize mount points
            // NOTE(review): strict comparison - only matches when doktype has the same type as the constant; confirm rows carry it that way.
            if ($treeItem['row']['doktype'] === PageRepository::DOKTYPE_MOUNTPOINT) {
                // NOTE(review): the result is read as $mountpage[0][...] below - confirm getPage() really returns a list of rows.
                $mountpage = $this->pageRepository->getPage($treeItem['row']['uid']);

                // fetch mounted pages
                $this->MP = $mountpage[0]['mount_pid'] . '-' . $treeItem['row']['uid'];

                $mountedTree = GeneralUtility::makeInstance(PageTreeView::class);
                $mountedTree->init('AND ' . $permsClause);
                $mountedTree->getTree($mountpage[0]['mount_pid'], $depth);

                foreach ($mountedTree->tree as $mountedItem) {
                    $mountedTitle = $mountedItem['HTML'] . BackendUtility::getRecordTitle('pages', $mountedItem['row'], true);
                    $rowsHtml .= $this->drawURLs_addRowsForPage($mountedItem['row'], $mountedTitle);
                }

                if (! $mountpage[0]['mount_pid_ol']) {
                    // if the mount_pid_ol is not set the MP must not be used for the mountpoint page
                    $this->MP = false;
                } else {
                    // replace page when mount_pid_ol is enabled
                    $treeItem['row']['uid'] = $mountpage[0]['mount_pid'];
                }
            }

            $pageTitle = $treeItem['HTML'] . BackendUtility::getRecordTitle('pages', $treeItem['row'], true);
            $rowsHtml .= $this->drawURLs_addRowsForPage($treeItem['row'], $pageTitle);
        }

        return $rowsHtml;
    }
1136
1137
    /**
     * Expands exclude string
     *
     * The string is a comma separated list of entries of the form "pid",
     * "pid+" or "pid+depth". A plain "pid" excludes only that page, "pid+"
     * (or a depth that is empty/"0") expands with depth 99, "pid+depth"
     * expands the page tree below pid down to the given depth.
     * Results are memoised per exclude string in a static cache.
     *
     * @param string $excludeString Exclude string
     * @return array List of integer page ids (duplicates removed)
     */
    public function expandExcludeString($excludeString)
    {
        // internal static caches;
        static $expandedExcludeStringCache = [];
        static $treeCache = [];

        $cacheKey = (string) $excludeString;
        if (! empty($expandedExcludeStringCache[$cacheKey])) {
            return $expandedExcludeStringCache[$cacheKey];
        }

        $pidList = [];

        if (! empty($excludeString)) {
            /** @var PageTreeView $tree */
            $tree = GeneralUtility::makeInstance(PageTreeView::class);
            $tree->init('AND ' . $this->getBackendUser()->getPagePermsClause(Permission::PAGE_SHOW));

            foreach (GeneralUtility::trimExplode(',', $excludeString) as $excludePart) {
                $parts = GeneralUtility::trimExplode('+', $excludePart);

                // Work with real integers: getTree() expects an integer uid, and the returned list holds integers.
                $pid = (int) $parts[0];
                // The depth part is missing for entries without "+"; do not read an undefined offset.
                $rawDepth = $parts[1] ?? '';

                if (empty($rawDepth)) {
                    // default is "page only" = "depth=0"; a bare "+" means "all levels"
                    $depth = strpos($excludePart, '+') !== false ? 99 : 0;
                } else {
                    $depth = (int) $rawDepth;
                }

                $pidList[] = $pid;

                if ($depth > 0) {
                    if (empty($treeCache[$pid][$depth])) {
                        $tree->reset();
                        $tree->getTree($pid, $depth);
                        $treeCache[$pid][$depth] = $tree->tree;
                    }

                    foreach ($treeCache[$pid][$depth] as $data) {
                        $pidList[] = (int) $data['row']['uid'];
                    }
                }
            }
        }

        $expandedExcludeStringCache[$cacheKey] = array_unique($pidList);

        return $expandedExcludeStringCache[$cacheKey];
    }
1188
1189
    /**
     * Create the rows for display of the page tree
     * For each page a number of rows are shown displaying GET variable configuration
     *
     * One table row is produced per configuration that applies to the page; a
     * single "No entries" row is produced when there is none. Pages listed in a
     * configuration's exclude string get a "Page is excluded" row instead of
     * the URL list.
     *
     * @param array $pageRow Page record; 'uid' is compared against the exclude list
     * @param string $pageTitle Markup for the title cell (inserted as-is, not escaped here)
     * @return string HTML table rows
     */
    public function drawURLs_addRowsForPage(array $pageRow, string $pageTitle): string
    {
        $skipMessage = '';

        // Get list of configurations
        $configurations = $this->getUrlsForPageRow($pageRow, $skipMessage);
        $configurations = ConfigurationService::removeDisallowedConfigurations($this->incomingConfigurationSelection, $configurations);

        // Traverse parameter combinations:
        $c = 0;
        $content = '';
        if (! empty($configurations)) {
            // expandExcludeString() returns integers, so the strict in_array() below needs an integer needle.
            $pageUid = (int) $pageRow['uid'];

            foreach ($configurations as $confKey => $confArray) {
                // Array keys that look numeric are integers; htmlspecialchars() needs a string under strict_types.
                $confKeyLabel = htmlspecialchars((string) $confKey);

                // Title column:
                if (! $c) {
                    $titleClm = '<td rowspan="' . count($configurations) . '">' . $pageTitle . '</td>';
                } else {
                    $titleClm = '';
                }

                if (! in_array($pageUid, $this->expandExcludeString($confArray['subCfg']['exclude'] ?? ''), true)) {

                    // URL list:
                    $urlList = $this->urlListFromUrlArray(
                        $confArray,
                        $pageRow,
                        $this->scheduledTime,
                        $this->reqMinute,
                        $this->submitCrawlUrls,
                        $this->downloadCrawlUrls,
                        $this->duplicateTrack,
                        $this->downloadUrls,
                        // if empty the urls won't be filtered by processing instructions
                        $this->incomingProcInstructions
                    );

                    // Expanded parameters:
                    $paramExpanded = '';
                    $calcAccu = [];
                    $calcRes = 1;
                    foreach ($confArray['paramExpanded'] as $gVar => $gVal) {
                        $paramExpanded .= '
                            <tr>
                                <td>' . htmlspecialchars('&' . $gVar . '=') . '<br/>' .
                            '(' . count($gVal) . ')' .
                            '</td>
                                <td nowrap="nowrap">' . nl2br(htmlspecialchars(implode(chr(10), $gVal))) . '</td>
                            </tr>
                        ';
                        $calcRes *= count($gVal);
                        $calcAccu[] = count($gVal);
                    }
                    $paramExpanded = '<table>' . $paramExpanded . '</table>';
                    $paramExpanded .= 'Comb: ' . implode('*', $calcAccu) . '=' . $calcRes;

                    // Options
                    // NOTE(review): the option values are emitted without HTML escaping - confirm they can only contain trusted configuration.
                    $optionValues = '';
                    if ($confArray['subCfg']['userGroups']) {
                        $optionValues .= 'User Groups: ' . $confArray['subCfg']['userGroups'] . '<br/>';
                    }
                    if ($confArray['subCfg']['procInstrFilter']) {
                        $optionValues .= 'ProcInstr: ' . $confArray['subCfg']['procInstrFilter'] . '<br/>';
                    }

                    // Compile row:
                    $content .= '
                        <tr>
                            ' . $titleClm . '
                            <td>' . $confKeyLabel . '</td>
                            <td>' . nl2br(htmlspecialchars(rawurldecode(trim(str_replace('&', chr(10) . '&', GeneralUtility::implodeArrayForUrl('', $confArray['paramParsed'])))))) . '</td>
                            <td>' . $paramExpanded . '</td>
                            <td nowrap="nowrap">' . $urlList . '</td>
                            <td nowrap="nowrap">' . $optionValues . '</td>
                            <td nowrap="nowrap">' . DebugUtility::viewArray($confArray['subCfg']['procInstrParams.']) . '</td>
                        </tr>';
                } else {
                    $content .= '<tr>
                            ' . $titleClm . '
                            <td>' . $confKeyLabel . '</td>
                            <td colspan="5"><em>No entries</em> (Page is excluded in this configuration)</td>
                        </tr>';
                }

                $c++;
            }
        } else {
            $message = ! empty($skipMessage) ? ' (' . $skipMessage . ')' : '';

            // Compile row:
            $content .= '
                <tr>
                    <td>' . $pageTitle . '</td>
                    <td colspan="6"><em>No entries</em>' . $message . '</td>
                </tr>';
        }

        return $content;
    }
1292
1293
    /*****************************
1294
     *
1295
     * CLI functions
1296
     *
1297
     *****************************/
1298
1299
    /**
     * Running the functionality of the CLI (crawling URLs from queue)
     *
     * @param int $countInARun Passed to the queue repository when fetching records to crawl
     * @param int $sleepTime Microseconds to pause (usleep) after each processed entry
     * @param int $sleepAfterFinish Seconds to pause (sleep) once the batch loop has ended
     * @return int Bit mask combining the readUrl() results with CLI_STATUS_PROCESSED / CLI_STATUS_ABORTED
     */
    public function CLI_run(int $countInARun, int $sleepTime, int $sleepAfterFinish): int
    {
        $status = 0;
        $processedEntries = 0;

        // Clean up the queue
        $this->queueRepository->cleanupQueue();

        // Select entries:
        $queueRows = $this->queueRepository->fetchRecordsToBeCrawled($countInARun);
        if (empty($queueRows)) {
            return $status;
        }

        $queueIds = [];
        foreach ($queueRows as $queueRow) {
            $queueIds[] = $queueRow['qid'];
        }

        $processId = $this->CLI_buildProcessId();

        // save the number of assigned queue entries to determine how many have been processed later
        $assignedEntries = $this->queueRepository->updateProcessIdAndSchedulerForQueueIds($queueIds, $processId);
        $this->processRepository->updateProcessAssignItemsCount($assignedEntries, $processId);

        if ($assignedEntries !== count($queueIds)) {
            return $status | self::CLI_STATUS_ABORTED;
        }

        foreach ($queueRows as $queueRow) {
            $status |= $this->readUrl($queueRow['qid']);
            $processedEntries++;

            // Just to relax the system
            usleep((int) $sleepTime);

            // if during the start and the current read url the cli has been disable we need to return from the function
            // mark the process NOT as ended.
            if ($this->crawler->isDisabled()) {
                return $status | self::CLI_STATUS_ABORTED;
            }

            $stillActive = $this->processRepository->isProcessActive($this->CLI_buildProcessId());
            if (! $stillActive) {
                // NOTE(review): CLI_debug() is not visible in this part of the class - confirm it is defined or handled by __call().
                $this->CLI_debug('conflict / timeout (' . $this->CLI_buildProcessId() . ')');
                $status |= self::CLI_STATUS_ABORTED;
                // possible timeout
                break;
            }
        }

        sleep((int) $sleepAfterFinish);

        if ($processedEntries > 0) {
            $status |= self::CLI_STATUS_PROCESSED;
        }

        return $status;
    }
1360
1361
    /**
     * Try to acquire a new process with the given id
     * also performs some auto-cleanup for orphan processes
     *
     * Active processes whose ttl lies before the current time are collected as
     * orphans and handed to CLI_releaseProcesses(); the remaining ones count
     * against the configured 'processLimit'.
     *
     * @param string $id identification string for the process
     * @return boolean TRUE if a process row was inserted, FALSE if the limit is reached or no system process id is available
     * @todo preemption might not be the most elegant way to clean up
     */
    public function CLI_checkAndAcquireNewProcess($id)
    {
        $systemProcessId = getmypid();
        if (! $systemProcessId) {
            return false;
        }

        $runningProcesses = 0;
        $orphanProcesses = [];

        $activeProcesses = $this->processRepository->findAllActive();
        $currentTime = $this->getCurrentTime();

        /** @var Process $process */
        foreach ($activeProcesses as $process) {
            if ($process->getTtl() >= $currentTime) {
                $runningProcesses++;
                continue;
            }

            $orphanProcesses[] = $process->getProcessId();
        }

        // if there are less than allowed active processes then add a new one
        $acquired = $runningProcesses < (int) $this->extensionSettings['processLimit'];
        if ($acquired) {
            $processRow = [
                'process_id' => $id,
                'active' => 1,
                'ttl' => $currentTime + (int) $this->extensionSettings['processMaxRunTime'],
                'system_process_id' => $systemProcessId,
            ];

            GeneralUtility::makeInstance(ConnectionPool::class)
                ->getConnectionForTable('tx_crawler_process')
                ->insert('tx_crawler_process', $processRow);
        }

        $this->processRepository->deleteProcessesMarkedAsDeleted();
        $this->CLI_releaseProcesses($orphanProcesses);

        return $acquired;
    }
1412
1413
    /**
     * Release a process and the required resources
     *
     * Before the requested ids are released, two housekeeping statements are run:
     * queue entries assigned to a process with active = 0 are detached from it, and
     * processes with active = 0 that still own queue entries with exec_time = 0 get
     * their system_process_id reset.
     *
     * @param mixed $releaseIds string with a single process-id or array with multiple process-ids
     * @return bool false if there was nothing to release, true otherwise
     */
    public function CLI_releaseProcesses($releaseIds)
    {
        if (! is_array($releaseIds)) {
            $releaseIds = [$releaseIds];
        }

        if (empty($releaseIds)) {
            //nothing to release
            return false;
        }

        // some kind of 2nd chance algo - this way you need at least 2 processes to have a real cleanup
        // this ensures that a single process can't mess up the entire process table

        // A QueryBuilder keeps internal state (query parts and bound parameters), so every
        // statement gets its own instance instead of resetting parts of a shared one.

        // Detach queue entries from processes that are no longer active
        $this->getQueryBuilder($this->tableName)
            ->update($this->tableName, 'q')
            ->where(
                'q.process_id IN(SELECT p.process_id FROM tx_crawler_process as p WHERE p.active = 0)'
            )
            ->set('q.process_scheduled', 0)
            ->set('q.process_id', '')
            ->execute();

        // Reset the system process id of inactive processes that still own unprocessed queue entries
        $processQueryBuilder = $this->getQueryBuilder('tx_crawler_process');
        $processQueryBuilder
            ->update('tx_crawler_process')
            ->where(
                $processQueryBuilder->expr()->eq('active', 0),
                'process_id IN(SELECT q.process_id FROM tx_crawler_queue as q WHERE q.exec_time = 0)'
            )
            ->set('system_process_id', 0)
            ->execute();

        $this->processRepository->markRequestedProcessesAsNotActive($releaseIds);
        $this->queueRepository->unsetProcessScheduledAndProcessIdForQueueEntries($releaseIds);

        return true;
    }
1463
1464
    /**
     * Return the id of the current process, generating it on first use.
     *
     * The id is derived from the current microtime and stored in $this->processID,
     * so subsequent calls on the same instance return the same value.
     *
     * @return string the ID
     */
    public function CLI_buildProcessId()
    {
        if ($this->processID) {
            return $this->processID;
        }

        $this->processID = GeneralUtility::shortMD5(microtime(true));

        return $this->processID;
    }
1476
1477
    /**
     * Build an md5 hash over the serialized configuration array.
     *
     * The keys "paramExpanded" and "URLs" are removed before hashing and
     * therefore do not influence the result.
     *
     * @return string
     */
    protected function getConfigurationHash(array $configuration)
    {
        unset($configuration['paramExpanded'], $configuration['URLs']);

        $serializedConfiguration = serialize($configuration);

        return md5($serializedConfiguration);
    }
1488
1489 1
    /**
     * Order the two boundary values stored at index 1 and 2 ascending.
     *
     * If the value at index 1 is larger than the value at index 2 both are
     * exchanged; all other entries of the array are left untouched.
     *
     * @param array $reg array holding the values to compare at index 1 and 2
     * @return array the array with index 1 <= index 2
     */
    protected function swapIfFirstIsLargerThanSecond(array $reg): array
    {
        if ($reg[1] > $reg[2]) {
            // Exchange both boundaries in one step
            [$reg[1], $reg[2]] = [$reg[2], $reg[1]];
        }

        return $reg;
    }
1500
1501 5
    /**
     * Return the value of the maximumUrlsToCompile property.
     */
    private function getMaximumUrlsToCompile(): int
    {
        return $this->maximumUrlsToCompile;
    }
1505
1506
    /**
     * Return the backend user, taking it from $GLOBALS['BE_USER'] on first access.
     *
     * Backend authentication is initialized on every call before the user is resolved.
     *
     * @return BackendUserAuthentication
     */
    private function getBackendUser()
    {
        // Make sure the _cli_ user is loaded
        Bootstrap::initializeBackendAuthentication();

        $this->backendUser = $this->backendUser ?? $GLOBALS['BE_USER'];

        return $this->backendUser;
    }
1518
1519
    /**
     * Create a query builder for the given table via the connection pool.
     *
     * @param string $table name of the table the query builder is requested for
     * @return QueryBuilder
     */
    private function getQueryBuilder(string $table)
    {
        $connectionPool = GeneralUtility::makeInstance(ConnectionPool::class);

        return $connectionPool->getQueryBuilderForTable($table);
    }
1528
}
1529