Passed
Push — wip/remove-deprecations-for-v1... ( 176f3c...107830 )
by Tomas Norre
05:56
created

CrawlerController::urlListFromUrlArray()   B

Complexity

Conditions 8
Paths 8

Size

Total Lines 68
Code Lines 39

Duplication

Lines 0
Ratio 0 %

Code Coverage

Tests 34
CRAP Score 8.1348

Importance

Changes 1
Bugs 0 Features 0
Metric Value
cc 8
eloc 39
c 1
b 0
f 0
nc 8
nop 9
dl 0
loc 68
ccs 34
cts 39
cp 0.8718
crap 8.1348
rs 8.0515

How to fix   Long Method    Many Parameters   

Long Method

Small methods make your code easier to understand, in particular if combined with a good name. Besides, if your method is small, finding a good name is usually much easier.

For example, if you find yourself adding comments to a method's body, this is usually a good sign to extract the commented part to a new method, and use the comment as a starting point when coming up with a good name for this new method.

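Commonly applied refactorings include Extract Method and, when many parameters or temporary variables get in the way of extracting, Replace Temp with Query, Introduce Parameter Object and Preserve Whole Object.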

Many Parameters

Methods with many parameters are not only hard to understand, but their parameters also often become inconsistent when you need more, or different data.

There are several approaches to avoid long parameter lists: Replace Parameter with Method Call, Preserve Whole Object and Introduce Parameter Object.

1
<?php
2
3
declare(strict_types=1);
4
5
namespace AOE\Crawler\Controller;
6
7
/***************************************************************
8
 *  Copyright notice
9
 *
10
 *  (c) 2020 AOE GmbH <[email protected]>
11
 *
12
 *  All rights reserved
13
 *
14
 *  This script is part of the TYPO3 project. The TYPO3 project is
15
 *  free software; you can redistribute it and/or modify
16
 *  it under the terms of the GNU General Public License as published by
17
 *  the Free Software Foundation; either version 3 of the License, or
18
 *  (at your option) any later version.
19
 *
20
 *  The GNU General Public License can be found at
21
 *  http://www.gnu.org/copyleft/gpl.html.
22
 *
23
 *  This script is distributed in the hope that it will be useful,
24
 *  but WITHOUT ANY WARRANTY; without even the implied warranty of
25
 *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
26
 *  GNU General Public License for more details.
27
 *
28
 *  This copyright notice MUST APPEAR in all copies of the script!
29
 ***************************************************************/
30
31
use AOE\Crawler\Configuration\ExtensionConfigurationProvider;
32
use AOE\Crawler\Converter\JsonCompatibilityConverter;
33
use AOE\Crawler\Crawler;
34
use AOE\Crawler\CrawlStrategy\CrawlStrategyFactory;
35
use AOE\Crawler\Domain\Model\Process;
36
use AOE\Crawler\Domain\Repository\ConfigurationRepository;
37
use AOE\Crawler\Domain\Repository\ProcessRepository;
38
use AOE\Crawler\Domain\Repository\QueueRepository;
39
use AOE\Crawler\QueueExecutor;
40
use AOE\Crawler\Service\ConfigurationService;
41
use AOE\Crawler\Service\UrlService;
42
use AOE\Crawler\Service\UserService;
43
use PDO;
44
use Psr\Log\LoggerAwareInterface;
45
use Psr\Log\LoggerAwareTrait;
46
use TYPO3\CMS\Backend\Tree\View\PageTreeView;
47
use TYPO3\CMS\Backend\Utility\BackendUtility;
48
use TYPO3\CMS\Core\Authentication\BackendUserAuthentication;
49
use TYPO3\CMS\Core\Compatibility\PublicMethodDeprecationTrait;
50
use TYPO3\CMS\Core\Compatibility\PublicPropertyDeprecationTrait;
51
use TYPO3\CMS\Core\Core\Bootstrap;
52
use TYPO3\CMS\Core\Core\Environment;
53
use TYPO3\CMS\Core\Database\Connection;
54
use TYPO3\CMS\Core\Database\ConnectionPool;
55
use TYPO3\CMS\Core\Database\Query\QueryBuilder;
56
use TYPO3\CMS\Core\Database\Query\Restriction\DeletedRestriction;
57
use TYPO3\CMS\Core\Database\QueryGenerator;
58
use TYPO3\CMS\Core\Domain\Repository\PageRepository;
59
use TYPO3\CMS\Core\Imaging\Icon;
60
use TYPO3\CMS\Core\Imaging\IconFactory;
61
use TYPO3\CMS\Core\Type\Bitmask\Permission;
62
use TYPO3\CMS\Core\TypoScript\Parser\TypoScriptParser;
63
use TYPO3\CMS\Core\Utility\DebugUtility;
64
use TYPO3\CMS\Core\Utility\GeneralUtility;
65
use TYPO3\CMS\Core\Utility\MathUtility;
66
use TYPO3\CMS\Extbase\Object\ObjectManager;
67
68
/**
69
 * Class CrawlerController
70
 *
71
 * @package AOE\Crawler\Controller
72
 */
73
class CrawlerController implements LoggerAwareInterface
74
{
75
    use LoggerAwareTrait;
76
    use PublicMethodDeprecationTrait;
0 ignored issues
show
Bug introduced by
The trait TYPO3\CMS\Core\Compatibi...cMethodDeprecationTrait requires the property $deprecatedPublicMethods which is not provided by AOE\Crawler\Controller\CrawlerController.
Loading history...
77
    use PublicPropertyDeprecationTrait;
0 ignored issues
show
Bug introduced by
The trait TYPO3\CMS\Core\Compatibi...ropertyDeprecationTrait requires the property $deprecatedPublicProperties which is not provided by AOE\Crawler\Controller\CrawlerController.
Loading history...
78
79
    public const CLI_STATUS_NOTHING_PROCCESSED = 0;
80
81
    //queue not empty
82
    public const CLI_STATUS_REMAIN = 1;
83
84
    //(some) queue items were processed
85
    public const CLI_STATUS_PROCESSED = 2;
86
87
    //instance didn't finish
88
    public const CLI_STATUS_ABORTED = 4;
89
90
    public const CLI_STATUS_POLLABLE_PROCESSED = 8;
91
92
    /**
93
     * @var integer
94
     */
95
    public $setID = 0;
96
97
    /**
98
     * @var string
99
     */
100
    public $processID = '';
101
102
    /**
103
     * @var array
104
     */
105
    public $duplicateTrack = [];
106
107
    /**
108
     * @var array
109
     */
110
    public $downloadUrls = [];
111
112
    /**
113
     * @var array
114
     */
115
    public $incomingProcInstructions = [];
116
117
    /**
118
     * @var array
119
     */
120
    public $incomingConfigurationSelection = [];
121
122
    /**
123
     * @var bool
124
     */
125
    public $registerQueueEntriesInternallyOnly = false;
126
127
    /**
128
     * @var array
129
     */
130
    public $queueEntries = [];
131
132
    /**
133
     * @var array
134
     */
135
    public $urlList = [];
136
137
    /**
138
     * @var array
139
     */
140
    public $extensionSettings = [];
141
142
    /**
143
     * Mount Point
144
     *
145
     * @var bool
146
     * Todo: Check what this is used for and adjust the type hint or code, as bool doesn't match the current code.
147
     */
148
    public $MP = false;
149
150
    /**
151
     * @var QueueRepository
152
     */
153
    protected $queueRepository;
154
155
    /**
156
     * @var ProcessRepository
157
     */
158
    protected $processRepository;
159
160
    /**
161
     * @var ConfigurationRepository
162
     */
163
    protected $configurationRepository;
164
165
    /**
166
     * @var string
167
     */
168
    protected $tableName = 'tx_crawler_queue';
169
170
    /**
171
     * @var QueueExecutor
172
     */
173
    protected $queueExecutor;
174
175
    /**
176
     * @var int
177
     */
178
    protected $maximumUrlsToCompile = 10000;
179
180
    /**
181
     * @var IconFactory
182
     */
183
    protected $iconFactory;
184
185
    /**
186
     * @var BackendUserAuthentication|null
187
     */
188
    private $backendUser;
189
190
    /**
191
     * @var integer
192
     */
193
    private $scheduledTime = 0;
194
195
    /**
196
     * @var integer
197
     */
198
    private $reqMinute = 0;
199
200
    /**
201
     * @var bool
202
     */
203
    private $submitCrawlUrls = false;
204
205
    /**
206
     * @var bool
207
     */
208
    private $downloadCrawlUrls = false;
209
210
    /**
211
     * @var PageRepository
212
     */
213
    private $pageRepository;
214
215
    /**
216
     * @var Crawler
217
     */
218
    private $crawler;
219
220
    /************************************
221
     *
222
     * Getting URLs based on Page TSconfig
223
     *
224
     ************************************/
225
226 19
    /**
     * Creates the repositories and services the controller works with and
     * loads the extension settings.
     *
     * Three settings are normalised after loading:
     * - "countInARun" is set to 100 when it is absent or does not convert to
     *   a positive integer,
     * - "processLimit" is passed through MathUtility::forceIntegerInRange()
     *   with the bounds 1..99 and the default 1,
     * - "maxCompileUrls" is passed through the same helper with the bounds
     *   1..1000000000 and the default 10000, and stored via
     *   setMaximumUrlsToCompile().
     */
    public function __construct()
    {
        $objectManager = GeneralUtility::makeInstance(ObjectManager::class);
        $crawlStrategyFactory = GeneralUtility::makeInstance(CrawlStrategyFactory::class);
        $this->queueRepository = $objectManager->get(QueueRepository::class);
        $this->processRepository = $objectManager->get(ProcessRepository::class);
        $this->configurationRepository = $objectManager->get(ConfigurationRepository::class);
        $this->pageRepository = GeneralUtility::makeInstance(PageRepository::class);
        $this->queueExecutor = GeneralUtility::makeInstance(QueueExecutor::class, $crawlStrategyFactory);
        $this->iconFactory = GeneralUtility::makeInstance(IconFactory::class);
        $this->crawler = GeneralUtility::makeInstance(Crawler::class);

        /** @var ExtensionConfigurationProvider $configurationProvider */
        $configurationProvider = GeneralUtility::makeInstance(ExtensionConfigurationProvider::class);
        $settings = $configurationProvider->getExtensionConfiguration();
        $this->extensionSettings = is_array($settings) ? $settings : [];

        // The keys below are optional. Reading them with "??" keeps an
        // incomplete configuration from raising "undefined index" notices;
        // the fallback value 0 leads to the same defaults as a missing key.
        if (MathUtility::convertToPositiveInteger($this->extensionSettings['countInARun'] ?? 0) === 0) {
            $this->extensionSettings['countInARun'] = 100;
        }

        $this->extensionSettings['processLimit'] = MathUtility::forceIntegerInRange(
            $this->extensionSettings['processLimit'] ?? 0,
            1,
            99,
            1
        );
        $this->setMaximumUrlsToCompile(
            MathUtility::forceIntegerInRange(
                $this->extensionSettings['maxCompileUrls'] ?? 0,
                1,
                1000000000,
                10000
            )
        );
    }
251
252 23
    /**
     * Stores the given value in $this->maximumUrlsToCompile.
     *
     * The constructor calls this with the "maxCompileUrls" extension setting.
     */
    public function setMaximumUrlsToCompile(int $maximumUrlsToCompile): void
    {
        $this->maximumUrlsToCompile = $maximumUrlsToCompile;
    }
256
257
    /**
     * Replaces the extension settings held by this controller.
     *
     * The array is stored as given; the defaults applied by the constructor
     * are not re-applied here.
     *
     * @param array $extensionSettings Unserialized counterpart of
     *                                 $TYPO3_CONF_VARS['EXT']['extConf']['crawler']
     */
    public function setExtensionSettings(array $extensionSettings): void
    {
        $this->extensionSettings = $extensionSettings;
    }
264
265
    /**
     * Checks whether the given page should be excluded from crawling.
     *
     * The checks run in this order and the first match wins:
     * 1. the page is hidden and the "crawlHiddenPages" setting is off,
     * 2. the page's doktype is in the fixed list "3,4,199,254,255",
     * 3. the doktype is in one of the lists registered under
     *    $TYPO3_CONF_VARS['EXTCONF']['crawler']['excludeDoktype'],
     * 4. a hook registered under ...['crawler']['pageVeto'] returns anything
     *    other than boolean false.
     *
     * @param array $pageRow Page record; the "hidden" and "doktype" columns are read
     * @return false|string false if the page may be crawled, otherwise a message
     *                      describing why it is skipped
     */
    public function checkIfPageShouldBeSkipped(array $pageRow)
    {
        // Missing keys are treated like empty values rather than raising notices.
        $crawlHiddenPages = $this->extensionSettings['crawlHiddenPages'] ?? false;
        $isHidden = $pageRow['hidden'] ?? false;
        $doktype = (string) ($pageRow['doktype'] ?? '');

        if (! $crawlHiddenPages && $isHidden) {
            return 'Because page is hidden';
        }

        if (GeneralUtility::inList('3,4,199,254,255', $doktype)) {
            return 'Because doktype is not allowed';
        }

        foreach ($GLOBALS['TYPO3_CONF_VARS']['EXTCONF']['crawler']['excludeDoktype'] ?? [] as $key => $doktypeList) {
            if (GeneralUtility::inList($doktypeList, $doktype)) {
                return 'Doktype was excluded by "' . $key . '"';
            }
        }

        // Veto hooks: a hook returns false to let the page pass. Any other
        // return value vetoes the page; a string is used as the skip message.
        foreach ($GLOBALS['TYPO3_CONF_VARS']['EXTCONF']['crawler']['pageVeto'] ?? [] as $key => $func) {
            $params = [
                'pageRow' => $pageRow,
            ];
            $veto = GeneralUtility::callUserFunction($func, $params, $this);
            if ($veto === false) {
                continue;
            }
            if (is_string($veto)) {
                return $veto;
            }
            // Hooks registered without an explicit name have integer keys;
            // with strict_types enabled htmlspecialchars() rejects an int,
            // so the key is cast to string first.
            return 'Veto from hook "' . htmlspecialchars((string) $key) . '"';
        }

        return false;
    }
304
305
    /**
     * Wrapper around getUrlsForPageId() that first decides whether the page
     * is crawled at all. The result is an array of configurations, not URLs.
     *
     * @param array $pageRow Page record with at least dok-type and uid columns.
     * @param string $skipMessage Passed by reference: receives the reason the page
     *                            was skipped, or an empty string if it was not
     * @return array Configurations for the page, or an empty array when skipped
     * @see getUrlsForPageId()
     */
    public function getUrlsForPageRow(array $pageRow, &$skipMessage = '')
    {
        // Only a real integer uid is accepted; numeric strings are rejected too.
        if (! is_int($pageRow['uid'])) {
            $skipMessage = 'PageUid ' . $pageRow['uid'] . ' was not an integer';
            return [];
        }

        $skipReason = $this->checkIfPageShouldBeSkipped($pageRow);
        if ($skipReason !== false) {
            $skipMessage = $skipReason;
            return [];
        }

        $configurations = $this->getUrlsForPageId($pageRow['uid']);
        $skipMessage = '';

        return $configurations;
    }
332
333
    /**
     * Creates a list of URLs from the input array (and submits them to the queue if asked to).
     * See Web > Info module script + "indexed_search"'s crawler hook-client using this!
     *
     * Every URL is recorded in $duplicateTrack under the key
     * "<url>|<userGroups>|<procInstrFilter>". A URL whose key is already
     * present is only listed (muted) and not submitted again.
     *
     * @param array $vv Information about URLs from pageRow to crawl; the keys "URLs" and "subCfg" are read.
     * @param array $pageRow Page row
     * @param int $scheduledTime Unix time to schedule indexing to, typically time()
     * @param int $reqMinute Number of requests per minute (creates the interleave between requests);
     *                       a value of 0 or less disables the interleave
     * @param bool $submitCrawlUrls If set, submits the URLs to the queue
     * @param bool $downloadCrawlUrls If set (and $submitCrawlUrls is false), fills $downloadUrls with entries
     * @param array $duplicateTrack Passed by reference; holds one entry per URL key so duplicates are not crawled
     * @param array $downloadUrls Passed by reference; filled with URLs for download if the flag is set
     * @param array $incomingProcInstructions Array of processing instructions
     * @return string List of URLs joined by "<br>" (meant for display in the backend module)
     */
    public function urlListFromUrlArray(
        array $vv,
        array $pageRow,
        $scheduledTime,
        $reqMinute,
        $submitCrawlUrls,
        $downloadCrawlUrls,
        array &$duplicateTrack,
        array &$downloadUrls,
        array $incomingProcInstructions
    ) {
        if (! is_array($vv['URLs'] ?? null)) {
            return 'ERROR - no URL generated';
        }

        // Optional sub-configuration keys are read once, with defaults, so a
        // configuration lacking them does not raise "undefined index" notices.
        $subCfg = $vv['subCfg'] ?? [];
        $procInstrFilter = $subCfg['procInstrFilter'] ?? '';
        $userGroups = $subCfg['userGroups'] ?? '';

        $urlLog = [];
        $pageId = (int) $pageRow['uid'];
        $configurationHash = $this->getConfigurationHash($vv);
        $skipInnerCheck = $this->queueRepository->noUnprocessedQueueEntriesForPageWithConfigurationHashExist($pageId, $configurationHash);

        $urlService = new UrlService();

        // The filter depends only on the configuration, not on the single URL,
        // so it is evaluated once: when it rejects, no URL is listed at all.
        if (! $this->drawURLs_PIfilter($procInstrFilter, $incomingProcInstructions)) {
            return '';
        }

        // Seconds between two scheduled requests. Guarded so that a rate of 0
        // does not cause a division by zero.
        $secondsPerRequest = $reqMinute > 0 ? 60 / $reqMinute : 0;

        foreach ($vv['URLs'] as $urlQuery) {
            $url = (string) $urlService->getUrlFromPageAndQueryParameters(
                $pageId,
                $urlQuery,
                $subCfg['baseUrl'] ?? null,
                $subCfg['force_ssl'] ?? 0
            );

            // Create key by which to determine uniqueness:
            $uKey = $url . '|' . $userGroups . '|' . $procInstrFilter;

            if (isset($duplicateTrack[$uKey])) {
                // The URL key is already registered: just display it, do not resubmit it.
                $urlLog[] = '<em><span class="text-muted">' . htmlspecialchars($url) . '</span></em>';
            } else {
                // Scheduled time, spread by the number of URLs tracked so far
                // and rounded down to a full minute:
                $schTime = $scheduledTime + round(count($duplicateTrack) * $secondsPerRequest);
                $schTime = intval($schTime / 60) * 60;
                $formattedDate = BackendUtility::datetime($schTime);
                $this->urlList[] = '[' . $formattedDate . '] ' . $url;
                $urlList = '[' . $formattedDate . '] ' . htmlspecialchars($url);

                // Submit for crawling!
                if ($submitCrawlUrls) {
                    $added = $this->addUrl(
                        $pageId,
                        $url,
                        $subCfg,
                        $scheduledTime,
                        $configurationHash,
                        $skipInnerCheck
                    );
                    if ($added === false) {
                        $urlList .= ' (URL already existed)';
                    }
                } elseif ($downloadCrawlUrls) {
                    $downloadUrls[$url] = $url;
                }
                $urlLog[] = $urlList;
            }
            $duplicateTrack[$uKey] = true;
        }

        return implode('<br>', $urlLog);
    }
417
418
    /**
     * Tells whether a configuration passes the processing instruction filter.
     *
     * With no incoming instructions everything passes. Otherwise at least one
     * incoming instruction has to be an item of the comma-separated $piString.
     *
     * @param string $piString Comma-separated list of processing instructions to test against
     * @param array $incomingProcInstructions Processing instructions
     * @return boolean
     */
    public function drawURLs_PIfilter($piString, array $incomingProcInstructions)
    {
        if (count($incomingProcInstructions) === 0) {
            return true;
        }

        foreach ($incomingProcInstructions as $instruction) {
            if (GeneralUtility::inList($piString, $instruction)) {
                return true;
            }
        }

        return false;
    }
438
439 5
    /**
     * Returns the page TSconfig for a page, after passing it through the
     * hooks registered under
     * $TYPO3_CONF_VARS['EXTCONF']['crawler']['getPageTSconfigForId'].
     *
     * When $this->MP is set, the TSconfig is read for the page ID found after
     * the first "-" of that value instead of for $id.
     *
     * @param int $id Page ID
     * @return array Page TSconfig, possibly altered by hooks (they receive it by reference)
     */
    public function getPageTSconfigForId($id): array
    {
        if (! $this->MP) {
            $pageTSconfig = BackendUtility::getPagesTSconfig($id);
        } else {
            // $this->MP is declared as bool but used as a "<a>-<b>" string.
            // Casting keeps explode() from throwing a TypeError under
            // strict_types, and a value without "-" falls back to page 0
            // instead of raising an undefined offset notice.
            // TODO: settle the real type of $this->MP.
            $mountPointParts = explode('-', (string) $this->MP);
            $mountPointId = (int) ($mountPointParts[1] ?? 0);
            $pageTSconfig = BackendUtility::getPagesTSconfig($mountPointId);
        }

        // Call hooks to alter the configuration. The lookup is guarded so
        // that installations without such a hook raise no notice.
        $hooks = $GLOBALS['TYPO3_CONF_VARS']['EXTCONF']['crawler']['getPageTSconfigForId'] ?? null;
        if (is_array($hooks)) {
            $params = [
                'pageId' => $id,
                'pageTSConfig' => &$pageTSconfig,
            ];
            foreach ($hooks as $userFunc) {
                GeneralUtility::callUserFunction($userFunc, $params, $this);
            }
        }

        return $pageTSconfig;
    }
461
462
    /**
     * Collects the crawler configurations that apply to a page.
     * Returns an array of configurations and adds no URLs to the queue.
     *
     * Configurations come from two sources, in this order:
     * 1. page TSconfig (tx_crawler.crawlerCfg.paramSets.), origin "pagets",
     * 2. tx_crawler_configuration records found up the rootline, origin
     *    "tx_crawler_configuration_<uid>". A record never overwrites a
     *    paramSet of the same name from page TSconfig.
     *
     * Finally the hooks registered under
     * $TYPO3_CONF_VARS['EXTCONF']['crawler']['processUrls'] receive the
     * result by reference and may alter it.
     *
     * @return array Configurations keyed by name; each entry holds "subCfg",
     *               "paramParsed", "paramExpanded", "URLs" and "origin"
     */
    public function getUrlsForPageId(int $pageId): array
    {
        $pageTSconfig = $this->getPageTSconfigForId($pageId);
        $pageIdAsString = strval($pageId);

        $res = [];

        // Source 1: crawler configuration from page TSconfig
        $crawlerCfg = $pageTSconfig['tx_crawler.']['crawlerCfg.']['paramSets.'] ?? [];
        foreach ($crawlerCfg as $key => $values) {
            if (! is_array($values)) {
                continue;
            }
            $key = str_replace('.', '', $key);

            // Sub configuration for a single configuration string:
            $subCfg = (array) ($crawlerCfg[$key . '.'] ?? []);
            $subCfg['key'] = $key;

            $procInstrFilter = (string) ($subCfg['procInstrFilter'] ?? '');
            if ($procInstrFilter !== '') {
                $subCfg['procInstrFilter'] = implode(',', GeneralUtility::trimExplode(',', $procInstrFilter));
            }

            // "pidsOnly" is optional; an absent value means "not page-specific".
            // TODO: Check if $pidOnlyList can be kept as Array instead of imploded
            $pidsOnly = (string) ($subCfg['pidsOnly'] ?? '');
            $pidOnlyList = implode(',', GeneralUtility::trimExplode(',', $pidsOnly, true));

            // Skip configurations restricted to other pages.
            if ($pidsOnly !== '' && ! GeneralUtility::inList($pidOnlyList, $pageIdAsString)) {
                continue;
            }

            // A paramSet may consist of the sub configuration only, in which
            // case there is no parameter string to parse.
            $paramParsed = GeneralUtility::explodeUrl2Array((string) ($crawlerCfg[$key] ?? ''));
            $paramExpanded = $this->expandParameters($paramParsed, $pageId);

            // recognize MP value
            $baseUrl = '?id=' . $pageId;
            if ($this->MP) {
                $baseUrl .= '&MP=' . $this->MP;
            }

            $res[$key] = [
                'subCfg' => $subCfg,
                'paramParsed' => $paramParsed,
                'paramExpanded' => $paramExpanded,
                'origin' => 'pagets',
                'URLs' => $this->compileUrls($paramExpanded, [$baseUrl]),
            ];
        }

        // Source 2: tx_crawler_configuration records up the rootline
        $crawlerConfigurations = $this->configurationRepository->getCrawlerConfigurationRecordsFromRootLine($pageId);
        foreach ($crawlerConfigurations as $configurationRecord) {
            // Check access to the configuration record: unrestricted records,
            // admins, and users in one of the record's backend groups pass.
            $hasAccess = empty($configurationRecord['begroups'])
                || $this->getBackendUser()->isAdmin()
                || UserService::hasGroupAccess(
                    $this->getBackendUser()->user['usergroup_cached_list'],
                    $configurationRecord['begroups']
                );
            if (! $hasAccess) {
                continue;
            }

            // Cast because the column may be NULL, which strict_types would
            // otherwise turn into a TypeError in the string functions.
            // TODO: Check if $pidOnlyList can be kept as Array instead of imploded
            $pidsOnly = (string) ($configurationRecord['pidsonly'] ?? '');
            $pidOnlyList = implode(',', GeneralUtility::trimExplode(',', $pidsOnly, true));

            // Skip configurations restricted to other pages.
            if ($pidsOnly !== '' && ! GeneralUtility::inList($pidOnlyList, $pageIdAsString)) {
                continue;
            }

            $key = $configurationRecord['name'];

            // don't overwrite previously defined paramSets
            if (isset($res[$key])) {
                continue;
            }

            /* @var $TSparserObject TypoScriptParser */
            $TSparserObject = GeneralUtility::makeInstance(TypoScriptParser::class);
            $TSparserObject->parse($configurationRecord['processing_instruction_parameters_ts']);

            $subCfg = [
                'procInstrFilter' => $configurationRecord['processing_instruction_filter'],
                'procInstrParams.' => $TSparserObject->setup,
                'baseUrl' => $configurationRecord['base_url'],
                'force_ssl' => (int) $configurationRecord['force_ssl'],
                'userGroups' => $configurationRecord['fegroups'],
                'exclude' => $configurationRecord['exclude'],
                'key' => $key,
            ];

            // Skip the record if the page is on its exclude list.
            if (in_array($pageId, $this->expandExcludeString($subCfg['exclude']), true)) {
                continue;
            }

            $paramParsed = GeneralUtility::explodeUrl2Array($configurationRecord['configuration']);
            $paramExpanded = $this->expandParameters($paramParsed, $pageId);

            $res[$key] = [
                'subCfg' => $subCfg,
                'paramParsed' => $paramParsed,
                'paramExpanded' => $paramExpanded,
                'URLs' => $this->compileUrls($paramExpanded, ['?id=' . $pageId]),
                'origin' => 'tx_crawler_configuration_' . $configurationRecord['uid'],
            ];
        }

        foreach ($GLOBALS['TYPO3_CONF_VARS']['EXTCONF']['crawler']['processUrls'] ?? [] as $func) {
            $params = [
                'res' => &$res,
            ];
            GeneralUtility::callUserFunction($func, $params, $this);
        }

        return $res;
    }
560
561
    /**
     * Finds the names of all configurations that apply to a page branch.
     *
     * The result lists the paramSets from the page TSconfig of $rootid
     * first, followed by the names of tx_crawler_configuration records
     * stored on any page of the rootline of $rootid or of the page tree
     * below it (down to $depth levels).
     *
     * TODO: Write Functional Tests
     */
    public function getConfigurationsForBranch(int $rootid, int $depth): array
    {
        $configurationNames = [];

        // paramSets from page TSconfig; only the "<name>." sub-arrays count,
        // and the trailing dot is not part of the name.
        $tsConfig = $this->getPageTSconfigForId($rootid);
        foreach ($tsConfig['tx_crawler.']['crawlerCfg.']['paramSets.'] ?? [] as $setKey => $setValue) {
            if (is_array($setValue)) {
                $configurationNames[] = substr($setKey, -1) === '.' ? substr($setKey, 0, -1) : $setKey;
            }
        }

        // Page IDs to search: the rootline first, then the tree below $rootid.
        $pageIds = [];
        foreach (BackendUtility::BEgetRootLine($rootid) as $rootLinePage) {
            $pageIds[] = $rootLinePage['uid'];
        }

        /* @var PageTreeView $pageTree */
        $pageTree = GeneralUtility::makeInstance(PageTreeView::class);
        $permsClause = $this->getBackendUser()->getPagePermsClause(Permission::PAGE_SHOW);
        $initClause = '';
        if (! empty($permsClause)) {
            $initClause = 'AND ' . $permsClause;
        }
        $pageTree->init($initClause);
        $pageTree->getTree($rootid, $depth, '');
        foreach ($pageTree->tree as $treeNode) {
            $pageIds[] = $treeNode['row']['uid'];
        }

        // Names of the configuration records stored on those pages.
        $queryBuilder = $this->getQueryBuilder('tx_crawler_configuration');
        $pidConstraint = $queryBuilder->expr()->in(
            'pid',
            $queryBuilder->createNamedParameter($pageIds, Connection::PARAM_INT_ARRAY)
        );
        $statement = $queryBuilder
            ->select('name')
            ->from('tx_crawler_configuration')
            ->where($pidConstraint)
            ->execute();

        while ($record = $statement->fetch()) {
            $configurationNames[] = $record['name'];
        }

        return $configurationNames;
    }
604
605
    /**
606
     * Will expand the parameters configuration to individual values. This follows a certain syntax of the value of each parameter.
607
     * Syntax of values:
608
     * - Basically: If the value is wrapped in [...] it will be expanded according to the following syntax, otherwise the value is taken literally
609
     * - Configuration is splitted by "|" and the parts are processed individually and finally added together
610
     * - For each configuration part:
611
     *         - "[int]-[int]" = Integer range, will be expanded to all values in between, values included, starting from low to high (max. 1000). Example "1-34" or "-40--30"
612
     *         - "_TABLE:[TCA table name];[_PID:[optional page id, default is current page]];[_ENABLELANG:1]" = Look up of table records from PID, filtering out deleted records. Example "_TABLE:tt_content; _PID:123"
613
     *        _ENABLELANG:1 picks only original records without their language overlays
614
     *         - Default: Literal value
615
     *
616
     * @param array $paramArray Array with key (GET var name) and values (value of GET var which is configuration for expansion)
617
     * @param integer $pid Current page ID
618
     * @return array
619
     *
620
     * TODO: Write Functional Tests
621
     */
622 10
    public function expandParameters($paramArray, $pid)
623
    {
624
        // Traverse parameter names:
625 10
        foreach ($paramArray as $p => $v) {
626 10
            $v = trim($v);
627
628
            // If value is encapsulated in square brackets it means there are some ranges of values to find, otherwise the value is literal
629 10
            if (strpos($v, '[') === 0 && substr($v, -1) === ']') {
630
                // So, find the value inside brackets and reset the paramArray value as an array.
631 10
                $v = substr($v, 1, -1);
632 10
                $paramArray[$p] = [];
633
634
                // Explode parts and traverse them:
635 10
                $parts = explode('|', $v);
636 10
                foreach ($parts as $pV) {
637
638
                    // Look for integer range: (fx. 1-34 or -40--30 // reads minus 40 to minus 30)
639 10
                    if (preg_match('/^(-?[0-9]+)\s*-\s*(-?[0-9]+)$/', trim($pV), $reg)) {
640 1
                        $reg = $this->swapIfFirstIsLargerThanSecond($reg);
641
642
                        // Traverse range, add values:
643
                        // Limit to size of range!
644 1
                        $runAwayBrake = 1000;
645 1
                        for ($a = $reg[1]; $a <= $reg[2]; $a++) {
646 1
                            $paramArray[$p][] = $a;
647 1
                            $runAwayBrake--;
648 1
                            if ($runAwayBrake <= 0) {
649
                                break;
650
                            }
651
                        }
652 9
                    } elseif (strpos(trim($pV), '_TABLE:') === 0) {
653
654
                        // Parse parameters:
655 6
                        $subparts = GeneralUtility::trimExplode(';', $pV);
656 6
                        $subpartParams = [];
657 6
                        foreach ($subparts as $spV) {
658 6
                            [$pKey, $pVal] = GeneralUtility::trimExplode(':', $spV);
659 6
                            $subpartParams[$pKey] = $pVal;
660
                        }
661
662
                        // Table exists:
663 6
                        if (isset($GLOBALS['TCA'][$subpartParams['_TABLE']])) {
664 6
                            $lookUpPid = isset($subpartParams['_PID']) ? intval($subpartParams['_PID']) : intval($pid);
665 6
                            $recursiveDepth = isset($subpartParams['_RECURSIVE']) ? intval($subpartParams['_RECURSIVE']) : 0;
666 6
                            $pidField = isset($subpartParams['_PIDFIELD']) ? trim($subpartParams['_PIDFIELD']) : 'pid';
667 6
                            $where = $subpartParams['_WHERE'] ?? '';
668 6
                            $addTable = $subpartParams['_ADDTABLE'] ?? '';
669
670 6
                            $fieldName = $subpartParams['_FIELD'] ? $subpartParams['_FIELD'] : 'uid';
671 6
                            if ($fieldName === 'uid' || $GLOBALS['TCA'][$subpartParams['_TABLE']]['columns'][$fieldName]) {
672 6
                                $queryBuilder = $this->getQueryBuilder($subpartParams['_TABLE']);
673
674 6
                                if ($recursiveDepth > 0) {
675
                                    /** @var QueryGenerator $queryGenerator */
676 2
                                    $queryGenerator = GeneralUtility::makeInstance(QueryGenerator::class);
677 2
                                    $pidList = $queryGenerator->getTreeList($lookUpPid, $recursiveDepth, 0, 1);
678 2
                                    $pidArray = GeneralUtility::intExplode(',', $pidList);
679
                                } else {
680 4
                                    $pidArray = [(string) $lookUpPid];
681
                                }
682
683 6
                                $queryBuilder->getRestrictions()
684 6
                                    ->removeAll()
685 6
                                    ->add(GeneralUtility::makeInstance(DeletedRestriction::class));
686
687
                                $queryBuilder
688 6
                                    ->select($fieldName)
689 6
                                    ->from($subpartParams['_TABLE'])
690 6
                                    ->where(
691 6
                                        $queryBuilder->expr()->in($pidField, $queryBuilder->createNamedParameter($pidArray, Connection::PARAM_INT_ARRAY)),
692 6
                                        $where
693
                                    );
694
695 6
                                if (! empty($addTable)) {
696
                                    // TODO: Check if this works as intended!
697
                                    $queryBuilder->add('from', $addTable);
698
                                }
699 6
                                $transOrigPointerField = $GLOBALS['TCA'][$subpartParams['_TABLE']]['ctrl']['transOrigPointerField'];
700
701 6
                                if ($subpartParams['_ENABLELANG'] && $transOrigPointerField) {
702
                                    $queryBuilder->andWhere(
703
                                        $queryBuilder->expr()->lte(
704
                                            $transOrigPointerField,
705
                                            0
706
                                        )
707
                                    );
708
                                }
709
710 6
                                $statement = $queryBuilder->execute();
711
712 6
                                $rows = [];
713 6
                                while ($row = $statement->fetch()) {
714 6
                                    $rows[$row[$fieldName]] = $row;
715
                                }
716
717 6
                                if (is_array($rows)) {
718 6
                                    $paramArray[$p] = array_merge($paramArray[$p], array_keys($rows));
719
                                }
720
                            }
721
                        }
722
                    } else {
723
                        // Just add value:
724 3
                        $paramArray[$p][] = $pV;
725
                    }
726
                    // Hook for processing own expandParameters place holder
727 10
                    if (is_array($GLOBALS['TYPO3_CONF_VARS']['SC_OPTIONS']['crawler/class.tx_crawler_lib.php']['expandParameters'])) {
728
                        $_params = [
729
                            'pObj' => &$this,
730
                            'paramArray' => &$paramArray,
731
                            'currentKey' => $p,
732
                            'currentValue' => $pV,
733
                            'pid' => $pid,
734
                        ];
735
                        foreach ($GLOBALS['TYPO3_CONF_VARS']['SC_OPTIONS']['crawler/class.tx_crawler_lib.php']['expandParameters'] as $_funcRef) {
736
                            GeneralUtility::callUserFunction($_funcRef, $_params, $this);
737
                        }
738
                    }
739
                }
740
741
                // Make unique set of values and sort array by key:
742 10
                $paramArray[$p] = array_unique($paramArray[$p]);
743 10
                ksort($paramArray);
744
            } else {
745
                // Set the literal value as only value in array:
746 3
                $paramArray[$p] = [$v];
747
            }
748
        }
749
750 10
        return $paramArray;
751
    }
752
753
    /**
     * Compiling URLs from parameter array (output of expandParameters())
     *
     * Every URL accumulated so far is combined with every value of the first
     * parameter in $paramArray; the method then recurses with the remaining
     * parameters. The number of URLs is therefore the product of the number of
     * values per key, capped per recursion level at getMaximumUrlsToCompile().
     * Values that are empty strings leave the URL untouched.
     *
     * @param array $paramArray Output of expandParameters(): Array with keys (GET var names) and for each an array of values
     * @param array $urls URLs accumulated in this array (for recursion)
     * @return array
     */
    public function compileUrls($paramArray, array $urls)
    {
        if (empty($paramArray)) {
            return $urls;
        }

        $varName = key($paramArray);
        $valueSet = array_shift($paramArray);

        // The limit is invariant for this call, so it is looked up only once.
        $maximumUrls = $this->getMaximumUrlsToCompile();
        // Array keys may be integers; rawurlencode() only accepts strings under strict_types.
        $encodedVarName = rawurlencode((string) $varName);

        // Traverse value set:
        $newUrls = [];
        foreach ($urls as $url) {
            foreach ($valueSet as $val) {
                if (count($newUrls) >= $maximumUrls) {
                    // Limit reached: no further combination can be added, stop both loops.
                    break 2;
                }

                $value = (string) $val;
                $newUrls[] = $url . ($value !== '' ? '&' . $encodedVarName . '=' . rawurlencode($value) : '');
            }
        }

        return $this->compileUrls($paramArray, $newUrls);
    }
780
781
    /************************************
782
     *
783
     * Crawler log
784
     *
785
     ************************************/
786
787
    /**
     * Inserts a call back entry into the crawler queue table (typically called
     * from hooks, see indexed search class "class.crawler.php").
     *
     * The call back reference is stored inside the JSON encoded parameters under
     * the key "_CALLBACKOBJ". When no schedule time is given, the current time
     * is used.
     *
     * @param int $setId Set ID
     * @param array $params Parameters to pass to call back function
     * @param string $callBack Call back object reference, eg. 'EXT:indexed_search/class.crawler.php:&tx_indexedsearch_crawler'
     * @param int $page_id Page ID to attach it to
     * @param int $schedule Time at which to activate
     */
    public function addQueueEntry_callBack($setId, $params, $callBack, $page_id = 0, $schedule = 0): void
    {
        // Anything that is not an array is replaced by an empty parameter set.
        $callBackParameters = is_array($params) ? $params : [];
        $callBackParameters['_CALLBACKOBJ'] = $callBack;

        // A schedule of 0 means "now".
        $scheduledTime = (int) $schedule;
        if (! $scheduledTime) {
            $scheduledTime = $this->getCurrentTime();
        }

        $queueRecord = [
            'page_id' => (int) $page_id,
            'parameters' => json_encode($callBackParameters),
            'scheduled' => $scheduledTime,
            'exec_time' => 0,
            'set_id' => (int) $setId,
            'result_data' => '',
        ];

        $queueConnection = GeneralUtility::makeInstance(ConnectionPool::class)
            ->getConnectionForTable('tx_crawler_queue');
        $queueConnection->insert('tx_crawler_queue', $queueRecord);
    }
816
817
    /************************************
818
     *
819
     * URL setting
820
     *
821
     ************************************/
822
823
    /**
     * Setting a URL for crawling:
     *
     * Builds the queue record for the URL and either registers it internally
     * only (when $this->registerQueueEntriesInternallyOnly is set) or inserts it
     * into the tx_crawler_queue table, unless an equal entry already exists.
     *
     * @param int $id Page ID
     * @param string $url Complete URL
     * @param array $subCfg Sub configuration array (from TS config)
     * @param int $tstamp Scheduled-time
     * @param string $configurationHash (optional) configuration hash
     * @param bool $skipInnerDuplicationCheck (optional) skip inner duplication check
     * @return bool TRUE if the URL was inserted into the queue table, FALSE if it was
     *              only registered internally or a duplicate entry was found
     */
    public function addUrl(
        $id,
        $url,
        array $subCfg,
        $tstamp,
        $configurationHash = '',
        $skipInnerDuplicationCheck = false
    ) {
        // Creating parameters:
        $parameters = [
            'url' => $url,
        ];

        // fe user group simulation (optional sub configuration keys are read
        // with a fallback so a missing key does not raise a notice/warning):
        $uGs = implode(',', array_unique(GeneralUtility::intExplode(',', $subCfg['userGroups'] ?? '', true)));
        if ($uGs) {
            $parameters['feUserGroupList'] = $uGs;
        }

        // Setting processing instructions
        $parameters['procInstructions'] = GeneralUtility::trimExplode(',', $subCfg['procInstrFilter'] ?? '');
        if (is_array($subCfg['procInstrParams.'] ?? null)) {
            $parameters['procInstrParams'] = $subCfg['procInstrParams.'];
        }

        // Compile value array:
        $parameters_serialized = json_encode($parameters);
        $fieldArray = [
            'page_id' => (int) $id,
            'parameters' => $parameters_serialized,
            'parameters_hash' => GeneralUtility::shortMD5($parameters_serialized),
            'configuration_hash' => $configurationHash,
            'scheduled' => $tstamp,
            'exec_time' => 0,
            'set_id' => (int) $this->setID,
            'result_data' => '',
            'configuration' => $subCfg['key'] ?? null,
        ];

        if ($this->registerQueueEntriesInternallyOnly) {
            // the entries will only be registered and not stored to the database
            $this->queueEntries[] = $fieldArray;

            return false;
        }

        if (! $skipInnerDuplicationCheck) {
            // check if there is already an equal entry
            $duplicates = $this->queueRepository->getDuplicateQueueItemsIfExists(
                (bool) $this->extensionSettings['enableTimeslot'],
                $tstamp,
                $this->getCurrentTime(),
                $fieldArray['page_id'],
                $fieldArray['parameters_hash']
            );

            if (! empty($duplicates)) {
                return false;
            }
        }

        GeneralUtility::makeInstance(ConnectionPool::class)
            ->getConnectionForTable('tx_crawler_queue')
            ->insert('tx_crawler_queue', $fieldArray);

        return true;
    }
905
906
    /**
     * Provides the current system time as a Unix timestamp.
     *
     * @return int
     */
    public function getCurrentTime()
    {
        $currentTimestamp = time();

        return $currentTimestamp;
    }
915
916
    /************************************
917
     *
918
     * URL reading
919
     *
920
     ************************************/
921
922
    /**
923
     * Read URL for single queue entry
924
     *
925
     * @param integer $queueId
926
     * @param boolean $force If set, will process even if exec_time has been set!
927
     * @return integer
928
     */
929 2
    public function readUrl($queueId, $force = false)
930
    {
931 2
        $queryBuilder = GeneralUtility::makeInstance(ConnectionPool::class)->getQueryBuilderForTable($this->tableName);
932 2
        $ret = 0;
933 2
        $this->logger->debug('crawler-readurl start ' . microtime(true));
934
        // Get entry:
935
        $queryBuilder
936 2
            ->select('*')
937 2
            ->from('tx_crawler_queue')
938 2
            ->where(
939 2
                $queryBuilder->expr()->eq('qid', $queryBuilder->createNamedParameter($queueId, PDO::PARAM_INT))
940
            );
941 2
        if (! $force) {
942
            $queryBuilder
943 2
                ->andWhere('exec_time = 0')
944 2
                ->andWhere('process_scheduled > 0');
945
        }
946 2
        $queueRec = $queryBuilder->execute()->fetch();
947
948 2
        if (! is_array($queueRec)) {
949
            return;
950
        }
951
952
        // Set exec_time to lock record:
953 2
        $field_array = ['exec_time' => $this->getCurrentTime()];
954
955 2
        if (isset($this->processID)) {
956
            //if mulitprocessing is used we need to store the id of the process which has handled this entry
957 2
            $field_array['process_id_completed'] = $this->processID;
958
        }
959
960 2
        GeneralUtility::makeInstance(ConnectionPool::class)->getConnectionForTable('tx_crawler_queue')
961 2
            ->update(
962 2
                'tx_crawler_queue',
963 2
                $field_array,
964 2
                ['qid' => (int) $queueId]
965
            );
966
967 2
        $result = $this->queueExecutor->executeQueueItem($queueRec, $this);
968 2
        if ($result['content'] === null) {
969
            $resultData = 'An errors happened';
970
        } else {
971
            /** @var JsonCompatibilityConverter $jsonCompatibilityConverter */
972 2
            $jsonCompatibilityConverter = GeneralUtility::makeInstance(JsonCompatibilityConverter::class);
973 2
            $resultData = $jsonCompatibilityConverter->convert($result['content']);
974
        }
975
976
        //atm there's no need to point to specific pollable extensions
977 2
        if (is_array($GLOBALS['TYPO3_CONF_VARS']['EXTCONF']['crawler']['pollSuccess'])) {
978
            foreach ($GLOBALS['TYPO3_CONF_VARS']['EXTCONF']['crawler']['pollSuccess'] as $pollable) {
979
                // only check the success value if the instruction is runnig
980
                // it is important to name the pollSuccess key same as the procInstructions key
981
                if (is_array($resultData['parameters']['procInstructions'])
982
                    && in_array(
983
                        $pollable,
984
                        $resultData['parameters']['procInstructions'], true
985
                    )
986
                ) {
987
                    if (! empty($resultData['success'][$pollable]) && $resultData['success'][$pollable]) {
988
                        $ret |= self::CLI_STATUS_POLLABLE_PROCESSED;
989
                    }
990
                }
991
            }
992
        }
993
994
        // Set result in log which also denotes the end of the processing of this entry.
995 2
        $field_array = ['result_data' => json_encode($result)];
996
997 2
        GeneralUtility::makeInstance(ConnectionPool::class)->getConnectionForTable('tx_crawler_queue')
998 2
            ->update(
999 2
                'tx_crawler_queue',
1000 2
                $field_array,
1001 2
                ['qid' => (int) $queueId]
1002
            );
1003
1004 2
        $this->logger->debug('crawler-readurl stop ' . microtime(true));
1005 2
        return $ret;
1006
    }
1007
1008
    /**
     * Reads the URL of a queue entry that has not been inserted yet.
     *
     * The entry is inserted with exec_time set to the current time, executed
     * through the queue executor, and the JSON encoded result is written back
     * to the result_data column of the inserted record.
     *
     * @param array $field_array Queue field array,
     *
     * @return array|bool|mixed|string Result as returned by the queue executor
     */
    public function readUrlFromArray($field_array)
    {
        // Setting exec_time locks the record:
        $field_array['exec_time'] = $this->getCurrentTime();

        $queueConnection = GeneralUtility::makeInstance(ConnectionPool::class)
            ->getConnectionForTable($this->tableName);
        $queueConnection->insert($this->tableName, $field_array);
        $insertedQueueId = $queueConnection->lastInsertId($this->tableName, 'qid');

        $executionResult = $this->queueExecutor->executeQueueItem($field_array, $this);

        // Storing the result also denotes the end of the processing of this entry.
        $queueConnection->update(
            $this->tableName,
            ['result_data' => json_encode($executionResult)],
            ['qid' => $insertedQueueId]
        );

        return $executionResult;
    }
1038
1039
    /*****************************
1040
     *
1041
     * Compiling URLs to crawl - tools
1042
     *
1043
     *****************************/
1044
1045
    /**
     * Builds the page tree below $id and collects the table rows produced by
     * drawURLs_addRowsForPage() for every page in it (including the pages of
     * mounted trees for mount point pages).
     *
     * The incoming arguments are stored on the instance; $this->duplicateTrack
     * and $this->downloadUrls are reset before the tree is traversed.
     *
     * @param int $id Root page id to start from.
     * @param int $depth Depth of tree, 0=only id-page, 1= on sublevel, 99 = infinite
     * @param int $scheduledTime Unix Time when the URL is timed to be visited when put in queue
     * @param int $reqMinute Number of requests per minute (creates the interleave between requests)
     * @param bool $submitCrawlUrls If set, submits the URLs to queue in database (real crawling)
     * @param bool $downloadCrawlUrls If set (and submitcrawlUrls is false) will fill $downloadUrls with entries)
     * @param array $incomingProcInstructions Array of processing instructions
     * @param array $configurationSelection Array of configuration keys
     * @return string Concatenated HTML table rows
     */
    public function getPageTreeAndUrls(
        $id,
        $depth,
        $scheduledTime,
        $reqMinute,
        $submitCrawlUrls,
        $downloadCrawlUrls,
        array $incomingProcInstructions,
        array $configurationSelection
    ) {
        $this->scheduledTime = $scheduledTime;
        $this->reqMinute = $reqMinute;
        $this->submitCrawlUrls = $submitCrawlUrls;
        $this->downloadCrawlUrls = $downloadCrawlUrls;
        $this->incomingProcInstructions = $incomingProcInstructions;
        $this->incomingConfigurationSelection = $configurationSelection;

        $this->duplicateTrack = [];
        $this->downloadUrls = [];

        // Drawing tree:
        /* @var PageTreeView $tree */
        $tree = GeneralUtility::makeInstance(PageTreeView::class);
        $perms_clause = $this->getBackendUser()->getPagePermsClause(Permission::PAGE_SHOW);
        $tree->init('AND ' . $perms_clause);

        $pageInfo = BackendUtility::readPageAccess($id, $perms_clause);
        if (is_array($pageInfo)) {
            // Set root row:
            $tree->tree[] = [
                'row' => $pageInfo,
                'HTML' => $this->iconFactory->getIconForRecord('pages', $pageInfo, Icon::SIZE_SMALL),
            ];
        }

        // Get branch beneath:
        if ($depth) {
            $tree->getTree($id, $depth, '');
        }

        // Traverse page tree:
        $code = '';

        foreach ($tree->tree as $data) {
            $this->MP = false;

            // recognize mount points
            // The doktype is cast before the strict comparison: a database row may
            // deliver the column as a numeric string, which would never be identical
            // to the integer constant.
            if ((int) $data['row']['doktype'] === PageRepository::DOKTYPE_MOUNTPOINT) {
                // NOTE(review): the code below reads $mountpage[0], i.e. it expects
                // getPage() to return a list of rows - confirm against the repository.
                $mountpage = $this->pageRepository->getPage($data['row']['uid']);

                // fetch mounted pages
                $this->MP = $mountpage[0]['mount_pid'] . '-' . $data['row']['uid'];

                $mountTree = GeneralUtility::makeInstance(PageTreeView::class);
                $mountTree->init('AND ' . $perms_clause);
                $mountTree->getTree($mountpage[0]['mount_pid'], $depth);

                foreach ($mountTree->tree as $mountData) {
                    $code .= $this->drawURLs_addRowsForPage(
                        $mountData['row'],
                        $mountData['HTML'] . BackendUtility::getRecordTitle('pages', $mountData['row'], true)
                    );
                }

                // replace page when mount_pid_ol is enabled
                if ($mountpage[0]['mount_pid_ol']) {
                    $data['row']['uid'] = $mountpage[0]['mount_pid'];
                } else {
                    // if the mount_pid_ol is not set the MP must not be used for the mountpoint page
                    $this->MP = false;
                }
            }

            $code .= $this->drawURLs_addRowsForPage(
                $data['row'],
                $data['HTML'] . BackendUtility::getRecordTitle('pages', $data['row'], true)
            );
        }

        return $code;
    }
1137
1138
    /**
     * Expands exclude string
     *
     * The string is a comma separated list of parts in the form "pid" or
     * "pid+depth". A part without "+" excludes only the page itself, a part
     * with "+" but without a depth uses a depth of 99. Results are cached per
     * exclude string, page trees per pid and depth, in function-level statics.
     *
     * @param string $excludeString Exclude string
     * @return array List of unique page ids (integers)
     */
    public function expandExcludeString($excludeString)
    {
        // internal static caches;
        static $expandedExcludeStringCache;
        static $treeCache;

        if (empty($expandedExcludeStringCache[$excludeString])) {
            $pidList = [];

            if (! empty($excludeString)) {
                /** @var PageTreeView $tree */
                $tree = GeneralUtility::makeInstance(PageTreeView::class);
                $tree->init('AND ' . $this->getBackendUser()->getPagePermsClause(Permission::PAGE_SHOW));

                $excludeParts = GeneralUtility::trimExplode(',', $excludeString);

                foreach ($excludeParts as $excludePart) {
                    // Pad to two elements so a part without "+" does not read a missing offset.
                    [$pid, $depth] = array_pad(GeneralUtility::trimExplode('+', $excludePart), 2, '');

                    // default is "page only" = "depth=0"
                    if (empty($depth)) {
                        $depth = strpos($excludePart, '+') !== false ? 99 : 0;
                    }

                    // Work with integers from here on: they are used as cache keys,
                    // compared numerically and handed to the tree as page id / depth.
                    $pid = (int) $pid;
                    $depth = (int) $depth;

                    $pidList[] = $pid;

                    if ($depth > 0) {
                        if (empty($treeCache[$pid][$depth])) {
                            $tree->reset();
                            $tree->getTree($pid, $depth);
                            $treeCache[$pid][$depth] = $tree->tree;
                        }

                        foreach ($treeCache[$pid][$depth] as $data) {
                            $pidList[] = (int) $data['row']['uid'];
                        }
                    }
                }
            }

            $expandedExcludeStringCache[$excludeString] = array_unique($pidList);
        }

        return $expandedExcludeStringCache[$excludeString];
    }
1189
1190
    /**
     * Create the rows for display of the page tree
     * For each page a number of rows are shown displaying GET variable configuration
     *
     * One table row is produced per crawler configuration of the page. Pages that
     * are excluded by a configuration, or that have no configuration at all, get a
     * "No entries" row instead.
     *
     * @param array $pageRow Page record; its uid is checked against the exclude list
     * @param string $pageTitle Content of the title cell; inserted into the markup as is
     * @return string HTML table rows
     */
    public function drawURLs_addRowsForPage(array $pageRow, string $pageTitle): string
    {
        $skipMessage = '';

        // Get list of configurations
        $configurations = $this->getUrlsForPageRow($pageRow, $skipMessage);
        $configurations = ConfigurationService::removeDisallowedConfigurations($this->incomingConfigurationSelection, $configurations);

        // Traverse parameter combinations:
        $c = 0;
        $content = '';
        if (! empty($configurations)) {
            foreach ($configurations as $confKey => $confArray) {
                // Array keys may be integers; htmlspecialchars() only accepts strings under strict_types.
                $confKeyLabel = htmlspecialchars((string) $confKey);

                // Title column:
                if (! $c) {
                    $titleClm = '<td rowspan="' . count($configurations) . '">' . $pageTitle . '</td>';
                } else {
                    $titleClm = '';
                }

                // expandExcludeString() returns integer page ids, so the uid has to be an
                // integer as well for the strict in_array() comparison to match.
                $excludedPageIds = $this->expandExcludeString($confArray['subCfg']['exclude'] ?? '');

                if (! in_array((int) $pageRow['uid'], $excludedPageIds, true)) {
                    // URL list:
                    $urlList = $this->urlListFromUrlArray(
                        $confArray,
                        $pageRow,
                        $this->scheduledTime,
                        $this->reqMinute,
                        $this->submitCrawlUrls,
                        $this->downloadCrawlUrls,
                        $this->duplicateTrack,
                        $this->downloadUrls,
                        // if empty the urls won't be filtered by processing instructions
                        $this->incomingProcInstructions
                    );

                    // Expanded parameters:
                    $paramExpanded = '';
                    $calcAccu = [];
                    $calcRes = 1;
                    foreach ($confArray['paramExpanded'] as $gVar => $gVal) {
                        $paramExpanded .= '
                            <tr>
                                <td>' . htmlspecialchars('&' . $gVar . '=') . '<br/>' .
                            '(' . count($gVal) . ')' .
                            '</td>
                                <td nowrap="nowrap">' . nl2br(htmlspecialchars(implode(chr(10), $gVal))) . '</td>
                            </tr>
                        ';
                        $calcRes *= count($gVal);
                        $calcAccu[] = count($gVal);
                    }
                    $paramExpanded = '<table>' . $paramExpanded . '</table>';
                    $paramExpanded .= 'Comb: ' . implode('*', $calcAccu) . '=' . $calcRes;

                    // Options
                    $optionValues = '';
                    if (! empty($confArray['subCfg']['userGroups'])) {
                        $optionValues .= 'User Groups: ' . $confArray['subCfg']['userGroups'] . '<br/>';
                    }
                    if (! empty($confArray['subCfg']['procInstrFilter'])) {
                        $optionValues .= 'ProcInstr: ' . $confArray['subCfg']['procInstrFilter'] . '<br/>';
                    }

                    // Compile row:
                    $content .= '
                        <tr>
                            ' . $titleClm . '
                            <td>' . $confKeyLabel . '</td>
                            <td>' . nl2br(htmlspecialchars(rawurldecode(trim(str_replace('&', chr(10) . '&', GeneralUtility::implodeArrayForUrl('', $confArray['paramParsed'])))))) . '</td>
                            <td>' . $paramExpanded . '</td>
                            <td nowrap="nowrap">' . $urlList . '</td>
                            <td nowrap="nowrap">' . $optionValues . '</td>
                            <td nowrap="nowrap">' . DebugUtility::viewArray($confArray['subCfg']['procInstrParams.'] ?? null) . '</td>
                        </tr>';
                } else {
                    $content .= '<tr>
                            ' . $titleClm . '
                            <td>' . $confKeyLabel . '</td>
                            <td colspan="5"><em>No entries</em> (Page is excluded in this configuration)</td>
                        </tr>';
                }

                $c++;
            }
        } else {
            $message = ! empty($skipMessage) ? ' (' . $skipMessage . ')' : '';

            // Compile row:
            $content .= '
                <tr>
                    <td>' . $pageTitle . '</td>
                    <td colspan="6"><em>No entries</em>' . $message . '</td>
                </tr>';
        }

        return $content;
    }
1293
1294
    /*****************************
1295
     *
1296
     * CLI functions
1297
     *
1298
     *****************************/
1299
1300
    /**
     * Running the functionality of the CLI (crawling URLs from queue)
     *
     * Cleans up the queue, fetches up to $countInARun records, assigns them to
     * the current process and reads their URLs one by one.
     *
     * @param int $countInARun Number of queue records to fetch for this run
     * @param int $sleepTime Pause after each processed record, in microseconds (passed to usleep())
     * @param int $sleepAfterFinish Pause after the batch, in seconds (passed to sleep())
     * @return int Bitmask built from the readUrl() results and the CLI_STATUS_PROCESSED /
     *             CLI_STATUS_ABORTED flags
     */
    public function CLI_run(int $countInARun, int $sleepTime, int $sleepAfterFinish): int
    {
        $result = 0;
        $counter = 0;

        // Clean up the queue
        $this->queueRepository->cleanupQueue();

        // Select entries:
        $rows = $this->queueRepository->fetchRecordsToBeCrawled($countInARun);

        if (! empty($rows)) {
            $quidList = [];

            foreach ($rows as $r) {
                $quidList[] = $r['qid'];
            }

            $processId = $this->CLI_buildProcessId();

            // save the number of assigned queue entries to determine how many have been processed later
            $numberOfAffectedRows = $this->queueRepository->updateProcessIdAndSchedulerForQueueIds($quidList, $processId);
            $this->processRepository->updateProcessAssignItemsCount($numberOfAffectedRows, $processId);

            if ($numberOfAffectedRows !== count($quidList)) {
                return ($result | self::CLI_STATUS_ABORTED);
            }

            foreach ($rows as $r) {
                // The cast keeps the bitmask an integer even if readUrl() yields no value.
                $result |= (int) $this->readUrl($r['qid']);

                $counter++;
                // Just to relax the system
                usleep($sleepTime);

                // if during the start and the current read url the cli has been disable we need to return from the function
                // mark the process NOT as ended.
                if ($this->crawler->isDisabled()) {
                    return ($result | self::CLI_STATUS_ABORTED);
                }

                if (! $this->processRepository->isProcessActive($this->CLI_buildProcessId())) {
                    // Logged through the logger: a CLI_debug() method is reported as not
                    // existing on this class.
                    $this->logger->debug('conflict / timeout (' . $this->CLI_buildProcessId() . ')');
                    $result |= self::CLI_STATUS_ABORTED;
                    // possible timeout
                    break;
                }
            }

            sleep($sleepAfterFinish);
        }

        if ($counter > 0) {
            $result |= self::CLI_STATUS_PROCESSED;
        }

        return $result;
    }
1361
1362
    /**
     * Tries to register a new process with the given id.
     *
     * A process record is inserted only while the number of active processes
     * whose ttl has not passed is below the configured "processLimit". Active
     * processes whose ttl has passed are treated as orphans and released, and
     * processes marked as deleted are removed.
     *
     * @param string $id identification string for the process
     * @return bool TRUE if the process record was inserted, FALSE if the limit was reached
     *              or the system process id could not be determined
     * @todo preemption might not be the most elegant way to clean up
     */
    public function CLI_checkAndAcquireNewProcess($id)
    {
        $systemProcessId = getmypid();
        if (! $systemProcessId) {
            return false;
        }

        $runningProcesses = $this->processRepository->findAllActive();
        $now = $this->getCurrentTime();

        // Split the active processes into still valid ones (counted) and expired ones (collected).
        $aliveCount = 0;
        $expiredProcessIds = [];

        /** @var Process $runningProcess */
        foreach ($runningProcesses as $runningProcess) {
            if ($runningProcess->getTtl() < $now) {
                $expiredProcessIds[] = $runningProcess->getProcessId();
                continue;
            }

            $aliveCount++;
        }

        // if there are less than allowed active processes then add a new one
        $acquired = $aliveCount < (int) $this->extensionSettings['processLimit'];
        if ($acquired) {
            $processRecord = [
                'process_id' => $id,
                'active' => 1,
                'ttl' => $now + (int) $this->extensionSettings['processMaxRunTime'],
                'system_process_id' => $systemProcessId,
            ];

            GeneralUtility::makeInstance(ConnectionPool::class)
                ->getConnectionForTable('tx_crawler_process')
                ->insert('tx_crawler_process', $processRecord);
        }

        $this->processRepository->deleteProcessesMarkedAsDeleted();
        $this->CLI_releaseProcesses($expiredProcessIds);

        return $acquired;
    }
1413
1414
    /**
     * Release processes and the queue entries reserved by them.
     *
     * Performed in this order:
     *  1. Rows of $this->tableName whose process_id belongs to an inactive row of
     *     tx_crawler_process get process_scheduled = 0 and process_id = ''.
     *  2. Inactive rows of tx_crawler_process whose process_id occurs in
     *     tx_crawler_queue with exec_time = 0 get system_process_id = 0.
     *  3. The given ids are passed to the process repository
     *     (markRequestedProcessesAsNotActive) and to the queue repository
     *     (unsetProcessScheduledAndProcessIdForQueueEntries).
     *
     * @param mixed $releaseIds string with a single process-id or array with multiple process-ids
     * @return bool false if there was nothing to release, true otherwise
     */
    public function CLI_releaseProcesses($releaseIds)
    {
        if (! is_array($releaseIds)) {
            $releaseIds = [$releaseIds];
        }

        if (empty($releaseIds)) {
            // nothing to release, so do not even create a query builder
            return false;
        }

        // some kind of 2nd chance algo - this way you need at least 2 processes to have a real cleanup
        // this ensures that a single process can't mess up the entire process table

        // Detach queue entries from processes that are already marked as not active.
        // A dedicated query builder is used per statement so that no query parts or
        // bound parameters leak from one statement into the next.
        $this->getQueryBuilder($this->tableName)
            ->update($this->tableName, 'q')
            ->where(
                'q.process_id IN(SELECT p.process_id FROM tx_crawler_process as p WHERE p.active = 0)'
            )
            ->set('q.process_scheduled', 0)
            ->set('q.process_id', '')
            ->execute();

        // NOTE(review): the historic comment here said "mark all processes as deleted which have
        // no waiting queue-entries and which are not active", but this statement resets
        // system_process_id for inactive processes that DO appear in queue rows with
        // exec_time = 0. Not entirely sure this is equivalent to the previous version - confirm.
        $processQueryBuilder = $this->getQueryBuilder('tx_crawler_process');
        $processQueryBuilder
            ->update('tx_crawler_process')
            ->where(
                $processQueryBuilder->expr()->eq('active', 0),
                'process_id IN(SELECT q.process_id FROM tx_crawler_queue as q WHERE q.exec_time = 0)'
            )
            ->set('system_process_id', 0)
            ->execute();

        $this->processRepository->markRequestedProcessesAsNotActive($releaseIds);
        $this->queueRepository->unsetProcessScheduledAndProcessIdForQueueEntries($releaseIds);

        return true;
    }
1464
1465
    /**
     * Returns the id of the current process, generating it on first use.
     *
     * The id is derived once from the current microtime via GeneralUtility::shortMD5()
     * and kept in $this->processID, so repeated calls return the same value.
     *
     * @return string the ID
     */
    public function CLI_buildProcessId()
    {
        if ($this->processID) {
            return $this->processID;
        }

        $this->processID = GeneralUtility::shortMD5(microtime(true));

        return $this->processID;
    }
1477
1478
    /**
     * Builds an md5 hash over the serialized configuration array.
     *
     * The keys "paramExpanded" and "URLs" are left out of the hash.
     *
     * @return string
     */
    protected function getConfigurationHash(array $configuration)
    {
        $ignoredKeys = [
            'paramExpanded' => true,
            'URLs' => true,
        ];
        // array_diff_key() keeps the remaining keys and their order untouched.
        $relevantConfiguration = array_diff_key($configuration, $ignoredKeys);

        return md5(serialize($relevantConfiguration));
    }
1489
1490 1
    /**
     * Makes sure that $reg[1] is not larger than $reg[2] by exchanging both entries if needed.
     *
     * All other entries of the array are returned unchanged.
     *
     * @param array $reg array holding the two values to compare at index 1 and 2
     * @return array the array with index 1 and 2 in ascending order
     */
    protected function swapIfFirstIsLargerThanSecond(array $reg): array
    {
        if ($reg[1] > $reg[2]) {
            // First is larger than last: exchange both values in place.
            [$reg[1], $reg[2]] = [$reg[2], $reg[1]];
        }

        return $reg;
    }
1501
1502 5
    /**
     * Returns the value currently stored in the maximumUrlsToCompile property.
     */
    private function getMaximumUrlsToCompile(): int
    {
        $limit = $this->maximumUrlsToCompile;

        return $limit;
    }
1506
1507
    /**
     * Returns the backend user, taking it from $GLOBALS['BE_USER'] on first use.
     *
     * Backend authentication is initialized on every call so that the _cli_ user is loaded.
     *
     * @return BackendUserAuthentication
     */
    private function getBackendUser()
    {
        // Make sure the _cli_ user is loaded
        Bootstrap::initializeBackendAuthentication();

        // Keep an already stored user; fall back to the global one otherwise.
        $this->backendUser = $this->backendUser ?? $GLOBALS['BE_USER'];

        return $this->backendUser;
    }
1519
1520
    /**
     * Creates a query builder for the given table via the TYPO3 connection pool.
     *
     * @param string $table name of the database table
     * @return QueryBuilder
     */
    private function getQueryBuilder(string $table)
    {
        $connectionPool = GeneralUtility::makeInstance(ConnectionPool::class);

        return $connectionPool->getQueryBuilderForTable($table);
    }
1529
}
1530