Passed
Push — wip/remove-deprecations-for-v1... ( f97f5b )
by Tomas Norre
05:15
created

CrawlerController::getLogEntriesForSetId()   B

Complexity

Conditions 6
Paths 9

Size

Total Lines 40
Code Lines 29

Duplication

Lines 0
Ratio 0 %

Code Coverage

Tests 28
CRAP Score 6

Importance

Changes 0
Metric Value
cc 6
eloc 29
c 0
b 0
f 0
nc 9
nop 5
dl 0
loc 40
ccs 28
cts 28
cp 1
crap 6
rs 8.8337
1
<?php
2
3
declare(strict_types=1);
4
5
namespace AOE\Crawler\Controller;
6
7
/***************************************************************
8
 *  Copyright notice
9
 *
10
 *  (c) 2020 AOE GmbH <[email protected]>
11
 *
12
 *  All rights reserved
13
 *
14
 *  This script is part of the TYPO3 project. The TYPO3 project is
15
 *  free software; you can redistribute it and/or modify
16
 *  it under the terms of the GNU General Public License as published by
17
 *  the Free Software Foundation; either version 3 of the License, or
18
 *  (at your option) any later version.
19
 *
20
 *  The GNU General Public License can be found at
21
 *  http://www.gnu.org/copyleft/gpl.html.
22
 *
23
 *  This script is distributed in the hope that it will be useful,
24
 *  but WITHOUT ANY WARRANTY; without even the implied warranty of
25
 *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
26
 *  GNU General Public License for more details.
27
 *
28
 *  This copyright notice MUST APPEAR in all copies of the script!
29
 ***************************************************************/
30
31
use AOE\Crawler\Configuration\ExtensionConfigurationProvider;
32
use AOE\Crawler\Converter\JsonCompatibilityConverter;
33
use AOE\Crawler\Crawler;
34
use AOE\Crawler\CrawlStrategy\CrawlStrategyFactory;
35
use AOE\Crawler\Domain\Model\Process;
36
use AOE\Crawler\Domain\Repository\ConfigurationRepository;
37
use AOE\Crawler\Domain\Repository\ProcessRepository;
38
use AOE\Crawler\Domain\Repository\QueueRepository;
39
use AOE\Crawler\QueueExecutor;
40
use AOE\Crawler\Service\ConfigurationService;
41
use AOE\Crawler\Service\UrlService;
42
use AOE\Crawler\Service\UserService;
43
use AOE\Crawler\Value\QueueFilter;
44
use PDO;
45
use Psr\Http\Message\UriInterface;
46
use Psr\Log\LoggerAwareInterface;
47
use Psr\Log\LoggerAwareTrait;
48
use TYPO3\CMS\Backend\Tree\View\PageTreeView;
49
use TYPO3\CMS\Backend\Utility\BackendUtility;
50
use TYPO3\CMS\Core\Authentication\BackendUserAuthentication;
51
use TYPO3\CMS\Core\Compatibility\PublicMethodDeprecationTrait;
52
use TYPO3\CMS\Core\Compatibility\PublicPropertyDeprecationTrait;
53
use TYPO3\CMS\Core\Core\Bootstrap;
54
use TYPO3\CMS\Core\Core\Environment;
55
use TYPO3\CMS\Core\Database\Connection;
56
use TYPO3\CMS\Core\Database\ConnectionPool;
57
use TYPO3\CMS\Core\Database\Query\QueryBuilder;
58
use TYPO3\CMS\Core\Database\Query\Restriction\DeletedRestriction;
59
use TYPO3\CMS\Core\Database\QueryGenerator;
60
use TYPO3\CMS\Core\Domain\Repository\PageRepository;
61
use TYPO3\CMS\Core\Exception\SiteNotFoundException;
62
use TYPO3\CMS\Core\Imaging\Icon;
63
use TYPO3\CMS\Core\Imaging\IconFactory;
64
use TYPO3\CMS\Core\Routing\InvalidRouteArgumentsException;
65
use TYPO3\CMS\Core\Site\Entity\Site;
66
use TYPO3\CMS\Core\Type\Bitmask\Permission;
67
use TYPO3\CMS\Core\TypoScript\Parser\TypoScriptParser;
68
use TYPO3\CMS\Core\Utility\DebugUtility;
69
use TYPO3\CMS\Core\Utility\GeneralUtility;
70
use TYPO3\CMS\Core\Utility\MathUtility;
71
use TYPO3\CMS\Extbase\Object\ObjectManager;
72
73
/**
74
 * Class CrawlerController
75
 *
76
 * @package AOE\Crawler\Controller
77
 */
78
class CrawlerController implements LoggerAwareInterface
79
{
80
    use LoggerAwareTrait;
81
    use PublicMethodDeprecationTrait;
0 ignored issues
show
Bug introduced by
The trait TYPO3\CMS\Core\Compatibi...cMethodDeprecationTrait requires the property $deprecatedPublicMethods which is not provided by AOE\Crawler\Controller\CrawlerController.
Loading history...
82
    use PublicPropertyDeprecationTrait;
0 ignored issues
show
Bug introduced by
The trait TYPO3\CMS\Core\Compatibi...ropertyDeprecationTrait requires the property $deprecatedPublicProperties which is not provided by AOE\Crawler\Controller\CrawlerController.
Loading history...
83
84
    public const CLI_STATUS_NOTHING_PROCCESSED = 0;
85
86
    //queue not empty
87
    public const CLI_STATUS_REMAIN = 1;
88
89
    //(some) queue items were processed
90
    public const CLI_STATUS_PROCESSED = 2;
91
92
    //instance didn't finish
93
    public const CLI_STATUS_ABORTED = 4;
94
95
    public const CLI_STATUS_POLLABLE_PROCESSED = 8;
96
97
    /**
98
     * @var integer
99
     */
100
    public $setID = 0;
101
102
    /**
103
     * @var string
104
     */
105
    public $processID = '';
106
107
    /**
108
     * @var array
109
     */
110
    public $duplicateTrack = [];
111
112
    /**
113
     * @var array
114
     */
115
    public $downloadUrls = [];
116
117
    /**
118
     * @var array
119
     */
120
    public $incomingProcInstructions = [];
121
122
    /**
123
     * @var array
124
     */
125
    public $incomingConfigurationSelection = [];
126
127
    /**
128
     * @var bool
129
     */
130
    public $registerQueueEntriesInternallyOnly = false;
131
132
    /**
133
     * @var array
134
     */
135
    public $queueEntries = [];
136
137
    /**
138
     * @var array
139
     */
140
    public $urlList = [];
141
142
    /**
143
     * @var array
144
     */
145
    public $extensionSettings = [];
146
147
    /**
148
     * Mount Point
149
     *
150
     * @var bool
151
     * Todo: Check what this is used for and adjust the type hint or code, as bool doesn't match the current code.
152
     */
153
    public $MP = false;
154
155
    /**
156
     * @var QueueRepository
157
     */
158
    protected $queueRepository;
159
160
    /**
161
     * @var ProcessRepository
162
     */
163
    protected $processRepository;
164
165
    /**
166
     * @var ConfigurationRepository
167
     */
168
    protected $configurationRepository;
169
170
    /**
171
     * @var string
172
     */
173
    protected $tableName = 'tx_crawler_queue';
174
175
    /**
176
     * @var QueueExecutor
177
     */
178
    protected $queueExecutor;
179
180
    /**
181
     * @var int
182
     */
183
    protected $maximumUrlsToCompile = 10000;
184
185
    /**
186
     * @var IconFactory
187
     */
188
    protected $iconFactory;
189
190
    /**
191
     * @var BackendUserAuthentication|null
192
     */
193
    private $backendUser;
194
195
    /**
196
     * @var integer
197
     */
198
    private $scheduledTime = 0;
199
200
    /**
201
     * @var integer
202
     */
203
    private $reqMinute = 0;
204
205
    /**
206
     * @var bool
207
     */
208
    private $submitCrawlUrls = false;
209
210
    /**
211
     * @var bool
212
     */
213
    private $downloadCrawlUrls = false;
214
215
    /**
216
     * @var PageRepository
217
     */
218
    private $pageRepository;
219
220
    /**
221
     * @var Crawler
222
     */
223
    private $crawler;
224
225
    /************************************
226
     *
227
     * Getting URLs based on Page TSconfig
228
     *
229
     ************************************/
230
231 19
    /**
     * Wires up the repositories and services used by the controller and
     * normalises the extension settings so that later code can rely on
     * 'countInARun', 'processLimit' and the URL compile limit being usable.
     */
    public function __construct()
    {
        $objectManager = GeneralUtility::makeInstance(ObjectManager::class);
        $crawlStrategyFactory = GeneralUtility::makeInstance(CrawlStrategyFactory::class);
        $this->queueRepository = $objectManager->get(QueueRepository::class);
        $this->processRepository = $objectManager->get(ProcessRepository::class);
        $this->configurationRepository = $objectManager->get(ConfigurationRepository::class);
        $this->pageRepository = GeneralUtility::makeInstance(PageRepository::class);
        $this->queueExecutor = GeneralUtility::makeInstance(QueueExecutor::class, $crawlStrategyFactory);
        $this->iconFactory = GeneralUtility::makeInstance(IconFactory::class);
        $this->crawler = GeneralUtility::makeInstance(Crawler::class);

        // NOTE(review): $processFilename is not among the properties declared in the
        // visible part of this class, so this creates a dynamic property - consider declaring it.
        $this->processFilename = Environment::getVarPath() . '/lock/tx_crawler.proc';

        /** @var ExtensionConfigurationProvider $configurationProvider */
        $configurationProvider = GeneralUtility::makeInstance(ExtensionConfigurationProvider::class);
        $settings = $configurationProvider->getExtensionConfiguration();
        $this->extensionSettings = is_array($settings) ? $settings : [];

        // Set defaults. Missing keys are read as 0 so that the fallbacks below
        // apply without raising "undefined index" warnings.
        if (MathUtility::convertToPositiveInteger($this->extensionSettings['countInARun'] ?? 0) === 0) {
            $this->extensionSettings['countInARun'] = 100;
        }

        $this->extensionSettings['processLimit'] = MathUtility::forceIntegerInRange(
            $this->extensionSettings['processLimit'] ?? 0,
            1,
            99,
            1
        );
        $this->setMaximumUrlsToCompile(
            MathUtility::forceIntegerInRange(
                $this->extensionSettings['maxCompileUrls'] ?? 0,
                1,
                1000000000,
                10000
            )
        );
    }
258
259 23
    /**
     * Overrides the stored upper limit for the number of URLs to compile.
     *
     * @param int $maximumUrlsToCompile New limit; stored as given, no range check is applied here
     */
    public function setMaximumUrlsToCompile(int $maximumUrlsToCompile): void
    {
        $this->maximumUrlsToCompile = $maximumUrlsToCompile;
    }
263
264
    /**
     * Replaces the extension settings as a whole
     * (unserialized counterpart of $TYPO3_CONF_VARS['EXT']['extConf']['crawler']).
     *
     * The array is stored as given; the defaults applied in the constructor
     * are not re-applied here.
     *
     * @param array $extensionSettings Complete settings array
     */
    public function setExtensionSettings(array $extensionSettings): void
    {
        $this->extensionSettings = $extensionSettings;
    }
271
272
    /**
     * Check if the given page should be crawled
     *
     * Checks, in order: hidden pages (unless 'crawlHiddenPages' is enabled), the
     * hard-coded list of disallowed doktypes, doktypes excluded through
     * $GLOBALS['TYPO3_CONF_VARS']['EXTCONF']['crawler']['excludeDoktype'] and finally
     * the 'pageVeto' hooks.
     *
     * @param array $pageRow Page record; the 'hidden' and 'doktype' columns are inspected
     * @return false|string false if the page should be crawled (not excluded), otherwise the message explaining why it is skipped
     */
    public function checkIfPageShouldBeSkipped(array $pageRow)
    {
        // if page is hidden (missing setting / column is treated as "not set")
        if (empty($this->extensionSettings['crawlHiddenPages']) && ! empty($pageRow['hidden'])) {
            return 'Because page is hidden';
        }

        if (GeneralUtility::inList('3,4,199,254,255', $pageRow['doktype'])) {
            return 'Because doktype is not allowed';
        }

        foreach ($GLOBALS['TYPO3_CONF_VARS']['EXTCONF']['crawler']['excludeDoktype'] ?? [] as $key => $doktypeList) {
            if (GeneralUtility::inList($doktypeList, $pageRow['doktype'])) {
                return 'Doktype was excluded by "' . $key . '"';
            }
        }

        // veto hook
        foreach ($GLOBALS['TYPO3_CONF_VARS']['EXTCONF']['crawler']['pageVeto'] ?? [] as $key => $func) {
            $params = [
                'pageRow' => $pageRow,
            ];
            // expects "false" if page is ok and "true" or a skipMessage if this page should _not_ be crawled
            $veto = GeneralUtility::callUserFunction($func, $params, $this);
            if ($veto !== false) {
                if (is_string($veto)) {
                    return $veto;
                }
                // Hooks registered without an explicit key have an integer key; the cast
                // keeps htmlspecialchars() from throwing a TypeError under strict_types.
                return 'Veto from hook "' . htmlspecialchars((string) $key) . '"';
            }
        }

        return false;
    }
311
312
    /**
     * Wrapper method for getUrlsForPageId()
     * It returns an array of configurations and no urls!
     *
     * @param array $pageRow Page record with at least dok-type and uid columns.
     * @param string $skipMessage Filled by reference: the reason the page is skipped, or '' if it is not skipped
     * @return array Configurations for the page; empty if the uid is not an integer or the page is skipped
     * @see getUrlsForPageId()
     */
    public function getUrlsForPageRow(array $pageRow, &$skipMessage = '')
    {
        $pageUid = $pageRow['uid'];

        // Only real integers are accepted as page uid
        if (! is_int($pageUid)) {
            $skipMessage = 'PageUid ' . $pageUid . ' was not an integer';
            return [];
        }

        $skipReason = $this->checkIfPageShouldBeSkipped($pageRow);
        if ($skipReason !== false) {
            $skipMessage = $skipReason;
            return [];
        }

        $configurations = $this->getUrlsForPageId($pageUid);
        $skipMessage = '';

        return $configurations;
    }
339
340
    /**
     * Creates a list of URLs from input array (and submits them to queue if asked for)
     * See Web > Info module script + "indexed_search"'s crawler hook-client using this!
     *
     * @param array $vv Information about URLs from pageRow to crawl.
     * @param array $pageRow Page row
     * @param int $scheduledTime Unix time to schedule indexing to, typically time()
     * @param int $reqMinute Number of requests per minute (creates the interleave between requests); 0 disables the interleave
     * @param bool $submitCrawlUrls If set, submits the URLs to queue
     * @param bool $downloadCrawlUrls If set (and submitcrawlUrls is false) will fill $downloadUrls with entries)
     * @param array $duplicateTrack Array which is passed by reference and contains an id per url to ensure we will not crawl duplicates
     * @param array $downloadUrls Array which will be filled with URLS for download if flag is set.
     * @param array $incomingProcInstructions Array of processing instructions
     * @return string List of URLs (meant for display in backend module)
     */
    public function urlListFromUrlArray(
        array $vv,
        array $pageRow,
        $scheduledTime,
        $reqMinute,
        $submitCrawlUrls,
        $downloadCrawlUrls,
        array &$duplicateTrack,
        array &$downloadUrls,
        array $incomingProcInstructions
    ) {
        if (! is_array($vv['URLs'] ?? null)) {
            return 'ERROR - no URL generated';
        }
        $urlLog = [];
        $pageId = (int) $pageRow['uid'];
        $configurationHash = $this->getConfigurationHash($vv);
        $skipInnerCheck = $this->queueRepository->noUnprocessedQueueEntriesForPageWithConfigurationHashExist($pageId, $configurationHash);

        $urlService = new UrlService();

        // Sub configurations coming from page TSconfig do not necessarily carry these keys
        $procInstrFilter = $vv['subCfg']['procInstrFilter'] ?? '';
        $userGroups = $vv['subCfg']['userGroups'] ?? '';

        // Seconds between two requests; a zero rate would otherwise divide by zero
        $secondsPerRequest = (float) $reqMinute === 0.0 ? 0 : 60 / $reqMinute;

        foreach ($vv['URLs'] as $urlQuery) {
            if (! $this->drawURLs_PIfilter($procInstrFilter, $incomingProcInstructions)) {
                continue;
            }
            $url = (string) $urlService->getUrlFromPageAndQueryParameters(
                $pageId,
                $urlQuery,
                $vv['subCfg']['baseUrl'] ?? null,
                $vv['subCfg']['force_ssl'] ?? 0
            );

            // Create key by which to determine unique-ness:
            $uKey = $url . '|' . $userGroups . '|' . $procInstrFilter;

            if (isset($duplicateTrack[$uKey])) {
                // if the url key is registered just display it and do not resubmit it
                $urlLog[] = '<em><span class="text-muted">' . htmlspecialchars($url) . '</span></em>';
            } else {
                // Scheduled time, rounded down to the full minute:
                $schTime = $scheduledTime + round(count($duplicateTrack) * $secondsPerRequest);
                $schTime = intval($schTime / 60) * 60;
                $formattedDate = BackendUtility::datetime($schTime);
                $this->urlList[] = '[' . $formattedDate . '] ' . $url;
                $urlList = '[' . $formattedDate . '] ' . htmlspecialchars($url);

                // Submit for crawling!
                if ($submitCrawlUrls) {
                    $added = $this->addUrl(
                        $pageId,
                        $url,
                        $vv['subCfg'],
                        $scheduledTime,
                        $configurationHash,
                        $skipInnerCheck
                    );
                    if ($added === false) {
                        $urlList .= ' (URL already existed)';
                    }
                } elseif ($downloadCrawlUrls) {
                    $downloadUrls[$url] = $url;
                }
                $urlLog[] = $urlList;
            }
            $duplicateTrack[$uKey] = true;
        }

        return implode('<br>', $urlLog);
    }
424
425
    /**
     * Tells whether a processing instruction list matches the incoming instructions.
     *
     * An empty list of incoming instructions accepts everything; otherwise at
     * least one incoming instruction must be contained in the comma list $piString.
     *
     * @param string $piString Comma separated list of processing instructions to test against
     * @param array $incomingProcInstructions Processing instructions
     * @return boolean
     */
    public function drawURLs_PIfilter($piString, array $incomingProcInstructions)
    {
        // Nothing requested means nothing is filtered out
        $isAccepted = $incomingProcInstructions === [];

        foreach ($incomingProcInstructions as $instruction) {
            if (GeneralUtility::inList($piString, $instruction)) {
                $isAccepted = true;
                break;
            }
        }

        return $isAccepted;
    }
445
446 5
    /**
     * Returns the page TSconfig for a page and lets the 'getPageTSconfigForId'
     * hooks modify it.
     *
     * If $this->MP is set, the TSconfig is read for the id found after the "-"
     * in $this->MP instead of for $id.
     *
     * @param int $id Page id
     * @return array Page TSconfig, possibly altered by hooks (they receive it by reference)
     */
    public function getPageTSconfigForId($id): array
    {
        if (! $this->MP) {
            $pageTSconfig = BackendUtility::getPagesTSconfig($id);
        } else {
            // TODO: $this->MP is declared as bool but split like a string here; the type should be clarified.
            // The casts keep explode() from raising a TypeError under strict_types and
            // avoid reading a missing offset when there is no "-" in the value.
            $mountPointParts = explode('-', (string) $this->MP);
            $pageTSconfig = BackendUtility::getPagesTSconfig((int) ($mountPointParts[1] ?? 0));
        }

        // Call a hook to alter configuration (the registry key is optional)
        $hooks = $GLOBALS['TYPO3_CONF_VARS']['EXTCONF']['crawler']['getPageTSconfigForId'] ?? null;
        if (is_array($hooks)) {
            $params = [
                'pageId' => $id,
                'pageTSConfig' => &$pageTSconfig,
            ];
            foreach ($hooks as $userFunc) {
                GeneralUtility::callUserFunction($userFunc, $params, $this);
            }
        }
        return $pageTSconfig;
    }
468
469
    /**
     * This method returns an array of configurations.
     * Adds no urls!
     *
     * Configurations are collected from two sources: first the 'paramSets.' of
     * the page TSconfig, then the tx_crawler_configuration records found via the
     * configuration repository. A record never overwrites a TSconfig set with the
     * same key. Finally the 'processUrls' hooks may alter the result by reference.
     *
     * @param int $pageId Page to collect the configurations for
     * @return array Configuration key => array with 'subCfg', 'paramParsed', 'paramExpanded', 'URLs' and 'origin'
     */
    public function getUrlsForPageId(int $pageId): array
    {
        // Get page TSconfig for page ID
        $pageTSconfig = $this->getPageTSconfigForId($pageId);

        $res = [];

        // Fetch Crawler Configuration from pageTSconfig
        $crawlerCfg = $pageTSconfig['tx_crawler.']['crawlerCfg.']['paramSets.'] ?? [];
        foreach ($crawlerCfg as $key => $values) {
            if (! is_array($values)) {
                continue;
            }
            $key = str_replace('.', '', $key);
            // Sub configuration for a single configuration string:
            $subCfg = (array) ($crawlerCfg[$key . '.'] ?? []);
            $subCfg['key'] = $key;

            if (strcmp($subCfg['procInstrFilter'] ?? '', '')) {
                $subCfg['procInstrFilter'] = implode(',', GeneralUtility::trimExplode(',', $subCfg['procInstrFilter']));
            }

            // 'pidsOnly' is optional; a missing value means "not page-specific"
            $pidsOnly = (string) ($subCfg['pidsOnly'] ?? '');
            $pidOnlyList = implode(',', GeneralUtility::trimExplode(',', $pidsOnly, true));

            // process configuration if it is not page-specific or if the specific page is the current page:
            // TODO: Check if $pidOnlyList can be kept as Array instead of imploded
            if ($pidsOnly !== '' && ! GeneralUtility::inList($pidOnlyList, (string) $pageId)) {
                continue;
            }

            // Explode, process etc.:
            $res[$key] = [];
            $res[$key]['subCfg'] = $subCfg;
            // The parameter string itself may be absent when only the "key." sub array is configured
            $res[$key]['paramParsed'] = GeneralUtility::explodeUrl2Array($crawlerCfg[$key] ?? '');
            $res[$key]['paramExpanded'] = $this->expandParameters($res[$key]['paramParsed'], $pageId);
            $res[$key]['origin'] = 'pagets';

            // recognize MP value
            if (! $this->MP) {
                $res[$key]['URLs'] = $this->compileUrls($res[$key]['paramExpanded'], ['?id=' . $pageId]);
            } else {
                $res[$key]['URLs'] = $this->compileUrls($res[$key]['paramExpanded'], ['?id=' . $pageId . '&MP=' . $this->MP]);
            }
        }

        // Get configuration from tx_crawler_configuration records up the rootline
        $crawlerConfigurations = $this->configurationRepository->getCrawlerConfigurationRecordsFromRootLine($pageId);
        foreach ($crawlerConfigurations as $configurationRecord) {
            // check access to the configuration record
            $hasAccess = empty($configurationRecord['begroups'])
                || $this->getBackendUser()->isAdmin()
                || UserService::hasGroupAccess($this->getBackendUser()->user['usergroup_cached_list'], $configurationRecord['begroups']);
            if (! $hasAccess) {
                continue;
            }

            // Cast like in the TSconfig branch above, so strict_types cannot trip over a non-string value
            $pidsOnly = (string) ($configurationRecord['pidsonly'] ?? '');
            $pidOnlyList = implode(',', GeneralUtility::trimExplode(',', $pidsOnly, true));

            // process configuration if it is not page-specific or if the specific page is the current page:
            // TODO: Check if $pidOnlyList can be kept as Array instead of imploded
            if ($pidsOnly !== '' && ! GeneralUtility::inList($pidOnlyList, (string) $pageId)) {
                continue;
            }

            $key = $configurationRecord['name'];

            // don't overwrite previously defined paramSets
            if (isset($res[$key])) {
                continue;
            }

            /* @var $TSparserObject TypoScriptParser */
            $TSparserObject = GeneralUtility::makeInstance(TypoScriptParser::class);
            $TSparserObject->parse($configurationRecord['processing_instruction_parameters_ts']);

            $subCfg = [
                'procInstrFilter' => $configurationRecord['processing_instruction_filter'],
                'procInstrParams.' => $TSparserObject->setup,
                'baseUrl' => $configurationRecord['base_url'],
                'force_ssl' => (int) $configurationRecord['force_ssl'],
                'userGroups' => $configurationRecord['fegroups'],
                'exclude' => $configurationRecord['exclude'],
                'key' => $key,
            ];

            // Skip the record if the current page is part of its exclude list
            if (in_array($pageId, $this->expandExcludeString($subCfg['exclude']), true)) {
                continue;
            }

            $res[$key] = [];
            $res[$key]['subCfg'] = $subCfg;
            $res[$key]['paramParsed'] = GeneralUtility::explodeUrl2Array($configurationRecord['configuration']);
            $res[$key]['paramExpanded'] = $this->expandParameters($res[$key]['paramParsed'], $pageId);
            $res[$key]['URLs'] = $this->compileUrls($res[$key]['paramExpanded'], ['?id=' . $pageId]);
            $res[$key]['origin'] = 'tx_crawler_configuration_' . $configurationRecord['uid'];
        }

        foreach ($GLOBALS['TYPO3_CONF_VARS']['EXTCONF']['crawler']['processUrls'] ?? [] as $func) {
            $params = [
                'res' => &$res,
            ];
            GeneralUtility::callUserFunction($func, $params, $this);
        }
        return $res;
    }
567
568
    /**
     * Find all configurations of subpages of a page
     *
     * Collects the names of the page TSconfig parameter sets of the root page
     * and the names of all tx_crawler_configuration records stored on the
     * pages of the root line or of the page tree below the root page.
     *
     * TODO: Write Functional Tests
     *
     * @param int $rootid Page id the branch starts at
     * @param int $depth Depth passed to the page tree when collecting subpages
     * @return array List of configuration names
     */
    public function getConfigurationsForBranch(int $rootid, int $depth): array
    {
        $configurationNames = [];

        // Parameter sets from page TSconfig; only sub arrays ("name.") are taken into account
        $paramSets = $this->getPageTSconfigForId($rootid)['tx_crawler.']['crawlerCfg.']['paramSets.'] ?? [];
        foreach ($paramSets as $setName => $setConfiguration) {
            if (is_array($setConfiguration)) {
                $hasTrailingDot = substr($setName, -1) === '.';
                $configurationNames[] = $hasTrailingDot ? substr($setName, 0, -1) : $setName;
            }
        }

        // Page ids of the root line ...
        $pageIds = [];
        foreach (BackendUtility::BEgetRootLine($rootid) as $rootLinePage) {
            $pageIds[] = $rootLinePage['uid'];
        }

        // ... and of the pages in the tree below the root page the backend user may see
        /* @var PageTreeView $pageTree */
        $pageTree = GeneralUtility::makeInstance(PageTreeView::class);
        $permissionClause = $this->getBackendUser()->getPagePermsClause(Permission::PAGE_SHOW);
        if (empty($permissionClause)) {
            $pageTree->init('');
        } else {
            $pageTree->init('AND ' . $permissionClause);
        }
        $pageTree->getTree($rootid, $depth, '');
        foreach ($pageTree->tree as $treeNode) {
            $pageIds[] = $treeNode['row']['uid'];
        }

        // Names of the configuration records stored on any of the collected pages
        $queryBuilder = $this->getQueryBuilder('tx_crawler_configuration');
        $queryBuilder
            ->select('name')
            ->from('tx_crawler_configuration')
            ->where(
                $queryBuilder->expr()->in('pid', $queryBuilder->createNamedParameter($pageIds, Connection::PARAM_INT_ARRAY))
            );
        $statement = $queryBuilder->execute();

        while ($record = $statement->fetch()) {
            $configurationNames[] = $record['name'];
        }

        return $configurationNames;
    }
611
612
    /**
613
     * Will expand the parameters configuration to individual values. This follows a certain syntax of the value of each parameter.
614
     * Syntax of values:
615
     * - Basically: If the value is wrapped in [...] it will be expanded according to the following syntax, otherwise the value is taken literally
616
     * - Configuration is split by "|" and the parts are processed individually and finally added together
617
     * - For each configuration part:
618
     *         - "[int]-[int]" = Integer range, will be expanded to all values in between, values included, starting from low to high (max. 1000). Example "1-34" or "-40--30"
619
     *         - "_TABLE:[TCA table name];[_PID:[optional page id, default is current page]];[_ENABLELANG:1]" = Look up of table records from PID, filtering out deleted records. Example "_TABLE:tt_content; _PID:123"
620
     *        _ENABLELANG:1 picks only original records without their language overlays
621
     *         - Default: Literal value
622
     *
623
     * @param array $paramArray Array with key (GET var name) and values (value of GET var which is configuration for expansion)
624
     * @param integer $pid Current page ID
625
     * @return array
626
     *
627
     * TODO: Write Functional Tests
628
     */
629 10
    public function expandParameters($paramArray, $pid)
630
    {
631
        // Traverse parameter names:
632 10
        foreach ($paramArray as $p => $v) {
633 10
            $v = trim($v);
634
635
            // If value is encapsulated in square brackets it means there are some ranges of values to find, otherwise the value is literal
636 10
            if (strpos($v, '[') === 0 && substr($v, -1) === ']') {
637
                // So, find the value inside brackets and reset the paramArray value as an array.
638 10
                $v = substr($v, 1, -1);
639 10
                $paramArray[$p] = [];
640
641
                // Explode parts and traverse them:
642 10
                $parts = explode('|', $v);
643 10
                foreach ($parts as $pV) {
644
645
                    // Look for integer range: (fx. 1-34 or -40--30 // reads minus 40 to minus 30)
646 10
                    if (preg_match('/^(-?[0-9]+)\s*-\s*(-?[0-9]+)$/', trim($pV), $reg)) {
647 1
                        $reg = $this->swapIfFirstIsLargerThanSecond($reg);
648
649
                        // Traverse range, add values:
650
                        // Limit to size of range!
651 1
                        $runAwayBrake = 1000;
652 1
                        for ($a = $reg[1]; $a <= $reg[2]; $a++) {
653 1
                            $paramArray[$p][] = $a;
654 1
                            $runAwayBrake--;
655 1
                            if ($runAwayBrake <= 0) {
656
                                break;
657
                            }
658
                        }
659 9
                    } elseif (strpos(trim($pV), '_TABLE:') === 0) {
660
661
                        // Parse parameters:
662 6
                        $subparts = GeneralUtility::trimExplode(';', $pV);
663 6
                        $subpartParams = [];
664 6
                        foreach ($subparts as $spV) {
665 6
                            [$pKey, $pVal] = GeneralUtility::trimExplode(':', $spV);
666 6
                            $subpartParams[$pKey] = $pVal;
667
                        }
668
669
                        // Table exists:
670 6
                        if (isset($GLOBALS['TCA'][$subpartParams['_TABLE']])) {
671 6
                            $lookUpPid = isset($subpartParams['_PID']) ? intval($subpartParams['_PID']) : intval($pid);
672 6
                            $recursiveDepth = isset($subpartParams['_RECURSIVE']) ? intval($subpartParams['_RECURSIVE']) : 0;
673 6
                            $pidField = isset($subpartParams['_PIDFIELD']) ? trim($subpartParams['_PIDFIELD']) : 'pid';
674 6
                            $where = $subpartParams['_WHERE'] ?? '';
675 6
                            $addTable = $subpartParams['_ADDTABLE'] ?? '';
676
677 6
                            $fieldName = $subpartParams['_FIELD'] ? $subpartParams['_FIELD'] : 'uid';
678 6
                            if ($fieldName === 'uid' || $GLOBALS['TCA'][$subpartParams['_TABLE']]['columns'][$fieldName]) {
679 6
                                $queryBuilder = $this->getQueryBuilder($subpartParams['_TABLE']);
680
681 6
                                if ($recursiveDepth > 0) {
682
                                    /** @var QueryGenerator $queryGenerator */
683 2
                                    $queryGenerator = GeneralUtility::makeInstance(QueryGenerator::class);
684 2
                                    $pidList = $queryGenerator->getTreeList($lookUpPid, $recursiveDepth, 0, 1);
685 2
                                    $pidArray = GeneralUtility::intExplode(',', $pidList);
686
                                } else {
687 4
                                    $pidArray = [(string) $lookUpPid];
688
                                }
689
690 6
                                $queryBuilder->getRestrictions()
691 6
                                    ->removeAll()
692 6
                                    ->add(GeneralUtility::makeInstance(DeletedRestriction::class));
693
694
                                $queryBuilder
695 6
                                    ->select($fieldName)
696 6
                                    ->from($subpartParams['_TABLE'])
697 6
                                    ->where(
698 6
                                        $queryBuilder->expr()->in($pidField, $queryBuilder->createNamedParameter($pidArray, Connection::PARAM_INT_ARRAY)),
699
                                        $where
700
                                    );
701
702 6
                                if (! empty($addTable)) {
703
                                    // TODO: Check if this works as intended!
704
                                    $queryBuilder->add('from', $addTable);
705
                                }
706 6
                                $transOrigPointerField = $GLOBALS['TCA'][$subpartParams['_TABLE']]['ctrl']['transOrigPointerField'];
707
708 6
                                if ($subpartParams['_ENABLELANG'] && $transOrigPointerField) {
709
                                    $queryBuilder->andWhere(
710
                                        $queryBuilder->expr()->lte(
711
                                            $transOrigPointerField,
712
                                            0
713
                                        )
714
                                    );
715
                                }
716
717 6
                                $statement = $queryBuilder->execute();
718
719 6
                                $rows = [];
720 6
                                while ($row = $statement->fetch()) {
721 6
                                    $rows[$row[$fieldName]] = $row;
722
                                }
723
724 6
                                if (is_array($rows)) {
725 6
                                    $paramArray[$p] = array_merge($paramArray[$p], array_keys($rows));
726
                                }
727
                            }
728
                        }
729
                    } else {
730
                        // Just add value:
731 3
                        $paramArray[$p][] = $pV;
732
                    }
733
                    // Hook for processing own expandParameters place holder
734 10
                    if (is_array($GLOBALS['TYPO3_CONF_VARS']['SC_OPTIONS']['crawler/class.tx_crawler_lib.php']['expandParameters'])) {
735
                        $_params = [
736
                            'pObj' => &$this,
737
                            'paramArray' => &$paramArray,
738
                            'currentKey' => $p,
739
                            'currentValue' => $pV,
740
                            'pid' => $pid,
741
                        ];
742
                        foreach ($GLOBALS['TYPO3_CONF_VARS']['SC_OPTIONS']['crawler/class.tx_crawler_lib.php']['expandParameters'] as $_funcRef) {
743
                            GeneralUtility::callUserFunction($_funcRef, $_params, $this);
744
                        }
745
                    }
746
                }
747
748
                // Make unique set of values and sort array by key:
749 10
                $paramArray[$p] = array_unique($paramArray[$p]);
750 10
                ksort($paramArray);
751
            } else {
752
                // Set the literal value as only value in array:
753 3
                $paramArray[$p] = [$v];
754
            }
755
        }
756
757 10
        return $paramArray;
758
    }
759
760
    /**
     * Compiling URLs from parameter array (output of expandParameters())
     *
     * The first parameter of $paramArray is combined with every URL in $urls; the method then
     * recurses with the remaining parameters. The number of URLs is therefore the multiplication
     * of the number of parameter values for each key, capped on every recursion level by
     * getMaximumUrlsToCompile().
     *
     * @param array $paramArray Output of expandParameters(): Array with keys (GET var names) and for each an array of values
     * @param array $urls URLs accumulated in this array (for recursion)
     * @return array
     */
    public function compileUrls($paramArray, array $urls)
    {
        if (empty($paramArray)) {
            return $urls;
        }

        // Take the first parameter explicitly instead of relying on the array's internal pointer.
        $valueSet = reset($paramArray);
        $varName = key($paramArray);
        // array_shift() would renumber integer keys (numeric GET var names), so slice with preserved keys.
        $remainingParams = array_slice($paramArray, 1, null, true);

        // Numeric GET var names arrive as integer keys; rawurlencode() needs a string under strict_types.
        $queryPrefix = '&' . rawurlencode((string) $varName) . '=';
        $maximumUrls = $this->getMaximumUrlsToCompile();

        // Traverse value set:
        $newUrls = [];
        foreach ($urls as $url) {
            foreach ($valueSet as $val) {
                if (count($newUrls) >= $maximumUrls) {
                    // Limit reached: nothing more can be added on this level.
                    break 2;
                }
                $value = (string) $val;
                // Empty values leave the URL untouched (no "&name=" is appended).
                $newUrls[] = $value !== '' ? $url . $queryPrefix . rawurlencode($value) : $url;
            }
        }

        return $this->compileUrls($remainingParams, $newUrls);
    }
787
788
    /************************************
789
     *
790
     * Crawler log
791
     *
792
     ************************************/
793
794
    /**
     * Adds a call back entry to the crawler queue (typically called from hooks, see indexed search
     * class "class.crawler.php").
     *
     * The call back reference is stored in the JSON encoded parameters under the key "_CALLBACKOBJ".
     *
     * @param integer $setId Set ID
     * @param array $params Parameters to pass to call back function
     * @param string $callBack Call back object reference, eg. 'EXT:indexed_search/class.crawler.php:&tx_indexedsearch_crawler'
     * @param integer $page_id Page ID to attach it to
     * @param integer $schedule Time at which to activate
     */
    public function addQueueEntry_callBack($setId, $params, $callBack, $page_id = 0, $schedule = 0): void
    {
        // Anything that is not an array is replaced by an empty parameter set.
        $callBackParameters = is_array($params) ? $params : [];
        $callBackParameters['_CALLBACKOBJ'] = $callBack;

        $queueRecord = [
            'page_id' => (int) $page_id,
            'parameters' => json_encode($callBackParameters),
            // A schedule of 0 means "now".
            'scheduled' => (int) $schedule ?: $this->getCurrentTime(),
            'exec_time' => 0,
            'set_id' => (int) $setId,
            'result_data' => '',
        ];

        $queueConnection = GeneralUtility::makeInstance(ConnectionPool::class)
            ->getConnectionForTable('tx_crawler_queue');
        $queueConnection->insert('tx_crawler_queue', $queueRecord);
    }
823
824
    /************************************
825
     *
826
     * URL setting
827
     *
828
     ************************************/
829
830
    /**
     * Setting a URL for crawling:
     *
     * Builds the queue record for the URL and either registers it internally only
     * (when $this->registerQueueEntriesInternallyOnly is set) or inserts it into tx_crawler_queue,
     * unless the duplication check finds an equal entry.
     *
     * @param integer $id Page ID
     * @param string $url Complete URL
     * @param array $subCfg Sub configuration array (from TS config); the keys "userGroups",
     *                      "procInstrFilter", "procInstrParams." and "key" are read and treated as optional
     * @param integer $tstamp Scheduled-time
     * @param string $configurationHash (optional) configuration hash
     * @param bool $skipInnerDuplicationCheck (optional) skip inner duplication check
     * @return bool TRUE if a new row was inserted into the queue table, FALSE otherwise
     *              (also FALSE when the entry was only registered internally)
     */
    public function addUrl(
        $id,
        $url,
        array $subCfg,
        $tstamp,
        $configurationHash = '',
        $skipInnerDuplicationCheck = false
    ) {
        // Creating parameters:
        $parameters = [
            'url' => $url,
        ];

        // fe user group simulation:
        $uGs = implode(',', array_unique(GeneralUtility::intExplode(',', (string) ($subCfg['userGroups'] ?? ''), true)));
        if ($uGs) {
            $parameters['feUserGroupList'] = $uGs;
        }

        // Setting processing instructions
        $parameters['procInstructions'] = GeneralUtility::trimExplode(',', (string) ($subCfg['procInstrFilter'] ?? ''));
        if (is_array($subCfg['procInstrParams.'] ?? null)) {
            $parameters['procInstrParams'] = $subCfg['procInstrParams.'];
        }

        // Compile value array:
        $parameters_serialized = json_encode($parameters);
        $fieldArray = [
            'page_id' => (int) $id,
            'parameters' => $parameters_serialized,
            'parameters_hash' => GeneralUtility::shortMD5($parameters_serialized),
            'configuration_hash' => $configurationHash,
            'scheduled' => $tstamp,
            'exec_time' => 0,
            'set_id' => (int) $this->setID,
            'result_data' => '',
            'configuration' => $subCfg['key'] ?? null,
        ];

        if ($this->registerQueueEntriesInternallyOnly) {
            // the entries will only be registered and not stored to the database
            $this->queueEntries[] = $fieldArray;
            return false;
        }

        if (! $skipInnerDuplicationCheck) {
            // check if there is already an equal entry
            $duplicates = $this->queueRepository->getDuplicateQueueItemsIfExists(
                (bool) $this->extensionSettings['enableTimeslot'],
                $tstamp,
                $this->getCurrentTime(),
                $fieldArray['page_id'],
                $fieldArray['parameters_hash']
            );
            if (! empty($duplicates)) {
                return false;
            }
        }

        GeneralUtility::makeInstance(ConnectionPool::class)
            ->getConnectionForTable('tx_crawler_queue')
            ->insert('tx_crawler_queue', $fieldArray);

        return true;
    }
912
913
    /**
     * Returns the current system time as a Unix timestamp.
     *
     * The other methods of this class obtain "now" through this method rather than calling time() directly.
     *
     * @return int
     */
    public function getCurrentTime()
    {
        $now = time();

        return $now;
    }
922
923
    /************************************
924
     *
925
     * URL reading
926
     *
927
     ************************************/
928
929
    /**
     * Read URL for single queue entry
     *
     * Loads the queue record, stamps exec_time on it (which locks the record), executes it through the
     * queue executor and finally stores the JSON encoded result in result_data.
     *
     * @param integer $queueId
     * @param boolean $force If set, will process even if exec_time has been set!
     * @return integer Bitmask; self::CLI_STATUS_POLLABLE_PROCESSED is set when a registered pollSuccess
     *                 instruction reported success. 0 if the queue entry could not be loaded.
     */
    public function readUrl($queueId, $force = false)
    {
        $queryBuilder = GeneralUtility::makeInstance(ConnectionPool::class)->getQueryBuilderForTable($this->tableName);
        $ret = 0;
        $this->logger->debug('crawler-readurl start ' . microtime(true));

        // Get entry:
        $queryBuilder
            ->select('*')
            ->from('tx_crawler_queue')
            ->where(
                $queryBuilder->expr()->eq('qid', $queryBuilder->createNamedParameter($queueId, PDO::PARAM_INT))
            );
        if (! $force) {
            // Only entries that were not executed yet and are assigned to a process
            $queryBuilder
                ->andWhere('exec_time = 0')
                ->andWhere('process_scheduled > 0');
        }
        $queueRec = $queryBuilder->execute()->fetch();

        if (! is_array($queueRec)) {
            // Nothing to process: return the empty bitmask instead of null, as documented.
            return $ret;
        }

        // Set exec_time to lock record:
        $field_array = ['exec_time' => $this->getCurrentTime()];

        if (isset($this->processID)) {
            // if multiprocessing is used we need to store the id of the process which has handled this entry
            $field_array['process_id_completed'] = $this->processID;
        }

        GeneralUtility::makeInstance(ConnectionPool::class)->getConnectionForTable('tx_crawler_queue')
            ->update(
                'tx_crawler_queue',
                $field_array,
                ['qid' => (int) $queueId]
            );

        $result = $this->queueExecutor->executeQueueItem($queueRec, $this);
        if ($result['content'] === null) {
            $resultData = 'An errors happened';
        } else {
            /** @var JsonCompatibilityConverter $jsonCompatibilityConverter */
            $jsonCompatibilityConverter = GeneralUtility::makeInstance(JsonCompatibilityConverter::class);
            $resultData = $jsonCompatibilityConverter->convert($result['content']);
        }

        // atm there's no need to point to specific pollable extensions
        $pollables = $GLOBALS['TYPO3_CONF_VARS']['EXTCONF']['crawler']['pollSuccess'] ?? null;
        // $resultData is a plain string in the error case, so it must not be indexed like an array.
        $procInstructions = is_array($resultData) ? ($resultData['parameters']['procInstructions'] ?? null) : null;

        if (is_array($pollables) && is_array($procInstructions)) {
            foreach ($pollables as $pollable) {
                // only check the success value if the instruction is running
                // it is important to name the pollSuccess key same as the procInstructions key
                if (in_array($pollable, $procInstructions, true) && ! empty($resultData['success'][$pollable])) {
                    $ret |= self::CLI_STATUS_POLLABLE_PROCESSED;
                }
            }
        }

        // Set result in log which also denotes the end of the processing of this entry.
        $field_array = ['result_data' => json_encode($result)];

        GeneralUtility::makeInstance(ConnectionPool::class)->getConnectionForTable('tx_crawler_queue')
            ->update(
                'tx_crawler_queue',
                $field_array,
                ['qid' => (int) $queueId]
            );

        $this->logger->debug('crawler-readurl stop ' . microtime(true));
        return $ret;
    }
1014
1015
    /**
     * Read URL for not-yet-inserted log-entry
     *
     * The record is inserted with exec_time set, executed through the queue executor and then updated
     * with the JSON encoded result.
     *
     * @param array $field_array Queue field array,
     *
     * @return array|bool|mixed|string The result as returned by the queue executor
     */
    public function readUrlFromArray($field_array)
    {
        $queueConnection = GeneralUtility::makeInstance(ConnectionPool::class)
            ->getConnectionForTable($this->tableName);

        // Set exec_time to lock record:
        $queueRecord = $field_array;
        $queueRecord['exec_time'] = $this->getCurrentTime();

        $queueConnection->insert($this->tableName, $queueRecord);
        $insertedQueueId = $queueConnection->lastInsertId($this->tableName, 'qid');

        $result = $this->queueExecutor->executeQueueItem($queueRecord, $this);

        // Set result in log which also denotes the end of the processing of this entry.
        $queueConnection->update(
            $this->tableName,
            ['result_data' => json_encode($result)],
            ['qid' => $insertedQueueId]
        );

        return $result;
    }
1045
1046
    /*****************************
1047
     *
1048
     * Compiling URLs to crawl - tools
1049
     *
1050
     *****************************/
1051
1052
    /**
     * Walks the page tree below (and including) $id and renders the table rows returned by
     * drawURLs_addRowsForPage() for every page found.
     *
     * The request settings are stored on the controller first, and the duplicate tracking and
     * download URL lists are reset.
     *
     * @param integer $id Root page id to start from.
     * @param integer $depth Depth of tree, 0=only id-page, 1= on sublevel, 99 = infinite
     * @param integer $scheduledTime Unix Time when the URL is timed to be visited when put in queue
     * @param integer $reqMinute Number of requests per minute (creates the interleave between requests)
     * @param boolean $submitCrawlUrls If set, submits the URLs to queue in database (real crawling)
     * @param boolean $downloadCrawlUrls If set (and submitcrawlUrls is false) will fill $downloadUrls with entries)
     * @param array $incomingProcInstructions Array of processing instructions
     * @param array $configurationSelection Array of configuration keys
     * @return string Concatenated HTML table rows
     */
    public function getPageTreeAndUrls(
        $id,
        $depth,
        $scheduledTime,
        $reqMinute,
        $submitCrawlUrls,
        $downloadCrawlUrls,
        array $incomingProcInstructions,
        array $configurationSelection
    ) {
        // Store the settings of this run
        $this->scheduledTime = $scheduledTime;
        $this->reqMinute = $reqMinute;
        $this->submitCrawlUrls = $submitCrawlUrls;
        $this->downloadCrawlUrls = $downloadCrawlUrls;
        $this->incomingProcInstructions = $incomingProcInstructions;
        $this->incomingConfigurationSelection = $configurationSelection;

        // Start with empty tracking lists
        $this->duplicateTrack = [];
        $this->downloadUrls = [];

        // Drawing tree:
        /* @var PageTreeView $tree */
        $tree = GeneralUtility::makeInstance(PageTreeView::class);
        $permsClause = $this->getBackendUser()->getPagePermsClause(Permission::PAGE_SHOW);
        $tree->init('AND ' . $permsClause);

        // Set root row (only if the page is accessible):
        $rootRow = BackendUtility::readPageAccess($id, $permsClause);
        if (is_array($rootRow)) {
            $tree->tree[] = [
                'row' => $rootRow,
                'HTML' => $this->iconFactory->getIconForRecord('pages', $rootRow, Icon::SIZE_SMALL),
            ];
        }

        // Get branch beneath:
        if ($depth) {
            $tree->getTree($id, $depth, '');
        }

        // Traverse page tree:
        $rowsHtml = '';

        foreach ($tree->tree as $node) {
            $pageRow = $node['row'];
            $this->MP = false;

            // recognize mount points
            // NOTE(review): strict comparison; this only matches if doktype arrives as an integer - confirm.
            if ($pageRow['doktype'] === PageRepository::DOKTYPE_MOUNTPOINT) {
                // NOTE(review): the result is indexed with [0] below; confirm that getPage() returns a list of rows.
                $mountPage = $this->pageRepository->getPage($pageRow['uid']);

                // fetch mounted pages
                $this->MP = $mountPage[0]['mount_pid'] . '-' . $pageRow['uid'];

                $mountTree = GeneralUtility::makeInstance(PageTreeView::class);
                $mountTree->init('AND ' . $permsClause);
                $mountTree->getTree($mountPage[0]['mount_pid'], $depth);

                foreach ($mountTree->tree as $mountNode) {
                    $rowsHtml .= $this->drawURLs_addRowsForPage(
                        $mountNode['row'],
                        $mountNode['HTML'] . BackendUtility::getRecordTitle('pages', $mountNode['row'], true)
                    );
                }

                if ($mountPage[0]['mount_pid_ol']) {
                    // replace page when mount_pid_ol is enabled
                    $pageRow['uid'] = $mountPage[0]['mount_pid'];
                } else {
                    // if the mount_pid_ol is not set the MP must not be used for the mountpoint page
                    $this->MP = false;
                }
            }

            $rowsHtml .= $this->drawURLs_addRowsForPage(
                $pageRow,
                $node['HTML'] . BackendUtility::getRecordTitle('pages', $pageRow, true)
            );
        }

        return $rowsHtml;
    }
1144
1145
    /**
     * Expands exclude string
     *
     * The string is a comma separated list of parts in the form "pid" or "pid+depth". A part without
     * depth excludes only the page itself; a part containing "+" with an empty (or zero) depth is
     * expanded with depth 99. Results are memoized per exclude string in static caches.
     *
     * @param string $excludeString Exclude string
     * @return array List of page ids (integers) that are excluded
     */
    public function expandExcludeString($excludeString)
    {
        // internal static caches;
        static $expandedExcludeStringCache = [];
        static $treeCache = [];

        if (empty($expandedExcludeStringCache[$excludeString])) {
            $pidList = [];

            if (! empty($excludeString)) {
                /** @var PageTreeView $tree */
                $tree = GeneralUtility::makeInstance(PageTreeView::class);
                $tree->init('AND ' . $this->getBackendUser()->getPagePermsClause(Permission::PAGE_SHOW));

                $excludeParts = GeneralUtility::trimExplode(',', $excludeString);

                foreach ($excludeParts as $excludePart) {
                    // Pad so that a part without "+" does not read an undefined offset.
                    [$pid, $depth] = array_pad(GeneralUtility::trimExplode('+', $excludePart), 2, '');

                    // default is "page only" = "depth=0"
                    if (empty($depth)) {
                        $depth = strpos($excludePart, '+') !== false ? 99 : 0;
                    }

                    // Work with integers from here on: getTree() expects an integer uid and the
                    // depth comparison must not depend on string comparison rules.
                    $pid = (int) $pid;
                    $depth = (int) $depth;

                    $pidList[] = $pid;

                    if ($depth > 0) {
                        if (empty($treeCache[$pid][$depth])) {
                            $tree->reset();
                            $tree->getTree($pid, $depth);
                            $treeCache[$pid][$depth] = $tree->tree;
                        }

                        foreach ($treeCache[$pid][$depth] as $data) {
                            $pidList[] = (int) $data['row']['uid'];
                        }
                    }
                }
            }

            $expandedExcludeStringCache[$excludeString] = array_unique($pidList);
        }

        return $expandedExcludeStringCache[$excludeString];
    }
1196
1197
    /**
     * Create the rows for display of the page tree
     * For each page a number of rows are shown displaying GET variable configuration
     *
     * @param array $pageRow Page record; "uid" is compared against the expanded exclude list
     * @param string $pageTitle Title cell content; inserted into the markup as-is, so it must already be safe HTML
     * @return string HTML table rows for the page
     */
    public function drawURLs_addRowsForPage(array $pageRow, string $pageTitle): string
    {
        $skipMessage = '';

        // Get list of configurations
        $configurations = $this->getUrlsForPageRow($pageRow, $skipMessage);
        $configurations = ConfigurationService::removeDisallowedConfigurations($this->incomingConfigurationSelection, $configurations);

        if (empty($configurations)) {
            $message = ! empty($skipMessage) ? ' (' . $skipMessage . ')' : '';

            // Compile row:
            return '
                <tr>
                    <td>' . $pageTitle . '</td>
                    <td colspan="6"><em>No entries</em>' . $message . '</td>
                </tr>';
        }

        // expandExcludeString() returns integers only, so the strict comparison below needs an integer uid
        // (presumably the uid can arrive as a numeric string from the database - confirm).
        $pageUid = (int) $pageRow['uid'];

        // Traverse parameter combinations:
        $c = 0;
        $content = '';
        foreach ($configurations as $confKey => $confArray) {
            // Numeric configuration keys are integers in PHP; htmlspecialchars() needs a string under strict_types.
            $confKeyHtml = htmlspecialchars((string) $confKey);

            // Title column (spans all configuration rows, so it is only emitted for the first one):
            $titleClm = $c === 0
                ? '<td rowspan="' . count($configurations) . '">' . $pageTitle . '</td>'
                : '';

            if (! in_array($pageUid, $this->expandExcludeString($confArray['subCfg']['exclude'] ?? ''), true)) {
                // URL list:
                $urlList = $this->urlListFromUrlArray(
                    $confArray,
                    $pageRow,
                    $this->scheduledTime,
                    $this->reqMinute,
                    $this->submitCrawlUrls,
                    $this->downloadCrawlUrls,
                    $this->duplicateTrack,
                    $this->downloadUrls,
                    // if empty the urls won't be filtered by processing instructions
                    $this->incomingProcInstructions
                );

                // Expanded parameters:
                $paramExpanded = '';
                $calcAccu = [];
                $calcRes = 1;
                foreach ($confArray['paramExpanded'] as $gVar => $gVal) {
                    $valueCount = count($gVal);
                    $paramExpanded .= '
                            <tr>
                                <td>' . htmlspecialchars('&' . $gVar . '=') . '<br/>' .
                            '(' . $valueCount . ')' .
                            '</td>
                                <td nowrap="nowrap">' . nl2br(htmlspecialchars(implode(chr(10), $gVal))) . '</td>
                            </tr>
                        ';
                    $calcRes *= $valueCount;
                    $calcAccu[] = $valueCount;
                }
                $paramExpanded = '<table>' . $paramExpanded . '</table>';
                $paramExpanded .= 'Comb: ' . implode('*', $calcAccu) . '=' . $calcRes;

                // Options
                $optionValues = '';
                if (! empty($confArray['subCfg']['userGroups'])) {
                    $optionValues .= 'User Groups: ' . $confArray['subCfg']['userGroups'] . '<br/>';
                }
                if (! empty($confArray['subCfg']['procInstrFilter'])) {
                    $optionValues .= 'ProcInstr: ' . $confArray['subCfg']['procInstrFilter'] . '<br/>';
                }

                // Compile row:
                $content .= '
                        <tr>
                            ' . $titleClm . '
                            <td>' . $confKeyHtml . '</td>
                            <td>' . nl2br(htmlspecialchars(rawurldecode(trim(str_replace('&', chr(10) . '&', GeneralUtility::implodeArrayForUrl('', $confArray['paramParsed'])))))) . '</td>
                            <td>' . $paramExpanded . '</td>
                            <td nowrap="nowrap">' . $urlList . '</td>
                            <td nowrap="nowrap">' . $optionValues . '</td>
                            <td nowrap="nowrap">' . DebugUtility::viewArray($confArray['subCfg']['procInstrParams.'] ?? null) . '</td>
                        </tr>';
            } else {
                $content .= '<tr>
                            ' . $titleClm . '
                            <td>' . $confKeyHtml . '</td>
                            <td colspan="5"><em>No entries</em> (Page is excluded in this configuration)</td>
                        </tr>';
            }

            $c++;
        }

        return $content;
    }
1300
1301
    /*****************************
1302
     *
1303
     * CLI functions
1304
     *
1305
     *****************************/
1306
1307
    /**
     * Running the functionality of the CLI (crawling URLs from queue)
     *
     * Cleans up the queue, fetches up to $countInARun entries, assigns them to the current process and
     * reads them one by one via readUrl().
     *
     * @param int $countInARun Maximum number of queue entries to fetch for this run
     * @param int $sleepTime Pause after each processed entry, passed to usleep() (microseconds)
     * @param int $sleepAfterFinish Pause after the run, passed to sleep() (seconds)
     * @return int Bitmask of the readUrl() results combined with self::CLI_STATUS_PROCESSED and/or
     *             self::CLI_STATUS_ABORTED
     */
    public function CLI_run(int $countInARun, int $sleepTime, int $sleepAfterFinish): int
    {
        $result = 0;
        $counter = 0;

        // Clean up the queue
        $this->queueRepository->cleanupQueue();

        // Select entries:
        $rows = $this->queueRepository->fetchRecordsToBeCrawled($countInARun);

        if (! empty($rows)) {
            $quidList = [];

            foreach ($rows as $r) {
                $quidList[] = $r['qid'];
            }

            $processId = $this->CLI_buildProcessId();

            // save the number of assigned queue entries to determine how many have been processed later
            $numberOfAffectedRows = $this->queueRepository->updateProcessIdAndSchedulerForQueueIds($quidList, $processId);
            $this->processRepository->updateProcessAssignItemsCount($numberOfAffectedRows, $processId);

            if ($numberOfAffectedRows !== count($quidList)) {
                // Not every selected entry could be assigned to this process
                return ($result | self::CLI_STATUS_ABORTED);
            }

            foreach ($rows as $r) {
                $result |= $this->readUrl($r['qid']);

                $counter++;
                // Just to relax the system
                usleep($sleepTime);

                // if during the start and the current read url the cli has been disable we need to return from the function
                // mark the process NOT as ended.
                if ($this->crawler->isDisabled()) {
                    return ($result | self::CLI_STATUS_ABORTED);
                }

                if (! $this->processRepository->isProcessActive($this->CLI_buildProcessId())) {
                    // Report through the logger (as readUrl() does) instead of calling CLI_debug().
                    $this->logger->debug('conflict / timeout (' . $this->CLI_buildProcessId() . ')');
                    $result |= self::CLI_STATUS_ABORTED;
                    // possible timeout
                    break;
                }
            }

            sleep($sleepAfterFinish);
        }

        if ($counter > 0) {
            $result |= self::CLI_STATUS_PROCESSED;
        }

        return $result;
    }
1368
1369
    /**
     * Try to acquire a new process with the given id
     * also performs some auto-cleanup for orphan processes
     *
     * A process row is inserted only while the number of active, non-expired processes is below the
     * configured "processLimit". Processes whose ttl lies in the past are handed to
     * CLI_releaseProcesses() in either case.
     *
     * @param string $id identification string for the process
     * @return boolean TRUE if the process was registered, FALSE if the system process id could not be
     *                 determined or the process limit is reached
     * @todo preemption might not be the most elegant way to clean up
     */
    public function CLI_checkAndAcquireNewProcess($id)
    {
        $systemProcessId = getmypid();
        if (! $systemProcessId) {
            return false;
        }

        $activeProcesses = $this->processRepository->findAllActive();
        $now = $this->getCurrentTime();

        // Split the active processes into expired ones (orphans) and those still running.
        $expiredProcessIds = [];
        $runningProcesses = 0;

        /** @var Process $process */
        foreach ($activeProcesses as $process) {
            if ($process->getTtl() < $now) {
                $expiredProcessIds[] = $process->getProcessId();
                continue;
            }
            $runningProcesses++;
        }

        // if there are less than allowed active processes then add a new one
        $acquired = $runningProcesses < (int) $this->extensionSettings['processLimit'];
        if ($acquired) {
            $processRecord = [
                'process_id' => $id,
                'active' => 1,
                'ttl' => $now + (int) $this->extensionSettings['processMaxRunTime'],
                'system_process_id' => $systemProcessId,
            ];
            GeneralUtility::makeInstance(ConnectionPool::class)
                ->getConnectionForTable('tx_crawler_process')
                ->insert('tx_crawler_process', $processRecord);
        }

        $this->processRepository->deleteProcessesMarkedAsDeleted();
        $this->CLI_releaseProcesses($expiredProcessIds);

        return $acquired;
    }
1420
1421
    /**
     * Releases the given processes and detaches the queue entries assigned to them.
     *
     * Before the requested ids are handled, two clean-up statements are executed:
     * queue entries that belong to inactive processes are unscheduled, and inactive
     * processes referenced by queue entries with exec_time = 0 get their
     * system_process_id reset to 0.
     *
     * @param mixed $releaseIds string with a single process-id or array with multiple process-ids
     * @return bool false when there is nothing to release, true otherwise
     */
    public function CLI_releaseProcesses($releaseIds)
    {
        $builder = GeneralUtility::makeInstance(ConnectionPool::class)->getQueryBuilderForTable($this->tableName);

        // Normalise a single id into a one-element list
        $releaseIds = is_array($releaseIds) ? $releaseIds : [$releaseIds];

        if ($releaseIds === []) {
            // nothing to release
            return false;
        }

        // Some kind of 2nd chance algorithm - this way at least 2 processes are needed for a real cleanup.
        // This ensures that a single process can't mess up the entire process table.

        // Step 1: detach queue entries that are still bound to processes which are no longer active
        $boundToInactiveProcess = 'q.process_id IN(SELECT p.process_id FROM tx_crawler_process as p WHERE p.active = 0)';
        $builder
            ->update($this->tableName, 'q')
            ->where($boundToInactiveProcess)
            ->set('q.process_scheduled', 0)
            ->set('q.process_id', '')
            ->execute();

        // FIXME: Not entirely sure that this is equivalent to the previous version
        // The builder is reused for the next statement, so the SET part of the first one is dropped here
        $builder->resetQueryPart('set');

        // Step 2: reset the system process id of inactive processes referenced by not yet executed queue entries
        $hasPendingQueueEntries = 'process_id IN(SELECT q.process_id FROM tx_crawler_queue as q WHERE q.exec_time = 0)';
        $builder
            ->update('tx_crawler_process')
            ->where(
                $builder->expr()->eq('active', 0),
                $hasPendingQueueEntries
            )
            ->set('system_process_id', 0)
            ->execute();

        // Step 3: finally handle the processes that were explicitly requested
        $this->processRepository->markRequestedProcessesAsNotActive($releaseIds);
        $this->queueRepository->unsetProcessScheduledAndProcessIdForQueueEntries($releaseIds);

        return true;
    }
1471
1472
    /**
     * Returns the id of the current process, generating it on first use.
     *
     * The id is a short MD5 hash of the current microtime and is stored in
     * $this->processID so that later calls can reuse it.
     *
     * @return string the ID
     */
    public function CLI_buildProcessId()
    {
        if ($this->processID) {
            return $this->processID;
        }

        $this->processID = GeneralUtility::shortMD5(microtime(true));

        return $this->processID;
    }
1484
1485
    /**
     * Builds an md5 fingerprint of a configuration array.
     *
     * The "paramExpanded" and "URLs" entries are removed before hashing, so they
     * do not influence the result.
     *
     * @return string md5 hash of the serialized, reduced configuration
     */
    protected function getConfigurationHash(array $configuration)
    {
        unset($configuration['paramExpanded'], $configuration['URLs']);

        return md5(serialize($configuration));
    }
1496
1497 1
    /**
     * Ensures that the value at index 1 is not larger than the value at index 2.
     *
     * When $reg[1] is larger than $reg[2] the two values are exchanged; all other
     * entries of the array are returned untouched.
     *
     * @param array $reg array carrying the two values to compare at index 1 and 2
     * @return array the array with index 1 and 2 in ascending order
     */
    protected function swapIfFirstIsLargerThanSecond(array $reg): array
    {
        if ($reg[1] > $reg[2]) {
            // Exchange both values in a single step
            [$reg[2], $reg[1]] = [$reg[1], $reg[2]];
        }

        return $reg;
    }
1508
1509 5
    /**
     * Returns the value currently stored in $this->maximumUrlsToCompile.
     */
    private function getMaximumUrlsToCompile(): int
    {
        return $this->maximumUrlsToCompile;
    }
1513
1514
    /**
     * Returns the backend user, taking it from $GLOBALS['BE_USER'] on first access.
     *
     * Backend authentication is initialized on every call; the user object itself
     * is only fetched once and then kept in $this->backendUser.
     *
     * @return BackendUserAuthentication
     */
    private function getBackendUser()
    {
        // Make sure the _cli_ user is loaded
        Bootstrap::initializeBackendAuthentication();

        if ($this->backendUser !== null) {
            return $this->backendUser;
        }

        $this->backendUser = $GLOBALS['BE_USER'];

        return $this->backendUser;
    }
1526
1527
    /**
     * Creates a query builder for the given table via the connection pool.
     *
     * @param string $table name of the table the query builder is requested for
     * @return QueryBuilder
     */
    private function getQueryBuilder(string $table)
    {
        $connectionPool = GeneralUtility::makeInstance(ConnectionPool::class);

        return $connectionPool->getQueryBuilderForTable($table);
    }
1536
}
1537