Passed
Push — typo3v9 ( 2404ee...b9b5fa )
by Tomas Norre
05:51
created

CrawlerController::getUrlsForPageId()   C

Complexity

Conditions 16
Paths 96

Size

Total Lines 91
Code Lines 51

Duplication

Lines 0
Ratio 0 %

Code Coverage

Tests 46
CRAP Score 16.0585

Importance

Changes 3
Bugs 0 Features 0
Metric Value
cc 16
eloc 51
c 3
b 0
f 0
nc 96
nop 1
dl 0
loc 91
ccs 46
cts 49
cp 0.9388
crap 16.0585
rs 5.5666

How to fix   Long Method    Complexity   

Long Method

Small methods make your code easier to understand, in particular if combined with a good name. Besides, if your method is small, finding a good name is usually much easier.

For example, if you find yourself adding comments to a method's body, this is usually a good sign to extract the commented part to a new method, and use the comment as a starting point when coming up with a good name for this new method.

Commonly applied refactorings include:

1
<?php
2
3
declare(strict_types=1);
4
5
namespace AOE\Crawler\Controller;
6
7
/***************************************************************
8
 *  Copyright notice
9
 *
10
 *  (c) 2019 AOE GmbH <[email protected]>
11
 *
12
 *  All rights reserved
13
 *
14
 *  This script is part of the TYPO3 project. The TYPO3 project is
15
 *  free software; you can redistribute it and/or modify
16
 *  it under the terms of the GNU General Public License as published by
17
 *  the Free Software Foundation; either version 3 of the License, or
18
 *  (at your option) any later version.
19
 *
20
 *  The GNU General Public License can be found at
21
 *  http://www.gnu.org/copyleft/gpl.html.
22
 *
23
 *  This script is distributed in the hope that it will be useful,
24
 *  but WITHOUT ANY WARRANTY; without even the implied warranty of
25
 *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
26
 *  GNU General Public License for more details.
27
 *
28
 *  This copyright notice MUST APPEAR in all copies of the script!
29
 ***************************************************************/
30
31
use AOE\Crawler\Configuration\ExtensionConfigurationProvider;
32
use AOE\Crawler\Domain\Repository\ConfigurationRepository;
33
use AOE\Crawler\Domain\Repository\ProcessRepository;
34
use AOE\Crawler\Domain\Repository\QueueRepository;
35
use AOE\Crawler\Event\EventDispatcher;
36
use AOE\Crawler\QueueExecutor;
37
use AOE\Crawler\Utility\SignalSlotUtility;
38
use Psr\Http\Message\UriInterface;
39
use Psr\Log\LoggerAwareInterface;
40
use Psr\Log\LoggerAwareTrait;
41
use TYPO3\CMS\Backend\Tree\View\PageTreeView;
42
use TYPO3\CMS\Backend\Utility\BackendUtility;
43
use TYPO3\CMS\Core\Authentication\BackendUserAuthentication;
44
use TYPO3\CMS\Core\Core\Bootstrap;
45
use TYPO3\CMS\Core\Core\Environment;
46
use TYPO3\CMS\Core\Database\Connection;
47
use TYPO3\CMS\Core\Database\ConnectionPool;
48
use TYPO3\CMS\Core\Database\Query\Restriction\DeletedRestriction;
49
use TYPO3\CMS\Core\Http\Uri;
50
use TYPO3\CMS\Core\Imaging\Icon;
51
use TYPO3\CMS\Core\Imaging\IconFactory;
52
use TYPO3\CMS\Core\Routing\SiteMatcher;
53
use TYPO3\CMS\Core\Site\Entity\Site;
54
use TYPO3\CMS\Core\Type\Bitmask\Permission;
55
use TYPO3\CMS\Core\TypoScript\Parser\TypoScriptParser;
56
use TYPO3\CMS\Core\Utility\DebugUtility;
57
use TYPO3\CMS\Core\Utility\GeneralUtility;
58
use TYPO3\CMS\Core\Utility\MathUtility;
59
use TYPO3\CMS\Extbase\Object\ObjectManager;
60
use TYPO3\CMS\Frontend\Page\CacheHashCalculator;
61
use TYPO3\CMS\Frontend\Page\PageRepository;
62
63
/**
64
 * Class CrawlerController
65
 *
66
 * @package AOE\Crawler\Controller
67
 */
68
class CrawlerController implements LoggerAwareInterface
69
{
70
    use LoggerAwareTrait;
71
72
    public const CLI_STATUS_NOTHING_PROCCESSED = 0;
73
    public const CLI_STATUS_REMAIN = 1; //queue not empty
74
    public const CLI_STATUS_PROCESSED = 2; //(some) queue items were processed
75
    public const CLI_STATUS_ABORTED = 4; //instance didn't finish
76
    public const CLI_STATUS_POLLABLE_PROCESSED = 8;
77
78
    /**
79
     * @var integer
80
     */
81
    public $setID = 0;
82
83
    /**
84
     * @var string
85
     */
86
    public $processID = '';
87
88
    /**
89
     * @var array
90
     */
91
    public $duplicateTrack = [];
92
93
    /**
94
     * @var array
95
     */
96
    public $downloadUrls = [];
97
98
    /**
99
     * @var array
100
     */
101
    public $incomingProcInstructions = [];
102
103
    /**
104
     * @var array
105
     */
106
    public $incomingConfigurationSelection = [];
107
108
    /**
109
     * @var bool
110
     */
111
    public $registerQueueEntriesInternallyOnly = false;
112
113
    /**
114
     * @var array
115
     */
116
    public $queueEntries = [];
117
118
    /**
119
     * @var array
120
     */
121
    public $urlList = [];
122
123
    /**
124
     * @var array
125
     */
126
    public $extensionSettings = [];
127
128
    /**
129
     * Mount Point
130
     *
131
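     * @var string|bool false when no mount point is set, otherwise the MP value as a string (it is split on "-" and appended to URLs as "&MP=...")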
132
     */
133
    public $MP = false;
134
135
    /**
136
     * @var string
137
     */
138
    protected $processFilename;
139
140
    /**
141
     * Holds the internal access mode can be 'gui','cli' or 'cli_im'
142
     *
143
     * @var string
144
     */
145
    protected $accessMode;
146
147
    /**
148
     * @var BackendUserAuthentication|null
149
     */
150
    private $backendUser;
151
152
    /**
153
     * @var integer
154
     */
155
    private $scheduledTime = 0;
156
157
    /**
158
     * @var integer
159
     */
160
    private $reqMinute = 0;
161
162
    /**
163
     * @var bool
164
     */
165
    private $submitCrawlUrls = false;
166
167
    /**
168
     * @var bool
169
     */
170
    private $downloadCrawlUrls = false;
171
172
    /**
173
     * @var QueueRepository
174
     */
175
    protected $queueRepository;
176
177
    /**
178
     * @var ProcessRepository
179
     */
180
    protected $processRepository;
181
182
    /**
183
     * @var ConfigurationRepository
184
     */
185
    protected $configurationRepository;
186
187
    /**
188
     * @var string
189
     */
190
    protected $tableName = 'tx_crawler_queue';
191
192
    /**
193
     * @var QueueExecutor
194
     */
195
    protected $queueExecutor;
196
197
    /**
198
     * @var int
199
     */
200
    protected $maximumUrlsToCompile = 10000;
201
202
    /**
203
     * @var IconFactory
204
     */
205
    protected $iconFactory;
206
207
    /**
     * Returns the internal access mode.
     *
     * The property has no initial value, so this is null until
     * setAccessMode() has been called.
     *
     * @return string The access mode; can be 'gui', 'cli' or 'cli_im'
     */
    public function getAccessMode()
    {
        return $this->accessMode;
    }
216
217
    /**
     * Stores the internal access mode.
     *
     * @param string $accessMode The access mode; can be 'gui', 'cli' or 'cli_im'
     */
    public function setAccessMode($accessMode): void
    {
        $this->accessMode = $accessMode;
    }
224
225
    /**
     * Switches the "disabled" marker on or off.
     *
     * Disabling writes an empty file at $this->processFilename; enabling
     * removes that file again if it exists. getDisabled() reports the
     * state by checking for the same file.
     *
     * @param bool $disabled (optional, defaults to true)
     * @return void
     */
    public function setDisabled($disabled = true): void
    {
        if ($disabled) {
            GeneralUtility::writeFile($this->processFilename, '');
            return;
        }

        if (is_file($this->processFilename)) {
            unlink($this->processFilename);
        }
    }
241
242
    /**
     * Tells whether the "disabled" marker file is present.
     *
     * @return bool true if the file at $this->processFilename exists (i.e. disabled)
     */
    public function getDisabled()
    {
        return is_file($this->processFilename);
    }
251
252
    /**
     * Sets the path of the marker file used by setDisabled() / getDisabled().
     *
     * @param string $filenameWithPath Path including the file name
     *
     * @return void
     */
    public function setProcessFilename($filenameWithPath): void
    {
        $this->processFilename = $filenameWithPath;
    }
261
262
    /**
     * Returns the path of the marker file used by setDisabled() / getDisabled().
     *
     * @return string Path including the file name
     */
    public function getProcessFilename()
    {
        return $this->processFilename;
    }
269
270
    /************************************
271
     *
272
     * Getting URLs based on Page TSconfig
273
     *
274
     ************************************/
275
276 39
    /**
     * Wires up repositories, the queue executor and the icon factory, sets the
     * default marker file path and loads the extension settings.
     *
     * Settings that are missing or out of range are replaced by defaults:
     * - countInARun: 100 when it does not evaluate to a positive integer
     * - processLimit: forced into 1..99, default 1
     * - maxCompileUrls: forced into 1..1000000000, default 10000
     */
    public function __construct()
    {
        $objectManager = GeneralUtility::makeInstance(ObjectManager::class);
        $this->queueRepository = $objectManager->get(QueueRepository::class);
        $this->processRepository = $objectManager->get(ProcessRepository::class);
        $this->configurationRepository = $objectManager->get(ConfigurationRepository::class);
        $this->queueExecutor = $objectManager->get(QueueExecutor::class);
        $this->iconFactory = GeneralUtility::makeInstance(IconFactory::class);

        $this->processFilename = Environment::getVarPath() . '/lock/tx_crawler.proc';

        /** @var ExtensionConfigurationProvider $configurationProvider */
        $configurationProvider = GeneralUtility::makeInstance(ExtensionConfigurationProvider::class);
        $settings = $configurationProvider->getExtensionConfiguration();
        // Defensive: static analysis reports the provider always returns an array.
        $this->extensionSettings = is_array($settings) ? $settings : [];

        // Set defaults. The "??" fallbacks avoid undefined-index notices when a
        // setting is absent; a fallback of 0 yields the same default as before.
        if (MathUtility::convertToPositiveInteger($this->extensionSettings['countInARun'] ?? 0) === 0) {
            $this->extensionSettings['countInARun'] = 100;
        }

        $this->extensionSettings['processLimit'] = MathUtility::forceIntegerInRange(
            $this->extensionSettings['processLimit'] ?? 0,
            1,
            99,
            1
        );
        $this->maximumUrlsToCompile = MathUtility::forceIntegerInRange(
            $this->extensionSettings['maxCompileUrls'] ?? 0,
            1,
            1000000000,
            10000
        );
    }
300
301
    /**
     * Returns the backend user, caching it in $this->backendUser on first use.
     *
     * Backend authentication is initialized on every call so that the
     * _cli_ user is loaded before $GLOBALS['BE_USER'] is read.
     *
     * @return BackendUserAuthentication
     */
    private function getBackendUser()
    {
        Bootstrap::initializeBackendAuthentication();
        $this->backendUser = $this->backendUser ?? $GLOBALS['BE_USER'];

        return $this->backendUser;
    }
313
314
    /**
     * Replaces the extension settings (the unserialized counterpart of
     * $TYPO3_CONF_VARS['EXT']['extConf']['crawler']).
     *
     * The array is stored as given; the defaults applied in the constructor
     * are not re-applied here.
     *
     * @param array $extensionSettings
     * @return void
     */
    public function setExtensionSettings(array $extensionSettings): void
    {
        $this->extensionSettings = $extensionSettings;
    }
324
325
    /**
     * Check if the given page should be crawled.
     *
     * The checks run in this order and the first match wins:
     * 1. hidden page (unless the 'crawlHiddenPages' setting is enabled)
     * 2. doktype 3 or 4, or doktype >= 199
     * 3. doktype listed in $GLOBALS['TYPO3_CONF_VARS']['EXTCONF']['crawler']['excludeDoktype']
     * 4. veto from a hook in $GLOBALS['TYPO3_CONF_VARS']['EXTCONF']['crawler']['pageVeto']
     *
     * @param array $pageRow Page record; 'hidden' and 'doktype' are read
     * @return false|string false if the page should be crawled (not excluded), the skip message if it should be skipped
     */
    public function checkIfPageShouldBeSkipped(array $pageRow)
    {
        if (!$this->extensionSettings['crawlHiddenPages'] && $pageRow['hidden']) {
            return 'Because page is hidden';
        }

        if (GeneralUtility::inList('3,4', $pageRow['doktype']) || $pageRow['doktype'] >= 199) {
            return 'Because doktype is not allowed';
        }

        $excludedDoktypes = $GLOBALS['TYPO3_CONF_VARS']['EXTCONF']['crawler']['excludeDoktype'] ?? [];
        foreach ($excludedDoktypes as $key => $doktypeList) {
            if (GeneralUtility::inList($doktypeList, $pageRow['doktype'])) {
                return 'Doktype was excluded by "' . $key . '"';
            }
        }

        // veto hook
        $vetoHooks = $GLOBALS['TYPO3_CONF_VARS']['EXTCONF']['crawler']['pageVeto'] ?? [];
        foreach ($vetoHooks as $key => $func) {
            $params = [
                'pageRow' => $pageRow,
            ];
            // expects "false" if page is ok and "true" or a skipMessage if this page should _not_ be crawled
            $veto = GeneralUtility::callUserFunction($func, $params, $this);
            if ($veto === false) {
                continue;
            }

            // the first veto wins, later hooks are not executed
            return is_string($veto) ? $veto : 'Veto from hook "' . htmlspecialchars($key) . '"';
        }

        return false;
    }
384
385
    /**
     * Wrapper method for getUrlsForPageId() that first applies
     * checkIfPageShouldBeSkipped().
     * It returns an array of configurations and no urls!
     *
     * @param array $pageRow Page record with at least dok-type and uid columns.
     * @param string $skipMessage Passed by reference: receives the skip message, or '' if the page is not skipped
     * @return array The configurations, or an empty array if the page is skipped
     * @see getUrlsForPageId()
     */
    public function getUrlsForPageRow(array $pageRow, &$skipMessage = '')
    {
        $skipReason = $this->checkIfPageShouldBeSkipped($pageRow);

        if ($skipReason !== false) {
            $skipMessage = $skipReason;
            return [];
        }

        $configurations = $this->getUrlsForPageId($pageRow['uid']);
        $skipMessage = '';

        return $configurations;
    }
408
409
    /**
     * Tells whether the queue table holds NO unprocessed entry (exec_time = 0)
     * for the given page id and configuration hash.
     * If there is none, callers can skip an inner detail check.
     *
     * @param  int $uid Page id
     * @param  string $configurationHash
     * @return boolean true if no matching unprocessed entry exists
     */
    protected function noUnprocessedQueueEntriesForPageWithConfigurationHashExist($uid, $configurationHash)
    {
        $queryBuilder = GeneralUtility::makeInstance(ConnectionPool::class)->getQueryBuilderForTable($this->tableName);

        $unprocessedCount = $queryBuilder
            ->count('*')
            ->from($this->tableName)
            ->where(
                $queryBuilder->expr()->eq('page_id', (int)$uid),
                $queryBuilder->expr()->eq('configuration_hash', $queryBuilder->createNamedParameter($configurationHash)),
                $queryBuilder->expr()->eq('exec_time', 0)
            )
            ->execute()
            ->fetchColumn();

        return !$unprocessedCount;
    }
440
441
    /**
     * Creates a list of URLs from input array (and submits them to queue if asked for)
     * See Web > Info module script + "indexed_search"'s crawler hook-client using this!
     *
     * @param array $vv Information about URLs from pageRow to crawl; 'URLs' and 'subCfg' are read
     * @param array $pageRow Page row; 'uid' is read
     * @param int $scheduledTime Unix time to schedule indexing to, typically time()
     * @param int $reqMinute Number of requests per minute (creates the interleave between requests);
     *                       values <= 0 disable the interleave
     * @param bool $submitCrawlUrls If set, submits the URLs to queue
     * @param bool $downloadCrawlUrls If set (and $submitCrawlUrls is false) will fill $downloadUrls with entries
     * @param array $duplicateTrack Passed by reference; holds one key per url/userGroups/procInstrFilter
     *                              combination to make sure duplicates are not crawled
     * @param array $downloadUrls Passed by reference; filled with URLs for download if the flag is set
     * @param array $incomingProcInstructions Array of processing instructions
     * @return string List of URLs (meant for display in backend module)
     */
    public function urlListFromUrlArray(
        array $vv,
        array $pageRow,
        $scheduledTime,
        $reqMinute,
        $submitCrawlUrls,
        $downloadCrawlUrls,
        array &$duplicateTrack,
        array &$downloadUrls,
        array $incomingProcInstructions
    ) {
        if (!is_array($vv['URLs'] ?? null)) {
            return 'ERROR - no URL generated';
        }
        $urlLog = [];
        $pageId = (int)$pageRow['uid'];
        $configurationHash = $this->getConfigurationHash($vv);
        $skipInnerCheck = $this->noUnprocessedQueueEntriesForPageWithConfigurationHashExist($pageId, $configurationHash);

        // Not every configuration source sets these keys; fall back to '' to avoid notices.
        $procInstrFilter = $vv['subCfg']['procInstrFilter'] ?? '';
        $userGroups = $vv['subCfg']['userGroups'] ?? '';

        // Seconds between two requests. Guarded so that a rate of 0 does not divide by zero.
        $secondsPerRequest = $reqMinute > 0 ? 60 / $reqMinute : 0;

        foreach ($vv['URLs'] as $urlQuery) {
            if (!$this->drawURLs_PIfilter($procInstrFilter, $incomingProcInstructions)) {
                continue;
            }
            $url = (string)$this->getUrlFromPageAndQueryParameters(
                $pageId,
                $urlQuery,
                $vv['subCfg']['baseUrl'] ?? null,
                $vv['subCfg']['force_ssl'] ?? 0
            );

            // Create key by which to determine unique-ness:
            $uKey = $url . '|' . $userGroups . '|' . $procInstrFilter;

            if (isset($duplicateTrack[$uKey])) {
                // if the url key is registered just display it and do not resubmit it
                $urlLog[] = '<em><span class="text-muted">' . htmlspecialchars($url) . '</span></em>';
            } else {
                // Scheduled time, rounded down to the full minute. Cast to int because
                // round()/floor() return floats and the formatter expects an integer timestamp.
                $schTime = $scheduledTime + round(count($duplicateTrack) * $secondsPerRequest);
                $schTime = (int)(floor($schTime / 60) * 60);
                $formattedDate = BackendUtility::datetime($schTime);
                $this->urlList[] = '[' . $formattedDate . '] ' . $url;
                $logEntry = '[' . $formattedDate . '] ' . htmlspecialchars($url);

                // Submit for crawling!
                if ($submitCrawlUrls) {
                    // NOTE(review): the un-interleaved $scheduledTime is submitted here, not
                    // $schTime; kept as in the original - confirm whether this is intended.
                    $added = $this->addUrl(
                        $pageId,
                        $url,
                        $vv['subCfg'],
                        $scheduledTime,
                        $configurationHash,
                        $skipInnerCheck
                    );
                    if ($added === false) {
                        $logEntry .= ' (URL already existed)';
                    }
                } elseif ($downloadCrawlUrls) {
                    $downloadUrls[$url] = $url;
                }
                $urlLog[] = $logEntry;
            }
            $duplicateTrack[$uKey] = true;
        }

        return implode('<br>', $urlLog);
    }
524
525
    /**
     * Returns true if at least one of the incoming processing instructions is
     * contained in the given comma-separated list, or if no incoming
     * processing instructions were given at all.
     *
     * @param string $piString Comma-separated list of processing instructions to test against
     * @param array $incomingProcInstructions Processing instructions
     * @return boolean
     */
    public function drawURLs_PIfilter($piString, array $incomingProcInstructions)
    {
        // No filter requested: everything passes
        if ($incomingProcInstructions === []) {
            return true;
        }

        foreach ($incomingProcInstructions as $requestedInstruction) {
            if (GeneralUtility::inList($piString, $requestedInstruction)) {
                return true;
            }
        }

        return false;
    }
545
546 3
    /**
     * Returns the page TSconfig for a page, after registered hooks had the
     * chance to alter it.
     *
     * If $this->MP is set, the TSconfig is read for the page id found after
     * the first "-" of the MP value instead of $id.
     *
     * Hooks registered in
     * $GLOBALS['TYPO3_CONF_VARS']['EXTCONF']['crawler']['getPageTSconfigForId']
     * receive 'pageId' and, by reference, 'pageTSConfig'.
     *
     * @param int $id Page ID
     * @return array
     */
    public function getPageTSconfigForId($id)
    {
        if (!$this->MP) {
            $pageTSconfig = BackendUtility::getPagesTSconfig($id);
        } else {
            // Use the second segment of the MP value; fall back to 0 if there is no "-".
            $mountPointParts = explode('-', (string)$this->MP);
            $mountPointId = (int)($mountPointParts[1] ?? 0);
            $pageTSconfig = BackendUtility::getPagesTSconfig($mountPointId);
        }

        // Call a hook to alter configuration
        $hooks = $GLOBALS['TYPO3_CONF_VARS']['EXTCONF']['crawler']['getPageTSconfigForId'] ?? [];
        if (is_array($hooks)) {
            $params = [
                'pageId' => $id,
                'pageTSConfig' => &$pageTSconfig,
            ];
            foreach ($hooks as $userFunc) {
                GeneralUtility::callUserFunction($userFunc, $params, $this);
            }
        }

        return $pageTSconfig;
    }
567
568
    /**
     * This method returns an array of configurations.
     * And no urls!
     *
     * Configurations are collected from two sources, in this order:
     * 1. page TSconfig (tx_crawler.crawlerCfg.paramSets.*), origin 'pagets'
     * 2. tx_crawler_configuration records up the rootline, origin
     *    'tx_crawler_configuration_<uid>'; these never overwrite a key
     *    already defined by page TSconfig
     * Finally the 'processUrls' hooks may alter the result by reference.
     *
     * @param int $pageId Page ID
     * @return array Keyed by configuration key; each entry holds 'subCfg',
     *               'paramParsed', 'paramExpanded', 'URLs' and 'origin'
     */
    public function getUrlsForPageId($pageId)
    {
        // Get page TSconfig for page ID
        $pageTSconfig = $this->getPageTSconfigForId($pageId);

        $res = [];

        // Fetch Crawler Configuration from pageTSconfig
        $crawlerCfg = $pageTSconfig['tx_crawler.']['crawlerCfg.']['paramSets.'] ?? [];
        foreach ($crawlerCfg as $key => $values) {
            if (!is_array($values)) {
                continue;
            }
            $key = str_replace('.', '', $key);
            // Sub configuration for a single configuration string:
            $subCfg = (array)$crawlerCfg[$key . '.'];
            $subCfg['key'] = $key;

            // Both options are optional in TSconfig. Cast before string handling:
            // under strict_types, strcmp() on an unset (null) value throws a TypeError.
            $procInstrFilter = (string)($subCfg['procInstrFilter'] ?? '');
            if ($procInstrFilter !== '') {
                $subCfg['procInstrFilter'] = implode(',', GeneralUtility::trimExplode(',', $procInstrFilter));
            }
            $pidsOnly = (string)($subCfg['pidsOnly'] ?? '');
            $pidOnlyList = implode(',', GeneralUtility::trimExplode(',', $pidsOnly, true));

            // process configuration if it is not page-specific or if the specific page is the current page:
            if ($pidsOnly !== '' && !GeneralUtility::inList($pidOnlyList, $pageId)) {
                continue;
            }

            // Explode, process etc.:
            $res[$key] = [];
            $res[$key]['subCfg'] = $subCfg;
            $res[$key]['paramParsed'] = GeneralUtility::explodeUrl2Array($crawlerCfg[$key] ?? '');
            $res[$key]['paramExpanded'] = $this->expandParameters($res[$key]['paramParsed'], $pageId);
            $res[$key]['origin'] = 'pagets';

            // recognize MP value
            $baseQuery = '?id=' . $pageId;
            if ($this->MP) {
                $baseQuery .= '&MP=' . $this->MP;
            }
            $res[$key]['URLs'] = $this->compileUrls($res[$key]['paramExpanded'], [$baseQuery]);
        }

        // Get configuration from tx_crawler_configuration records up the rootline
        $crawlerConfigurations = $this->configurationRepository->getCrawlerConfigurationRecordsFromRootLine($pageId);
        foreach ($crawlerConfigurations as $configurationRecord) {
            // check access to the configuration record
            $hasAccess = empty($configurationRecord['begroups'])
                || $this->getBackendUser()->isAdmin()
                || $this->hasGroupAccess(
                    $this->getBackendUser()->user['usergroup_cached_list'],
                    $configurationRecord['begroups']
                );
            if (!$hasAccess) {
                continue;
            }

            // process configuration if it is not page-specific or if the specific page is the current page:
            $pidsOnly = (string)($configurationRecord['pidsonly'] ?? '');
            $pidOnlyList = implode(',', GeneralUtility::trimExplode(',', $pidsOnly, true));
            if ($pidsOnly !== '' && !GeneralUtility::inList($pidOnlyList, $pageId)) {
                continue;
            }

            $key = $configurationRecord['name'];

            // don't overwrite previously defined paramSets
            if (isset($res[$key])) {
                continue;
            }

            /* @var $TSparserObject \TYPO3\CMS\Core\TypoScript\Parser\TypoScriptParser */
            $TSparserObject = GeneralUtility::makeInstance(TypoScriptParser::class);
            $TSparserObject->parse($configurationRecord['processing_instruction_parameters_ts']);

            $subCfg = [
                'procInstrFilter' => $configurationRecord['processing_instruction_filter'],
                'procInstrParams.' => $TSparserObject->setup,
                'baseUrl' => $configurationRecord['base_url'],
                'force_ssl' => (int)$configurationRecord['force_ssl'],
                'userGroups' => $configurationRecord['fegroups'],
                'exclude' => $configurationRecord['exclude'],
                'key' => $key,
            ];

            // Loose comparison kept on purpose: the element types returned by
            // expandExcludeString() are not visible here and $pageId may be int or string.
            if (in_array($pageId, $this->expandExcludeString($subCfg['exclude']))) {
                continue;
            }

            $res[$key] = [];
            $res[$key]['subCfg'] = $subCfg;
            $res[$key]['paramParsed'] = GeneralUtility::explodeUrl2Array($configurationRecord['configuration']);
            $res[$key]['paramExpanded'] = $this->expandParameters($res[$key]['paramParsed'], $pageId);
            $res[$key]['URLs'] = $this->compileUrls($res[$key]['paramExpanded'], ['?id=' . $pageId]);
            $res[$key]['origin'] = 'tx_crawler_configuration_' . $configurationRecord['uid'];
        }

        foreach ($GLOBALS['TYPO3_CONF_VARS']['EXTCONF']['crawler']['processUrls'] ?? [] as $func) {
            $params = [
                'res' => &$res,
            ];
            GeneralUtility::callUserFunction($func, $params, $this);
        }

        return $res;
    }
667
668
    /**
     * Collects the names of all crawler configurations available for a page branch.
     *
     * The result contains the paramSets keys from the page TSconfig of $rootid
     * (without trailing dot), followed by the names of all
     * tx_crawler_configuration records stored on the rootline pages of $rootid
     * or on pages of the subtree below it.
     *
     * @param int $rootid Page id the branch starts at
     * @param $depth Depth passed to the page tree when collecting subpages
     * @return array List of configuration names
     *
     * TODO: Write Functional Tests
     */
    public function getConfigurationsForBranch(int $rootid, $depth)
    {
        $configurationsForBranch = [];

        // Names defined via page TSconfig
        $pageTSconfig = $this->getPageTSconfigForId($rootid);
        $paramSets = $pageTSconfig['tx_crawler.']['crawlerCfg.']['paramSets.'] ?? [];
        foreach ($paramSets as $setName => $setConfiguration) {
            if (is_array($setConfiguration)) {
                $configurationsForBranch[] = substr($setName, -1) === '.' ? substr($setName, 0, -1) : $setName;
            }
        }

        // Page ids of the rootline ...
        $pids = [];
        foreach (BackendUtility::BEgetRootLine($rootid) as $rootLinePage) {
            $pids[] = $rootLinePage['uid'];
        }

        // ... and of the subtree the backend user is allowed to see
        /* @var PageTreeView $tree */
        $tree = GeneralUtility::makeInstance(PageTreeView::class);
        $perms_clause = $this->getBackendUser()->getPagePermsClause(Permission::PAGE_SHOW);
        $tree->init(empty($perms_clause) ? '' : ('AND ' . $perms_clause));
        $tree->getTree($rootid, $depth, '');
        foreach ($tree->tree as $treeNode) {
            $pids[] = $treeNode['row']['uid'];
        }

        // Names of the configuration records stored on those pages
        $queryBuilder = $this->getQueryBuilder('tx_crawler_configuration');
        $pidConstraint = $queryBuilder->expr()->in(
            'pid',
            $queryBuilder->createNamedParameter($pids, Connection::PARAM_INT_ARRAY)
        );
        $statement = $queryBuilder
            ->select('name')
            ->from('tx_crawler_configuration')
            ->where($pidConstraint)
            ->execute();

        while ($row = $statement->fetch()) {
            $configurationsForBranch[] = $row['name'];
        }

        return $configurationsForBranch;
    }
716
717
    /**
     * Returns a query builder for the given table, obtained from the connection pool.
     *
     * @param string $table Table name
     * @return \TYPO3\CMS\Core\Database\Query\QueryBuilder
     */
    private function getQueryBuilder(string $table)
    {
        $connectionPool = GeneralUtility::makeInstance(ConnectionPool::class);

        return $connectionPool->getQueryBuilderForTable($table);
    }
727
728
    /**
     * Checks whether a user, identified by its group list, may access an item.
     * (The group list of the current frontend user can e.g. be taken from $GLOBALS['TSFE']->gr_list.)
     *
     * @see \TYPO3\CMS\Frontend\Page\PageRepository::getMultipleGroupsWhereClause()
     * @param  string $groupList    Comma-separated list of (fe_)group UIDs the user belongs to
     * @param  string $accessList   Comma-separated list of (fe_)group UIDs that are allowed to access the item
     * @return bool                 TRUE if the access list is empty or contains at least one of the user's group UIDs
     */
    public function hasGroupAccess($groupList, $accessList)
    {
        // Nothing is required to access the item, so everybody passes.
        if (empty($accessList)) {
            return true;
        }

        $userGroupUids = GeneralUtility::intExplode(',', $groupList);
        foreach ($userGroupUids as $userGroupUid) {
            if (!GeneralUtility::inList($accessList, $userGroupUid)) {
                continue;
            }

            return true;
        }

        return false;
    }
749
750
    /**
     * Expands the parameter configuration into the individual values of each parameter.
     *
     * Syntax of a parameter value:
     * - A value wrapped in [...] is expanded as described below; any other value is taken literally.
     * - The content of the brackets is split by "|"; every part is processed on its own and the results are added together.
     * - For each part:
     *         - "[int]-[int]" = Integer range, expanded to all values in between (boundaries included, low to high,
     *           at most 1000 values). Example "1-34" or "-40--30"
     *         - "_TABLE:[TCA table name];[_PID:[page id, default is current page]];[_ENABLELANG:1]" = Look up of
     *           records of that table, skipping deleted records. Example "_TABLE:tt_content; _PID:123"
     *           Further optional sub-parts read by this method: _RECURSIVE (tree depth below the page),
     *           _PIDFIELD (field compared against the page ids, default "pid"), _FIELD (field whose values are
     *           collected, default "uid"), _WHERE (additional where clause) and _ADDTABLE (additional FROM part).
     *           _ENABLELANG:1 restricts the result to records whose translation pointer field is <= 0.
     *         - Default: Literal value
     * After each part the hooks registered in
     * $GLOBALS['TYPO3_CONF_VARS']['SC_OPTIONS']['crawler/class.tx_crawler_lib.php']['expandParameters'] are called.
     *
     * @param array $paramArray Array with key (GET var name) and values (value of GET var which is configuration for expansion)
     * @param integer $pid Current page ID
     * @return array Array with the same keys, each holding the list of expanded values
     *
     * TODO: Write Functional Tests
     */
    public function expandParameters($paramArray, $pid)
    {
        foreach ($paramArray as $p => $v) {
            $v = trim($v);

            // Values not wrapped in [...] are literal: they become the only entry of the value list.
            if (substr($v, 0, 1) !== '[' || substr($v, -1) !== ']') {
                $paramArray[$p] = [$v];
                continue;
            }

            // Strip the brackets and start with an empty value list for this parameter.
            $v = substr($v, 1, -1);
            $paramArray[$p] = [];

            foreach (explode('|', $v) as $pV) {
                if (preg_match('/^(-?[0-9]+)\s*-\s*(-?[0-9]+)$/', trim($pV), $reg)) {
                    // Integer range (e.g. "1-34" or "-40--30"): make sure the lower boundary comes first.
                    if ($reg[1] > $reg[2]) {
                        [$reg[1], $reg[2]] = [$reg[2], $reg[1]];
                    }

                    // Add every value of the range, but never more than 1000 of them.
                    $runAwayBrake = 1000;
                    for ($a = $reg[1]; $a <= $reg[2]; $a++) {
                        $paramArray[$p][] = $a;
                        $runAwayBrake--;
                        if ($runAwayBrake <= 0) {
                            break;
                        }
                    }
                } elseif (substr(trim($pV), 0, 7) === '_TABLE:') {
                    // Parse the "KEY:value" sub-parts; a sub-part without ":" yields a null value.
                    $subpartParams = [];
                    foreach (GeneralUtility::trimExplode(';', $pV) as $spV) {
                        $keyAndValue = GeneralUtility::trimExplode(':', $spV);
                        $subpartParams[$keyAndValue[0]] = $keyAndValue[1] ?? null;
                    }
                    $tableName = $subpartParams['_TABLE'] ?? '';

                    // Only tables known to the TCA are looked up.
                    if (isset($GLOBALS['TCA'][$tableName])) {
                        $lookUpPid = isset($subpartParams['_PID']) ? intval($subpartParams['_PID']) : intval($pid);
                        $recursiveDepth = isset($subpartParams['_RECURSIVE']) ? intval($subpartParams['_RECURSIVE']) : 0;
                        $pidField = isset($subpartParams['_PIDFIELD']) ? trim($subpartParams['_PIDFIELD']) : 'pid';
                        // NOTE(review): _WHERE and _ADDTABLE are passed to the query as they are written in the configuration.
                        $where = $subpartParams['_WHERE'] ?? '';
                        $addTable = $subpartParams['_ADDTABLE'] ?? '';

                        // All of the following keys are optional, so they must not be read unguarded.
                        $fieldName = !empty($subpartParams['_FIELD']) ? $subpartParams['_FIELD'] : 'uid';
                        if ($fieldName === 'uid' || !empty($GLOBALS['TCA'][$tableName]['columns'][$fieldName])) {
                            $queryBuilder = $this->getQueryBuilder($tableName);

                            if ($recursiveDepth > 0) {
                                /** @var \TYPO3\CMS\Core\Database\QueryGenerator $queryGenerator */
                                $queryGenerator = GeneralUtility::makeInstance(\TYPO3\CMS\Core\Database\QueryGenerator::class);
                                $pidList = $queryGenerator->getTreeList($lookUpPid, $recursiveDepth, 0, 1);
                                $pidArray = GeneralUtility::intExplode(',', $pidList);
                            } else {
                                $pidArray = [(string)$lookUpPid];
                            }

                            // Only the "deleted" restriction is kept for the lookup.
                            $queryBuilder->getRestrictions()
                                ->removeAll()
                                ->add(GeneralUtility::makeInstance(DeletedRestriction::class));

                            $queryBuilder
                                ->select($fieldName)
                                ->from($tableName)
                                ->where(
                                    $queryBuilder->expr()->in($pidField, $queryBuilder->createNamedParameter($pidArray, Connection::PARAM_INT_ARRAY)),
                                    $where
                                );
                            if (!empty($addTable)) {
                                // TODO: Check if this works as intended!
                                $queryBuilder->add('from', $addTable);
                            }

                            $transOrigPointerField = $GLOBALS['TCA'][$tableName]['ctrl']['transOrigPointerField'] ?? null;
                            if (!empty($subpartParams['_ENABLELANG']) && $transOrigPointerField) {
                                $queryBuilder->andWhere(
                                    $queryBuilder->expr()->lte(
                                        $transOrigPointerField,
                                        0
                                    )
                                );
                            }

                            $statement = $queryBuilder->execute();

                            // Index the rows by the selected field so every value is collected only once.
                            $rows = [];
                            while ($row = $statement->fetch()) {
                                $rows[$row[$fieldName]] = $row;
                            }

                            $paramArray[$p] = array_merge($paramArray[$p], array_keys($rows));
                        }
                    }
                } else {
                    // Anything else is a literal value.
                    $paramArray[$p][] = $pV;
                }

                // Hook for processing own expandParameters place holders; the registration is optional.
                $expandHooks = $GLOBALS['TYPO3_CONF_VARS']['SC_OPTIONS']['crawler/class.tx_crawler_lib.php']['expandParameters'] ?? null;
                if (is_array($expandHooks)) {
                    $_params = [
                        'pObj' => &$this,
                        'paramArray' => &$paramArray,
                        'currentKey' => $p,
                        'currentValue' => $pV,
                        'pid' => $pid,
                    ];
                    foreach ($expandHooks as $_funcRef) {
                        GeneralUtility::callUserFunction($_funcRef, $_params, $this);
                    }
                }
            }

            // Remove duplicate values of this parameter and sort the parameter array by its keys.
            $paramArray[$p] = array_unique($paramArray[$p]);
            ksort($paramArray);
        }

        return $paramArray;
    }
900
901
    /**
     * Compiles URLs from a parameter array (output of expandParameters()).
     *
     * Every URL is combined with every value of the first parameter, then the method recurses with the
     * remaining parameters. The number of URLs is therefore the product of the number of values of all
     * parameters, capped per recursion level at $this->maximumUrlsToCompile.
     * An empty value does not add anything to the URL.
     *
     * @param array $paramArray Output of expandParameters(): Array with keys (GET var names) and for each an array of values
     * @param array $urls URLs accumulated in this array (for recursion)
     * @return array
     */
    public function compileUrls($paramArray, array $urls)
    {
        if (empty($paramArray)) {
            return $urls;
        }

        // Take the first parameter (name and value set) off the stack.
        reset($paramArray);
        $varName = key($paramArray);
        $valueSet = array_shift($paramArray);

        $newUrls = [];
        foreach ($urls as $url) {
            foreach ($valueSet as $val) {
                // Leave BOTH loops once the limit is reached; breaking only the inner loop would
                // let every remaining base URL add yet another entry.
                if (count($newUrls) >= $this->maximumUrlsToCompile) {
                    break 2;
                }

                $value = (string)$val;
                $newUrls[] = $value === ''
                    ? $url
                    : $url . '&' . rawurlencode((string)$varName) . '=' . rawurlencode($value);
            }
        }

        return $this->compileUrls($paramArray, $newUrls);
    }
932
933
    /************************************
934
     *
935
     * Crawler log
936
     *
937
     ************************************/
938
939
    /**
     * Returns the crawler queue records of a page, or deletes them when flushing is requested.
     *
     * @param integer $id Page ID for which to look up log entries.
     * @param string $filter Filter: "pending" => entries that have not been run yet (exec_time = 0), "finished" => entries that have been run (exec_time > 0), anything else => all entries
     * @param boolean $doFlush If TRUE, the matching entries are DELETED(!) instead of returned
     * @param boolean $doFullFlush If TRUE (together with $doFlush), the delete is not restricted to the matching entries
     * @param integer $itemsPerPage Maximum number of returned entries, default is 10; values <= 0 disable the limit
     * @return array The selected queue records; always an empty array when flushing
     */
    public function getLogEntriesForPageId($id, $filter = '', $doFlush = false, $doFullFlush = false, $itemsPerPage = 10)
    {
        $queryBuilder = $this->getQueryBuilder($this->tableName);
        $queryBuilder
            ->select('*')
            ->from($this->tableName)
            ->where(
                $queryBuilder->expr()->eq('page_id', $queryBuilder->createNamedParameter($id, \PDO::PARAM_INT))
            )
            ->orderBy('scheduled', 'DESC');

        $expressionBuilder = GeneralUtility::makeInstance(ConnectionPool::class)
            ->getConnectionForTable($this->tableName)
            ->getExpressionBuilder();
        // AND-combined constraint that mirrors the select filter; it is only used for flushing.
        $flushConstraint = $expressionBuilder->andX();

        switch ($filter) {
            case 'pending':
                $queryBuilder->andWhere($queryBuilder->expr()->eq('exec_time', 0));
                $flushConstraint->add($expressionBuilder->eq('exec_time', 0));
                break;
            case 'finished':
                $queryBuilder->andWhere($queryBuilder->expr()->gt('exec_time', 0));
                $flushConstraint->add($expressionBuilder->gt('exec_time', 0));
                break;
        }

        // FIXME: Write unit test that ensures that the right records are deleted.
        if ($doFlush) {
            $flushConstraint->add($expressionBuilder->eq('page_id', (int)$id));
            $this->flushQueue($doFullFlush ? '1=1' : $flushConstraint);

            return [];
        }

        if ($itemsPerPage > 0) {
            $queryBuilder->setMaxResults((int)$itemsPerPage);
        }

        return $queryBuilder->execute()->fetchAll();
    }
993
994
    /**
     * Returns the crawler queue records of a set, or deletes them when flushing is requested.
     *
     * @param integer $set_id Set ID for which to look up log entries.
     * @param string $filter Filter: "pending" => entries that have not been run yet (exec_time = 0), "finished" => entries that have been run (exec_time > 0), anything else => all entries
     * @param boolean $doFlush If TRUE, the matching entries are DELETED(!) instead of returned
     * @param boolean $doFullFlush If TRUE (together with $doFlush), the delete is not restricted to the matching entries
     * @param integer $itemsPerPage Maximum number of returned entries, default is 10; values <= 0 disable the limit
     * @return array The selected queue records; always an empty array when flushing
     */
    public function getLogEntriesForSetId($set_id, $filter = '', $doFlush = false, $doFullFlush = false, $itemsPerPage = 10)
    {
        $queryBuilder = $this->getQueryBuilder($this->tableName);
        $queryBuilder
            ->select('*')
            ->from($this->tableName)
            ->where(
                $queryBuilder->expr()->eq('set_id', $queryBuilder->createNamedParameter($set_id, \PDO::PARAM_INT))
            )
            ->orderBy('scheduled', 'DESC');

        $expressionBuilder = GeneralUtility::makeInstance(ConnectionPool::class)
            ->getConnectionForTable($this->tableName)
            ->getExpressionBuilder();
        // AND-combined constraint that mirrors the select filter; it is only used for flushing.
        $flushConstraint = $expressionBuilder->andX();

        // FIXME: Write Unit tests for Filters
        switch ($filter) {
            case 'pending':
                $queryBuilder->andWhere($queryBuilder->expr()->eq('exec_time', 0));
                $flushConstraint->add($expressionBuilder->eq('exec_time', 0));
                break;
            case 'finished':
                $queryBuilder->andWhere($queryBuilder->expr()->gt('exec_time', 0));
                $flushConstraint->add($expressionBuilder->gt('exec_time', 0));
                break;
        }

        // FIXME: Write unit test that ensures that the right records are deleted.
        if ($doFlush) {
            $flushConstraint->add($expressionBuilder->eq('set_id', (int)$set_id));
            // An empty where clause makes flushQueue() fall back to "1=1".
            $this->flushQueue($doFullFlush ? '' : $flushConstraint);

            return [];
        }

        if ($itemsPerPage > 0) {
            $queryBuilder->setMaxResults((int)$itemsPerPage);
        }

        return $queryBuilder->execute()->fetchAll();
    }
1047
1048
    /**
     * Removes queue entries.
     *
     * If an observer is registered for "queueEntryFlush", the event is posted once per affected set id
     * (together with the entries of that set) before the entries are deleted.
     *
     * @param string $where SQL related filter for the entries which should be removed; an empty filter removes all entries
     * @return void
     */
    protected function flushQueue($where = ''): void
    {
        $realWhere = strlen((string) $where) > 0 ? $where : '1=1';

        $eventDispatcher = EventDispatcher::getInstance();
        if ($eventDispatcher->hasObserver('queueEntryFlush')) {
            // Every statement gets its own query builder so no query parts leak from one statement into the next.
            $groupQueryBuilder = $this->getQueryBuilder($this->tableName);
            $groups = $groupQueryBuilder
                // select() would quote "DISTINCT set_id" as one identifier, hence the literal select.
                ->selectLiteral('DISTINCT ' . $groupQueryBuilder->quoteIdentifier('set_id'))
                ->from($this->tableName)
                ->where($realWhere)
                ->execute()
                ->fetchAll();

            foreach ($groups as $group) {
                $subSetQueryBuilder = $this->getQueryBuilder($this->tableName);
                // NOTE(review): other methods of this class address queue rows via "qid" - confirm that the
                // queue table really has a "uid" column before relying on this payload.
                $subSet = $subSetQueryBuilder
                    ->select('uid', 'set_id')
                    ->from($this->tableName)
                    ->where(
                        $realWhere,
                        $subSetQueryBuilder->expr()->eq(
                            'set_id',
                            $subSetQueryBuilder->createNamedParameter((int)$group['set_id'], \PDO::PARAM_INT)
                        )
                    )
                    ->execute()
                    ->fetchAll();
                $eventDispatcher->post('queueEntryFlush', $group['set_id'], $subSet);
            }
        }

        $this->getQueryBuilder($this->tableName)
            ->delete($this->tableName)
            ->where($realWhere)
            ->execute();
    }
1088
1089
    /**
     * Adds a call back entry to the queue (typically called from hooks, see indexed search class "class.crawler.php").
     *
     * @param integer $setId Set ID
     * @param array $params Parameters to pass to the call back function; a non-array value is replaced by an empty array
     * @param string $callBack Call back object reference, eg. 'EXT:indexed_search/class.crawler.php:&tx_indexedsearch_crawler'
     * @param integer $page_id Page ID to attach it to
     * @param integer $schedule Time at which to activate; 0 means the current time
     * @return void
     */
    public function addQueueEntry_callBack($setId, $params, $callBack, $page_id = 0, $schedule = 0): void
    {
        $callBackParameters = is_array($params) ? $params : [];
        // The call back reference travels inside the serialized parameters of the entry.
        $callBackParameters['_CALLBACKOBJ'] = $callBack;

        $queueConnection = GeneralUtility::makeInstance(ConnectionPool::class)->getConnectionForTable('tx_crawler_queue');

        $queueRecord = [
            'page_id' => (int)$page_id,
            'parameters' => serialize($callBackParameters),
            'scheduled' => (int)$schedule ?: $this->getCurrentTime(),
            'exec_time' => 0,
            'set_id' => (int)$setId,
            'result_data' => '',
        ];

        $queueConnection->insert('tx_crawler_queue', $queueRecord);
    }
1119
1120
    /************************************
1121
     *
1122
     * URL setting
1123
     *
1124
     ************************************/
1125
1126
    /**
     * Adds a URL to the crawler queue.
     *
     * Depending on $this->registerQueueEntriesInternallyOnly the entry is either only collected in
     * $this->queueEntries or written to the queue table. When written, an "urlAddedToQueue" event is posted;
     * if an equal entry already exists, nothing is inserted and "duplicateUrlInQueue" is posted instead.
     *
     * @param integer $id Page ID
     * @param string $url Complete URL
     * @param array $subCfg Sub configuration array (from TS config); the keys "userGroups", "procInstrFilter", "procInstrParams." and "key" are read and may be missing
     * @param integer $tstamp Scheduled-time
     * @param string $configurationHash (optional) configuration hash
     * @param bool $skipInnerDuplicationCheck (optional) skip inner duplication check
     * @return bool TRUE if a new entry has been inserted into the queue table
     */
    public function addUrl(
        $id,
        $url,
        array $subCfg,
        $tstamp,
        $configurationHash = '',
        $skipInnerDuplicationCheck = false
    ) {
        $urlAdded = false;
        $rows = [];

        // Creating parameters:
        $parameters = [
            'url' => $url,
        ];

        // fe user group simulation:
        $uGs = implode(',', array_unique(GeneralUtility::intExplode(',', $subCfg['userGroups'] ?? '', true)));
        if ($uGs) {
            $parameters['feUserGroupList'] = $uGs;
        }

        // Setting processing instructions
        $parameters['procInstructions'] = GeneralUtility::trimExplode(',', $subCfg['procInstrFilter'] ?? '');
        $procInstrParams = $subCfg['procInstrParams.'] ?? null;
        if (is_array($procInstrParams)) {
            $parameters['procInstrParams'] = $procInstrParams;
        }

        // Compile value array:
        $parameters_serialized = serialize($parameters);
        $fieldArray = [
            'page_id' => (int)$id,
            'parameters' => $parameters_serialized,
            'parameters_hash' => GeneralUtility::shortMD5($parameters_serialized),
            'configuration_hash' => $configurationHash,
            'scheduled' => $tstamp,
            'exec_time' => 0,
            'set_id' => (int)$this->setID,
            'result_data' => '',
            'configuration' => $subCfg['key'] ?? null,
        ];

        if ($this->registerQueueEntriesInternallyOnly) {
            // The entries will only be registered and not stored to the database
            $this->queueEntries[] = $fieldArray;

            return $urlAdded;
        }

        if (!$skipInnerDuplicationCheck) {
            // Check if there is already an equal entry
            $rows = $this->getDuplicateRowsIfExist($tstamp, $fieldArray);
        }

        if (empty($rows)) {
            $connectionForCrawlerQueue = GeneralUtility::makeInstance(ConnectionPool::class)->getConnectionForTable('tx_crawler_queue');
            $connectionForCrawlerQueue->insert(
                'tx_crawler_queue',
                $fieldArray
            );
            $uid = $connectionForCrawlerQueue->lastInsertId('tx_crawler_queue', 'qid');
            $urlAdded = true;
            EventDispatcher::getInstance()->post('urlAddedToQueue', $this->setID, ['uid' => $uid, 'fieldArray' => $fieldArray]);
        } else {
            EventDispatcher::getInstance()->post('duplicateUrlInQueue', $this->setID, ['rows' => $rows, 'fieldArray' => $fieldArray]);
        }

        return $urlAdded;
    }
1205
1206
    /**
     * Looks up not yet processed queue entries that duplicate the given entry (same page and same parameters hash).
     *
     * - Timestamp not in the future: every entry scheduled up to now counts as a duplicate; with the
     *   "enableTimeslot" extension setting, entries scheduled within 100 seconds around now count as well.
     * - Timestamp in the future: only entries with exactly the same schedule time count as duplicates.
     *
     * @param int $tstamp
     * @param array $fieldArray
     *
     * @return array List of the "qid" values of the duplicates
     */
    protected function getDuplicateRowsIfExist($tstamp, $fieldArray)
    {
        $duplicateQueueIds = [];
        $now = $this->getCurrentTime();

        $queryBuilder = GeneralUtility::makeInstance(ConnectionPool::class)->getQueryBuilderForTable($this->tableName);
        $queryBuilder->select('qid');
        $queryBuilder->from('tx_crawler_queue');

        if ($tstamp <= $now) {
            // The entry is scheduled with "now".
            if ($this->extensionSettings['enableTimeslot']) {
                $timeslotStart = $now - 100;
                $timeslotEnd = $now + 100;
                $queryBuilder->where('scheduled BETWEEN ' . $timeslotStart . ' AND ' . $timeslotEnd . '');
                $queryBuilder->orWhere($queryBuilder->expr()->lte('scheduled', $now));
            } else {
                $queryBuilder->where($queryBuilder->expr()->lte('scheduled', $now));
            }
        } elseif ($tstamp > $now) {
            // An entry with a timestamp in the future needs to have the same schedule time.
            $queryBuilder->where($queryBuilder->expr()->eq('scheduled', $tstamp));
        }

        // Only entries that are neither executed nor assigned to a process.
        $queryBuilder->andWhere('NOT exec_time');
        $queryBuilder->andWhere('NOT process_id');

        $pageIdParameter = $queryBuilder->createNamedParameter($fieldArray['page_id'], \PDO::PARAM_INT);
        $queryBuilder->andWhere($queryBuilder->expr()->eq('page_id', $pageIdParameter));

        $parametersHashParameter = $queryBuilder->createNamedParameter($fieldArray['parameters_hash'], \PDO::PARAM_STR);
        $queryBuilder->andWhere($queryBuilder->expr()->eq('parameters_hash', $parametersHashParameter));

        $statement = $queryBuilder->execute();
        while ($duplicate = $statement->fetch()) {
            $duplicateQueueIds[] = $duplicate['qid'];
        }

        return $duplicateQueueIds;
    }
1267
1268
    /**
     * Returns the current system time as Unix timestamp.
     *
     * @return int
     */
    public function getCurrentTime()
    {
        $now = time();

        return $now;
    }
1277
1278
    /************************************
1279
     *
1280
     * URL reading
1281
     *
1282
     ************************************/
1283
1284
    /**
     * Reads the URL of a single queue entry and stores the result in the entry.
     *
     * The entry is locked by setting its exec_time before it is executed; the serialized result is written
     * to result_data afterwards. Signals are emitted before and after the processing.
     *
     * @param integer $queueId
     * @param boolean $force If set, will process even if exec_time has been set!
     * @return integer|null Bit mask with self::CLI_STATUS_POLLABLE_PROCESSED set if a registered "pollSuccess"
     *                      instruction reported success, 0 otherwise; NULL if no matching queue entry was found
     */
    public function readUrl($queueId, $force = false)
    {
        $queryBuilder = $this->getQueryBuilder($this->tableName);
        $ret = 0;
        $this->logger->debug('crawler-readurl start ' . microtime(true));

        // Get entry:
        $queryBuilder
            ->select('*')
            ->from('tx_crawler_queue')
            ->where(
                $queryBuilder->expr()->eq('qid', $queryBuilder->createNamedParameter($queueId, \PDO::PARAM_INT))
            );
        if (!$force) {
            $queryBuilder
                ->andWhere('exec_time = 0')
                ->andWhere('process_scheduled > 0');
        }
        $queueRec = $queryBuilder->execute()->fetch();

        if (!is_array($queueRec)) {
            return null;
        }

        SignalSlotUtility::emitSignal(
            __CLASS__,
            SignalSlotUtility::SIGNNAL_QUEUEITEM_PREPROCESS,
            [$queueId, &$queueRec]
        );

        // Set exec_time to lock record:
        $field_array = ['exec_time' => $this->getCurrentTime()];

        if (isset($this->processID)) {
            // If multiprocessing is used we need to store the id of the process which has handled this entry
            $field_array['process_id_completed'] = $this->processID;
        }

        GeneralUtility::makeInstance(ConnectionPool::class)->getConnectionForTable('tx_crawler_queue')
            ->update(
                'tx_crawler_queue',
                $field_array,
                ['qid' => (int)$queueId]
            );

        $result = $this->queueExecutor->executeQueueItem($queueRec, $this);

        // The executor result is not guaranteed to be an array with a string "content"; only unserialize
        // what really is a string (unserialize() rejects anything else under strict types).
        // NOTE(review): unserialize() runs on content returned by the queue executor - consider restricting
        // it with ['allowed_classes' => false] if no objects are expected.
        $content = is_array($result) ? ($result['content'] ?? null) : null;
        $resultData = is_string($content) ? unserialize($content) : false;

        // atm there's no need to point to specific pollable extensions
        $pollSuccess = $GLOBALS['TYPO3_CONF_VARS']['EXTCONF']['crawler']['pollSuccess'] ?? null;
        $procInstructions = is_array($resultData) ? ($resultData['parameters']['procInstructions'] ?? null) : null;
        if (is_array($pollSuccess) && is_array($procInstructions)) {
            foreach ($pollSuccess as $pollable) {
                // Only check the success value if the instruction is running;
                // it is important to name the pollSuccess key same as the procInstructions key
                if (in_array($pollable, $procInstructions) && !empty($resultData['success'][$pollable])) {
                    $ret |= self::CLI_STATUS_POLLABLE_PROCESSED;
                }
            }
        }

        // Set result in log which also denotes the end of the processing of this entry.
        $field_array = ['result_data' => serialize($result)];

        SignalSlotUtility::emitSignal(
            __CLASS__,
            SignalSlotUtility::SIGNNAL_QUEUEITEM_POSTPROCESS,
            [$queueId, &$field_array]
        );

        GeneralUtility::makeInstance(ConnectionPool::class)->getConnectionForTable('tx_crawler_queue')
            ->update(
                'tx_crawler_queue',
                $field_array,
                ['qid' => (int)$queueId]
            );

        $this->logger->debug('crawler-readurl stop ' . microtime(true));

        return $ret;
    }
1374
1375
    /**
     * Reads the URL of a log entry that has not been inserted yet.
     *
     * The entry is inserted with exec_time set to the current time, executed, and finally updated with the
     * serialized result.
     *
     * @param array $field_array Queue field array,
     *
     * @return mixed The result handed back by the queue executor.
     *               NOTE(review): previously documented as string, but static analysis reports array|bool - confirm.
     */
    public function readUrlFromArray($field_array)
    {
        // Set exec_time to lock record:
        $field_array['exec_time'] = $this->getCurrentTime();

        $queueConnection = GeneralUtility::makeInstance(ConnectionPool::class)->getConnectionForTable($this->tableName);
        $queueConnection->insert($this->tableName, $field_array);

        // The executor needs to know the id the entry has just been stored under.
        $queueId = $queueConnection->lastInsertId($this->tableName, 'qid');
        $field_array['qid'] = $queueId;

        $result = $this->queueExecutor->executeQueueItem($field_array, $this);

        // Set result in log which also denotes the end of the processing of this entry.
        $resultFields = ['result_data' => serialize($result)];

        SignalSlotUtility::emitSignal(
            __CLASS__,
            SignalSlotUtility::SIGNNAL_QUEUEITEM_POSTPROCESS,
            [$queueId, &$resultFields]
        );

        $queueConnection->update($this->tableName, $resultFields, ['qid' => $queueId]);

        return $result;
    }
1412
1413
    /*****************************
1414
     *
1415
     * Compiling URLs to crawl - tools
1416
     *
1417
     *****************************/
1418
1419
    /**
     * Walks the page tree below (and including) the given page and compiles the
     * URL rows for every page found, following mount points.
     *
     * The scheduling and submission arguments are stored on the instance because
     * drawURLs_addRowsForPage() reads them from there.
     *
     * @param integer $id Root page id to start from.
     * @param integer $depth Depth of tree, 0=only id-page, 1= on sublevel, 99 = infinite
     * @param integer $scheduledTime Unix Time when the URL is timed to be visited when put in queue
     * @param integer $reqMinute Number of requests per minute (creates the interleave between requests)
     * @param boolean $submitCrawlUrls If set, submits the URLs to queue in database (real crawling)
     * @param boolean $downloadCrawlUrls If set (and submitcrawlUrls is false) will fill $downloadUrls with entries)
     * @param array $incomingProcInstructions Array of processing instructions
     * @param array $configurationSelection Array of configuration keys
     * @return string Concatenated HTML table rows for all visited pages
     */
    public function getPageTreeAndUrls(
        $id,
        $depth,
        $scheduledTime,
        $reqMinute,
        $submitCrawlUrls,
        $downloadCrawlUrls,
        array $incomingProcInstructions,
        array $configurationSelection
    ) {
        $this->scheduledTime = $scheduledTime;
        $this->reqMinute = $reqMinute;
        $this->submitCrawlUrls = $submitCrawlUrls;
        $this->downloadCrawlUrls = $downloadCrawlUrls;
        $this->incomingProcInstructions = $incomingProcInstructions;
        $this->incomingConfigurationSelection = $configurationSelection;

        $this->duplicateTrack = [];
        $this->downloadUrls = [];

        // Drawing tree:
        /* @var PageTreeView $tree */
        $tree = GeneralUtility::makeInstance(PageTreeView::class);
        $perms_clause = $this->getBackendUser()->getPagePermsClause(Permission::PAGE_SHOW);
        $tree->init('AND ' . $perms_clause);

        $pageInfo = BackendUtility::readPageAccess($id, $perms_clause);
        if (is_array($pageInfo)) {
            // Set root row:
            $tree->tree[] = [
                'row' => $pageInfo,
                'HTML' => $this->iconFactory->getIconForRecord('pages', $pageInfo, Icon::SIZE_SMALL),
            ];
        }

        // Get branch beneath:
        if ($depth) {
            $tree->getTree($id, $depth, '');
        }

        // Traverse page tree:
        $code = '';

        foreach ($tree->tree as $data) {
            $this->MP = false;

            // recognize mount points
            if ((int)$data['row']['doktype'] === PageRepository::DOKTYPE_MOUNTPOINT) {
                // The lookup targets "pages", so the builder is requested for that table.
                $queryBuilder = GeneralUtility::makeInstance(ConnectionPool::class)->getQueryBuilderForTable('pages');
                $queryBuilder->getRestrictions()->removeAll()->add(GeneralUtility::makeInstance(DeletedRestriction::class));
                $mountRows = $queryBuilder
                    ->select('*')
                    ->from('pages')
                    ->where(
                        $queryBuilder->expr()->eq('uid', $queryBuilder->createNamedParameter($data['row']['uid'], \PDO::PARAM_INT))
                    )
                    ->execute()
                    ->fetchAll();
                $mountPage = $mountRows[0] ?? null;

                // Without a row there is nothing to mount; the page is then handled like a regular page.
                if (is_array($mountPage)) {
                    // fetch mounted pages
                    $this->MP = $mountPage['mount_pid'] . '-' . $data['row']['uid'];

                    $mountTree = GeneralUtility::makeInstance(PageTreeView::class);
                    $mountTree->init('AND ' . $perms_clause);
                    $mountTree->getTree($mountPage['mount_pid'], $depth);

                    foreach ($mountTree->tree as $mountData) {
                        $code .= $this->drawURLs_addRowsForPage(
                            $mountData['row'],
                            $mountData['HTML'] . BackendUtility::getRecordTitle('pages', $mountData['row'], true)
                        );
                    }

                    // replace page when mount_pid_ol is enabled
                    if ($mountPage['mount_pid_ol']) {
                        $data['row']['uid'] = $mountPage['mount_pid'];
                    } else {
                        // if the mount_pid_ol is not set the MP must not be used for the mountpoint page
                        $this->MP = false;
                    }
                }
            }

            $code .= $this->drawURLs_addRowsForPage(
                $data['row'],
                $data['HTML'] . BackendUtility::getRecordTitle('pages', $data['row'], true)
            );
        }

        return $code;
    }
1521
1522
    /**
     * Expands an exclude string into the list of page ids it covers.
     *
     * The string is a comma separated list of entries of the form "pid",
     * "pid+" or "pid+depth": a bare pid excludes only that page, a trailing
     * "+" without depth means depth 99, and a depth greater than zero adds
     * the pages found by the page tree below that pid.
     *
     * Results are memoized per exclude string for the lifetime of the request.
     *
     * @param string $excludeString Exclude string
     * @return array List of excluded page ids (duplicates removed)
     */
    public function expandExcludeString($excludeString)
    {
        // internal static caches;
        static $expandedExcludeStringCache = [];
        static $treeCache = [];

        $cacheKey = (string)$excludeString;

        // isset() instead of empty(): an empty result list is a valid cache hit as well.
        if (isset($expandedExcludeStringCache[$cacheKey])) {
            return $expandedExcludeStringCache[$cacheKey];
        }

        $pidList = [];

        if (!empty($excludeString)) {
            /** @var PageTreeView $tree */
            $tree = GeneralUtility::makeInstance(PageTreeView::class);
            $tree->init('AND ' . $this->getBackendUser()->getPagePermsClause(Permission::PAGE_SHOW));

            // Empty list items (e.g. a trailing comma) are dropped instead of becoming an empty pid.
            $excludeParts = GeneralUtility::trimExplode(',', $excludeString, true);

            foreach ($excludeParts as $excludePart) {
                // Pad so that entries without "+" do not read an undefined index.
                [$pid, $depth] = array_pad(GeneralUtility::trimExplode('+', $excludePart), 2, '');

                // default is "page only" = "depth=0"
                if (empty($depth)) {
                    $depth = strpos($excludePart, '+') !== false ? 99 : 0;
                }
                $depth = (int)$depth;

                $pidList[] = $pid;

                if ($depth > 0) {
                    if (empty($treeCache[$pid][$depth])) {
                        $tree->reset();
                        $tree->getTree((int)$pid, $depth);
                        $treeCache[$pid][$depth] = $tree->tree;
                    }

                    foreach ($treeCache[$pid][$depth] as $data) {
                        $pidList[] = $data['row']['uid'];
                    }
                }
            }
        }

        $expandedExcludeStringCache[$cacheKey] = array_unique($pidList);

        return $expandedExcludeStringCache[$cacheKey];
    }
1573
1574
    /**
     * Create the rows for display of the page tree
     * For each page a number of rows are shown displaying GET variable configuration
     *
     * Configurations not contained in the incoming configuration selection (if any)
     * are dropped first. Pages listed in a configuration's "exclude" setting get a
     * "No entries" row for that configuration.
     *
     * @param array $pageRow Page row
     * @param string $pageTitleAndIcon Page icon and title for row
     * @return string HTML <tr> content (one or more)
     */
    public function drawURLs_addRowsForPage(array $pageRow, $pageTitleAndIcon)
    {
        $skipMessage = '';

        // Get list of configurations
        $configurations = $this->getUrlsForPageRow($pageRow, $skipMessage);

        if (!empty($this->incomingConfigurationSelection)) {
            // remove configuration that does not match the current selection
            foreach ($configurations as $confKey => $confArray) {
                if (!in_array($confKey, $this->incomingConfigurationSelection)) {
                    unset($configurations[$confKey]);
                }
            }
        }

        // Traverse parameter combinations:
        $c = 0;
        $content = '';
        if (!empty($configurations)) {
            foreach ($configurations as $confKey => $confArray) {
                // Numeric array keys arrive as integers; htmlspecialchars() needs a string under strict_types.
                $confKeyLabel = htmlspecialchars((string)$confKey);

                // Title column: only the first row carries it, spanning all configuration rows.
                if (!$c) {
                    $titleClm = '<td rowspan="' . count($configurations) . '">' . $pageTitleAndIcon . '</td>';
                } else {
                    $titleClm = '';
                }

                // "exclude" is optional in the sub configuration.
                $excludedPageIds = $this->expandExcludeString($confArray['subCfg']['exclude'] ?? '');

                if (!in_array($pageRow['uid'], $excludedPageIds)) {
                    // URL list:
                    $urlList = $this->urlListFromUrlArray(
                        $confArray,
                        $pageRow,
                        $this->scheduledTime,
                        $this->reqMinute,
                        $this->submitCrawlUrls,
                        $this->downloadCrawlUrls,
                        $this->duplicateTrack,
                        $this->downloadUrls,
                        $this->incomingProcInstructions // if empty the urls won't be filtered by processing instructions
                    );

                    // Expanded parameters:
                    $paramExpanded = '';
                    $calcAccu = [];
                    $calcRes = 1;
                    foreach ($confArray['paramExpanded'] as $gVar => $gVal) {
                        $paramExpanded .= '
                            <tr>
                                <td>' . htmlspecialchars('&' . $gVar . '=') . '<br/>' .
                                '(' . count($gVal) . ')' .
                                '</td>
                                <td nowrap="nowrap">' . nl2br(htmlspecialchars(implode(chr(10), $gVal))) . '</td>
                            </tr>
                        ';
                        $calcRes *= count($gVal);
                        $calcAccu[] = count($gVal);
                    }
                    $paramExpanded = '<table>' . $paramExpanded . '</table>';
                    $paramExpanded .= 'Comb: ' . implode('*', $calcAccu) . '=' . $calcRes;

                    // Options (both keys are optional)
                    $optionValues = '';
                    if (!empty($confArray['subCfg']['userGroups'])) {
                        $optionValues .= 'User Groups: ' . $confArray['subCfg']['userGroups'] . '<br/>';
                    }
                    if (!empty($confArray['subCfg']['procInstrFilter'])) {
                        $optionValues .= 'ProcInstr: ' . $confArray['subCfg']['procInstrFilter'] . '<br/>';
                    }

                    // Compile row:
                    $content .= '
                        <tr>
                            ' . $titleClm . '
                            <td>' . $confKeyLabel . '</td>
                            <td>' . nl2br(htmlspecialchars(rawurldecode(trim(str_replace('&', chr(10) . '&', GeneralUtility::implodeArrayForUrl('', $confArray['paramParsed'])))))) . '</td>
                            <td>' . $paramExpanded . '</td>
                            <td nowrap="nowrap">' . $urlList . '</td>
                            <td nowrap="nowrap">' . $optionValues . '</td>
                            <td nowrap="nowrap">' . DebugUtility::viewArray($confArray['subCfg']['procInstrParams.']) . '</td>
                        </tr>';
                } else {
                    $content .= '<tr>
                            ' . $titleClm . '
                            <td>' . $confKeyLabel . '</td>
                            <td colspan="5"><em>No entries</em> (Page is excluded in this configuration)</td>
                        </tr>';
                }

                $c++;
            }
        } else {
            $message = !empty($skipMessage) ? ' (' . $skipMessage . ')' : '';

            // Compile row:
            $content .= '
                <tr>
                    <td>' . $pageTitleAndIcon . '</td>
                    <td colspan="6"><em>No entries</em>' . $message . '</td>
                </tr>';
        }

        return $content;
    }
1688
1689
    /*****************************
1690
     *
1691
     * CLI functions
1692
     *
1693
     *****************************/
1694
1695
    /**
     * Running the functionality of the CLI (crawling URLs from queue)
     *
     * Runs the CLI hooks, optionally purges old executed entries, reserves up to
     * $countInARun due queue entries for this process and executes them one by one.
     *
     * @param int $countInARun Maximum number of queue entries to pick up in this run
     * @param int $sleepTime Microseconds to sleep after each processed entry
     * @param int $sleepAfterFinish Seconds to sleep after all entries were processed
     * @return int Bitmask built from the CLI_STATUS_* constants and the readUrl() results
     */
    public function CLI_run($countInARun, $sleepTime, $sleepAfterFinish)
    {
        $result = 0;
        $counter = 0;

        // First, run hooks:
        $this->CLI_runHooks();

        // Clean up the queue
        if ((int)$this->extensionSettings['purgeQueueDays'] > 0) {
            $purgeDate = $this->getCurrentTime() - 24 * 60 * 60 * (int)$this->extensionSettings['purgeQueueDays'];

            $queryBuilderDelete = GeneralUtility::makeInstance(ConnectionPool::class)->getQueryBuilderForTable($this->tableName);
            $del = $queryBuilderDelete
                ->delete($this->tableName)
                ->where(
                    'exec_time != 0 AND exec_time < ' . $purgeDate
                )->execute();

            if (false === $del) {
                $this->logger->info(
                    'Records could not be deleted.'
                );
            }
        }

        // Select entries:
        //TODO Shouldn't this reside within the transaction?
        $queryBuilderSelect = GeneralUtility::makeInstance(ConnectionPool::class)->getQueryBuilderForTable($this->tableName);
        $rows = $queryBuilderSelect
            ->select('qid', 'scheduled')
            ->from($this->tableName)
            ->where(
                $queryBuilderSelect->expr()->eq('exec_time', 0),
                $queryBuilderSelect->expr()->eq('process_scheduled', 0),
                $queryBuilderSelect->expr()->lte('scheduled', $this->getCurrentTime())
            )
            ->orderBy('scheduled')
            ->addOrderBy('qid')
            ->setMaxResults($countInARun)
            ->execute()
            ->fetchAll();

        if (!empty($rows)) {
            $quidList = [];

            foreach ($rows as $r) {
                $quidList[] = $r['qid'];
            }

            $processId = $this->CLI_buildProcessId();

            // Reserve queue entries for this process.
            //TODO make sure we're not taking assigned queue-entries (no transaction is used here)

            // save the number of assigned queue entries to determine how many have been processed later
            $queryBuilderUpdate = GeneralUtility::makeInstance(ConnectionPool::class)->getQueryBuilderForTable($this->tableName);
            $numberOfAffectedRows = (int)$queryBuilderUpdate
                ->update($this->tableName)
                ->where(
                    $queryBuilderUpdate->expr()->in('qid', $quidList)
                )
                ->set('process_scheduled', $this->getCurrentTime())
                ->set('process_id', $processId)
                ->execute();

            GeneralUtility::makeInstance(ConnectionPool::class)->getConnectionForTable('tx_crawler_process')
                ->update(
                    'tx_crawler_process',
                    ['assigned_items_count' => $numberOfAffectedRows],
                    ['process_id' => $processId]
                );

            // Fewer updated rows than selected ones means the reservation did not fully succeed.
            if ($numberOfAffectedRows !== count($quidList)) {
                $this->CLI_debug("Nothing processed due to multi-process collision (" . $this->CLI_buildProcessId() . ")");
                return ($result | self::CLI_STATUS_ABORTED);
            }

            foreach ($rows as $r) {
                $result |= $this->readUrl($r['qid']);

                $counter++;
                usleep((int)$sleepTime); // Just to relax the system

                // if during the start and the current read url the cli has been disable we need to return from the function
                // mark the process NOT as ended.
                if ($this->getDisabled()) {
                    return ($result | self::CLI_STATUS_ABORTED);
                }

                if (!$this->processRepository->isProcessActive($this->CLI_buildProcessId())) {
                    $this->CLI_debug("conflict / timeout (" . $this->CLI_buildProcessId() . ")");

                    //TODO might need an additional returncode
                    $result |= self::CLI_STATUS_ABORTED;
                    break; //possible timeout
                }
            }

            sleep((int)$sleepAfterFinish);

            $msg = 'Rows: ' . $counter;
            $this->CLI_debug($msg . " (" . $this->CLI_buildProcessId() . ")");
        } else {
            $this->CLI_debug("Nothing within queue which needs to be processed (" . $this->CLI_buildProcessId() . ")");
        }

        if ($counter > 0) {
            $result |= self::CLI_STATUS_PROCESSED;
        }

        return $result;
    }
1821
1822
    /**
     * Instantiates every class registered under
     * $GLOBALS['TYPO3_CONF_VARS']['EXTCONF']['crawler']['cli_hooks'] and calls its
     * crawler_init() method with this controller.
     *
     * @return void
     */
    public function CLI_runHooks(): void
    {
        $registeredHooks = $GLOBALS['TYPO3_CONF_VARS']['EXTCONF']['crawler']['cli_hooks'] ?? [];

        foreach ($registeredHooks as $hookReference) {
            $hook = GeneralUtility::makeInstance($hookReference);
            if (!is_object($hook)) {
                continue;
            }
            $hook->crawler_init($this);
        }
    }
1836
1837
    /**
     * Try to acquire a new process with the given id
     * also performs some auto-cleanup for orphan processes
     * @todo preemption might not be the most elegant way to clean up
     *
     * Active processes whose ttl lies in the past are treated as orphans and are
     * released at the end; only the remaining active ones count against the
     * configured process limit.
     *
     * @param string $id identification string for the process
     * @return boolean TRUE if the process was registered, FALSE if the limit is reached or no system pid is available
     */
    public function CLI_checkAndAcquireNewProcess($id)
    {
        $systemProcessId = getmypid();
        if ($systemProcessId === false || $systemProcessId < 1) {
            return false;
        }

        $ret = true;
        $processCount = 0;
        $orphanProcesses = [];
        $processLimit = (int)$this->extensionSettings['processLimit'];

        // The lookup targets the process table, so the builder is requested for that table.
        $queryBuilder = GeneralUtility::makeInstance(ConnectionPool::class)->getQueryBuilderForTable('tx_crawler_process');
        $statement = $queryBuilder
            ->select('process_id', 'ttl')
            ->from('tx_crawler_process')
            ->where(
                'active = 1 AND deleted = 0'
            )
            ->execute();

        $currentTime = $this->getCurrentTime();

        while ($row = $statement->fetch()) {
            if ($row['ttl'] < $currentTime) {
                $orphanProcesses[] = $row['process_id'];
            } else {
                $processCount++;
            }
        }

        // if there are less than allowed active processes then add a new one
        if ($processCount < $processLimit) {
            $this->CLI_debug("add process " . $this->CLI_buildProcessId() . " (" . ($processCount + 1) . "/" . $processLimit . ")");

            GeneralUtility::makeInstance(ConnectionPool::class)->getConnectionForTable('tx_crawler_process')->insert(
                'tx_crawler_process',
                [
                    'process_id' => $id,
                    'active' => 1,
                    'ttl' => $currentTime + (int)$this->extensionSettings['processMaxRunTime'],
                    'system_process_id' => $systemProcessId,
                ]
            );
        } else {
            $this->CLI_debug("Processlimit reached (" . ($processCount) . "/" . $processLimit . ")");
            $ret = false;
        }

        $this->processRepository->deleteProcessesMarkedAsDeleted();
        $this->CLI_releaseProcesses($orphanProcesses, true); // maybe this should be somehow included into the current lock

        return $ret;
    }
1901
1902
    /**
     * Release a process and the required resources
     *
     * Runs four UPDATE statements in order: detach queue entries from inactive
     * processes, reset the system pid of inactive processes that still own
     * unexecuted queue entries, deactivate the requested processes that own no
     * unexecuted entries, and finally detach the unexecuted entries of the
     * requested processes.
     *
     * @param  mixed    $releaseIds   string with a single process-id or array with multiple process-ids
     * @param  boolean  $withinLock   show whether the DB-actions are included within an existing lock
     *                                (currently without effect: no transaction is opened here)
     * @return boolean  FALSE if there was nothing to release, TRUE otherwise
     */
    public function CLI_releaseProcesses($releaseIds, $withinLock = false)
    {
        $connectionPool = GeneralUtility::makeInstance(ConnectionPool::class);

        $releaseIds = is_array($releaseIds) ? $releaseIds : [$releaseIds];
        if ($releaseIds === []) {
            return false;   //nothing to release
        }

        // some kind of 2nd chance algo - this way you need at least 2 processes to have a real cleanup
        // this ensures that a single process can't mess up the entire process table

        // detach all queue entries which belong to processes that are not active
        $detachFromInactive = $connectionPool->getQueryBuilderForTable($this->tableName);
        $detachFromInactive
            ->update($this->tableName, 'q')
            ->where(
                'q.process_id IN(SELECT p.process_id FROM tx_crawler_process as p WHERE p.active = 0)'
            )
            ->set('q.process_scheduled', 0)
            ->set('q.process_id', '')
            ->execute();

        // FIXME: Not entirely sure that this is equivalent to the pre-Doctrine version
        $resetSystemProcessId = $connectionPool->getQueryBuilderForTable($this->tableName);
        $resetSystemProcessId
            ->update('tx_crawler_process')
            ->where(
                $resetSystemProcessId->expr()->eq('active', 0),
                'process_id IN(SELECT q.process_id FROM tx_crawler_queue as q WHERE q.exec_time = 0)'
            )
            ->set('system_process_id', 0)
            ->execute();

        // mark all requested processes as non-active
        // NOTE(review): system_process_id is reset here as well because the statement used to be
        // built on a shared query builder whose SET part was not cleared - confirm this is intended.
        $deactivateRequested = $connectionPool->getQueryBuilderForTable($this->tableName);
        $deactivateRequested
            ->update('tx_crawler_process')
            ->where(
                'NOT EXISTS (
                SELECT * FROM tx_crawler_queue
                    WHERE tx_crawler_queue.process_id = tx_crawler_process.process_id
                    AND tx_crawler_queue.exec_time = 0
                )',
                $deactivateRequested->expr()->in('process_id', $deactivateRequested->createNamedParameter($releaseIds, Connection::PARAM_STR_ARRAY)),
                $deactivateRequested->expr()->eq('deleted', 0)
            )
            ->set('system_process_id', 0)
            ->set('active', 0)
            ->execute();

        // detach the not yet executed queue entries of the requested processes
        $detachRequested = $connectionPool->getQueryBuilderForTable($this->tableName);
        $detachRequested
            ->update($this->tableName)
            ->where(
                $detachRequested->expr()->eq('exec_time', 0),
                $detachRequested->expr()->in('process_id', $detachRequested->createNamedParameter($releaseIds, Connection::PARAM_STR_ARRAY))
            )
            ->set('process_scheduled', 0)
            ->set('process_id', '')
            ->execute();

        return true;
    }
1996
1997
    /**
     * Returns the id of the current process, generating it on first use from
     * the current microtime and keeping it on the instance afterwards.
     *
     * @return string  the ID
     */
    public function CLI_buildProcessId()
    {
        if ($this->processID) {
            return $this->processID;
        }

        $this->processID = GeneralUtility::shortMD5(microtime(true));

        return $this->processID;
    }
2009
2010
    /**
     * Writes a message followed by a newline to the output and flushes it,
     * but only when the "processDebug" extension setting is enabled.
     *
     * @param  string $msg  the message
     */
    public function CLI_debug($msg): void
    {
        $debugEnabled = (int)$this->extensionSettings['processDebug'];
        if (!$debugEnabled) {
            return;
        }

        echo $msg . "\n";
        flush();
    }
2022
2023
    /**
     * Cleans up entries that stayed for too long in the queue. These are:
     * - processed entries older than the "cleanUpProcessedAge" setting (in days)
     * - scheduled entries older than the "cleanUpScheduledAge" setting (in days)
     *
     * @return void
     */
    public function cleanUpOldQueueEntries(): void
    {
        $secondsPerDay = 86400; // 24*60*60 Seconds in 24 hours

        // Settings arrive as strings and may be fractional days; reduce them to whole
        // seconds so only integers end up in the SQL condition.
        $processedAgeInSeconds = (int)((float)$this->extensionSettings['cleanUpProcessedAge'] * $secondsPerDay);
        $scheduledAgeInSeconds = (int)((float)$this->extensionSettings['cleanUpScheduledAge'] * $secondsPerDay);

        // Same time source as the other queue operations in this class.
        $now = (int)$this->getCurrentTime();
        $condition = '(exec_time<>0 AND exec_time<' . ($now - $processedAgeInSeconds) . ') OR scheduled<=' . ($now - $scheduledAgeInSeconds);
        $this->flushQueue($condition);
    }
2039
2040
    /**
     * Builds an md5 hash over the serialized configuration array, ignoring
     * its "paramExpanded" and "URLs" entries.
     *
     * @param array $configuration
     *
     * @return string
     */
    protected function getConfigurationHash(array $configuration)
    {
        // The array is passed by value, so the caller's configuration stays untouched.
        unset($configuration['paramExpanded'], $configuration['URLs']);

        $serializedConfiguration = serialize($configuration);

        return md5($serializedConfiguration);
    }
2053
2054
    /**
     * Build a URL from a Page and the Query String. If the page has a Site configuration, it can be built by using
     * the Site instance.
     *
     * With a Site, the "id" parameter is dropped, "L" is handed to the router as "_language" and host, scheme
     * and port are taken from the alternative base URL if one is given. Without a Site, an index.php URL with
     * an appended cHash is built on top of the alternative base URL or TYPO3_SITE_URL.
     *
     * @param int $pageId
     * @param string $queryString
     * @param string|null $alternativeBaseUrl
     * @param int $httpsOrHttp see tx_crawler_configuration.force_ssl (-1 forces http, 1 forces https, anything else keeps the scheme)
     * @return UriInterface
     * @throws \TYPO3\CMS\Core\Exception\SiteNotFoundException
     * @throws \TYPO3\CMS\Core\Routing\InvalidRouteArgumentsException
     */
    protected function getUrlFromPageAndQueryParameters(int $pageId, string $queryString, ?string $alternativeBaseUrl, int $httpsOrHttp): UriInterface
    {
        $site = GeneralUtility::makeInstance(SiteMatcher::class)->matchByPageId($pageId);
        if ($site instanceof Site) {
            $queryString = ltrim($queryString, '?&');
            $queryParts = [];
            parse_str($queryString, $queryParts);
            unset($queryParts['id']);
            // workaround as long as we don't have native language support in crawler configurations
            if (isset($queryParts['L'])) {
                $queryParts['_language'] = $queryParts['L'];
                unset($queryParts['L']);
            }
            $url = $site->getRouter()->generateUri($pageId, $queryParts);
            if (!empty($alternativeBaseUrl)) {
                $alternativeBaseUri = new Uri($alternativeBaseUrl);
                $url = $url->withHost($alternativeBaseUri->getHost());
                $url = $url->withScheme($alternativeBaseUri->getScheme());
                $url = $url->withPort($alternativeBaseUri->getPort());
            }
        } else {
            // Technically this is not possible with site handling, but kept for backwards-compatibility reasons
            // Once EXT:crawler is v10-only compatible, this should be removed completely
            $baseUrl = ($alternativeBaseUrl ?: GeneralUtility::getIndpEnv('TYPO3_SITE_URL'));
            $cacheHashCalculator = GeneralUtility::makeInstance(CacheHashCalculator::class);
            $queryString .= '&cHash=' . $cacheHashCalculator->generateForParameters($queryString);
            $url = rtrim($baseUrl, '/') . '/index.php' . $queryString;
            $url = new Uri($url);
        }

        if ($httpsOrHttp === -1) {
            $url = $url->withScheme('http');
        } elseif ($httpsOrHttp === 1) {
            $url = $url->withScheme('https');
        }

        return $url;
    }
2107
}
2108