Passed
Push — typo3v9 ( 2404ee...b9b5fa )
by Tomas Norre
05:51
created

CrawlerController::drawURLs_addRowsForPage()   C

Complexity

Conditions 12
Paths 6

Size

Total Lines 105
Code Lines 62

Duplication

Lines 0
Ratio 0 %

Code Coverage

Tests 0
CRAP Score 156

Importance

Changes 1
Bugs 0 Features 0
Metric Value
cc 12
eloc 62
c 1
b 0
f 0
nc 6
nop 2
dl 0
loc 105
ccs 0
cts 56
cp 0
crap 156
rs 6.4024

How to fix   Long Method    Complexity   

Long Method

Small methods make your code easier to understand, in particular if combined with a good name. Besides, if your method is small, finding a good name is usually much easier.

For example, if you find yourself adding comments to a method's body, this is usually a good sign to extract the commented part to a new method, and use the comment as a starting point when coming up with a good name for this new method.

Commonly applied refactorings include:

1
<?php
2
3
declare(strict_types=1);
4
5
namespace AOE\Crawler\Controller;
6
7
/***************************************************************
8
 *  Copyright notice
9
 *
10
 *  (c) 2019 AOE GmbH <[email protected]>
11
 *
12
 *  All rights reserved
13
 *
14
 *  This script is part of the TYPO3 project. The TYPO3 project is
15
 *  free software; you can redistribute it and/or modify
16
 *  it under the terms of the GNU General Public License as published by
17
 *  the Free Software Foundation; either version 3 of the License, or
18
 *  (at your option) any later version.
19
 *
20
 *  The GNU General Public License can be found at
21
 *  http://www.gnu.org/copyleft/gpl.html.
22
 *
23
 *  This script is distributed in the hope that it will be useful,
24
 *  but WITHOUT ANY WARRANTY; without even the implied warranty of
25
 *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
26
 *  GNU General Public License for more details.
27
 *
28
 *  This copyright notice MUST APPEAR in all copies of the script!
29
 ***************************************************************/
30
31
use AOE\Crawler\Configuration\ExtensionConfigurationProvider;
32
use AOE\Crawler\Domain\Repository\ConfigurationRepository;
33
use AOE\Crawler\Domain\Repository\ProcessRepository;
34
use AOE\Crawler\Domain\Repository\QueueRepository;
35
use AOE\Crawler\Event\EventDispatcher;
36
use AOE\Crawler\QueueExecutor;
37
use AOE\Crawler\Utility\SignalSlotUtility;
38
use Psr\Http\Message\UriInterface;
39
use Psr\Log\LoggerAwareInterface;
40
use Psr\Log\LoggerAwareTrait;
41
use TYPO3\CMS\Backend\Tree\View\PageTreeView;
42
use TYPO3\CMS\Backend\Utility\BackendUtility;
43
use TYPO3\CMS\Core\Authentication\BackendUserAuthentication;
44
use TYPO3\CMS\Core\Core\Bootstrap;
45
use TYPO3\CMS\Core\Core\Environment;
46
use TYPO3\CMS\Core\Database\Connection;
47
use TYPO3\CMS\Core\Database\ConnectionPool;
48
use TYPO3\CMS\Core\Database\Query\Restriction\DeletedRestriction;
49
use TYPO3\CMS\Core\Http\Uri;
50
use TYPO3\CMS\Core\Imaging\Icon;
51
use TYPO3\CMS\Core\Imaging\IconFactory;
52
use TYPO3\CMS\Core\Routing\SiteMatcher;
53
use TYPO3\CMS\Core\Site\Entity\Site;
54
use TYPO3\CMS\Core\Type\Bitmask\Permission;
55
use TYPO3\CMS\Core\TypoScript\Parser\TypoScriptParser;
56
use TYPO3\CMS\Core\Utility\DebugUtility;
57
use TYPO3\CMS\Core\Utility\GeneralUtility;
58
use TYPO3\CMS\Core\Utility\MathUtility;
59
use TYPO3\CMS\Extbase\Object\ObjectManager;
60
use TYPO3\CMS\Frontend\Page\CacheHashCalculator;
61
use TYPO3\CMS\Frontend\Page\PageRepository;
62
63
/**
64
 * Class CrawlerController
65
 *
66
 * @package AOE\Crawler\Controller
67
 */
68
class CrawlerController implements LoggerAwareInterface
69
{
70
    use LoggerAwareTrait;
71
72
    public const CLI_STATUS_NOTHING_PROCCESSED = 0;
73
    public const CLI_STATUS_REMAIN = 1; //queue not empty
74
    public const CLI_STATUS_PROCESSED = 2; //(some) queue items where processed
75
    public const CLI_STATUS_ABORTED = 4; //instance didn't finish
76
    public const CLI_STATUS_POLLABLE_PROCESSED = 8;
77
78
    /**
79
     * @var integer
80
     */
81
    public $setID = 0;
82
83
    /**
84
     * @var string
85
     */
86
    public $processID = '';
87
88
    /**
89
     * @var array
90
     */
91
    public $duplicateTrack = [];
92
93
    /**
94
     * @var array
95
     */
96
    public $downloadUrls = [];
97
98
    /**
99
     * @var array
100
     */
101
    public $incomingProcInstructions = [];
102
103
    /**
104
     * @var array
105
     */
106
    public $incomingConfigurationSelection = [];
107
108
    /**
109
     * @var bool
110
     */
111
    public $registerQueueEntriesInternallyOnly = false;
112
113
    /**
114
     * @var array
115
     */
116
    public $queueEntries = [];
117
118
    /**
119
     * @var array
120
     */
121
    public $urlList = [];
122
123
    /**
124
     * @var array
125
     */
126
    public $extensionSettings = [];
127
128
    /**
129
     * Mount Point
130
     *
131
     * @var boolean
132
     */
133
    public $MP = false;
134
135
    /**
136
     * @var string
137
     */
138
    protected $processFilename;
139
140
    /**
141
     * Holds the internal access mode; can be 'gui', 'cli' or 'cli_im'
142
     *
143
     * @var string
144
     */
145
    protected $accessMode;
146
147
    /**
148
     * @var BackendUserAuthentication|null
149
     */
150
    private $backendUser;
151
152
    /**
153
     * @var integer
154
     */
155
    private $scheduledTime = 0;
156
157
    /**
158
     * @var integer
159
     */
160
    private $reqMinute = 0;
161
162
    /**
163
     * @var bool
164
     */
165
    private $submitCrawlUrls = false;
166
167
    /**
168
     * @var bool
169
     */
170
    private $downloadCrawlUrls = false;
171
172
    /**
173
     * @var QueueRepository
174
     */
175
    protected $queueRepository;
176
177
    /**
178
     * @var ProcessRepository
179
     */
180
    protected $processRepository;
181
182
    /**
183
     * @var ConfigurationRepository
184
     */
185
    protected $configurationRepository;
186
187
    /**
188
     * @var string
189
     */
190
    protected $tableName = 'tx_crawler_queue';
191
192
    /**
193
     * @var QueueExecutor
194
     */
195
    protected $queueExecutor;
196
197
    /**
198
     * @var int
199
     */
200
    protected $maximumUrlsToCompile = 10000;
201
202
    /**
203
     * @var IconFactory
204
     */
205
    protected $iconFactory;
206
207
    /**
     * Returns the current access mode; can be 'gui', 'cli' or 'cli_im'.
     *
     * @return string
     */
    public function getAccessMode()
    {
        return $this->accessMode;
    }
216
217
    /**
     * Stores the access mode on this controller.
     *
     * @param string $accessMode
     */
    public function setAccessMode($accessMode): void
    {
        $this->accessMode = $accessMode;
    }
224
225
    /**
     * Set disabled status to prevent processes from being processed.
     *
     * Disabling writes an empty file to the process filename; enabling removes
     * that file again if it exists.
     *
     * @param  bool $disabled (optional, defaults to true)
     * @return void
     */
    public function setDisabled($disabled = true): void
    {
        if ($disabled) {
            GeneralUtility::writeFile($this->processFilename, '');
            return;
        }

        if (is_file($this->processFilename)) {
            unlink($this->processFilename);
        }
    }
241
242
    /**
     * Get disable status, i.e. whether the process file currently exists.
     *
     * @return bool true if disabled
     */
    public function getDisabled()
    {
        $processFileExists = is_file($this->processFilename);

        return $processFileExists;
    }
251
252
    /**
     * Sets the file whose existence marks the crawler as disabled.
     *
     * @param string $filenameWithPath
     *
     * @return void
     */
    public function setProcessFilename($filenameWithPath): void
    {
        $this->processFilename = $filenameWithPath;
    }
261
262
    /**
     * Returns the file whose existence marks the crawler as disabled.
     *
     * @return string
     */
    public function getProcessFilename()
    {
        return $this->processFilename;
    }
269
270
    /************************************
271
     *
272
     * Getting URLs based on Page TSconfig
273
     *
274
     ************************************/
275
276 39
    /**
     * Resolves the repositories, queue executor and icon factory, sets the
     * default process file and loads the extension configuration, applying
     * defaults to its numeric settings.
     */
    public function __construct()
    {
        $objectManager = GeneralUtility::makeInstance(ObjectManager::class);
        $this->queueRepository = $objectManager->get(QueueRepository::class);
        $this->processRepository = $objectManager->get(ProcessRepository::class);
        $this->configurationRepository = $objectManager->get(ConfigurationRepository::class);
        $this->queueExecutor = $objectManager->get(QueueExecutor::class);
        $this->iconFactory = GeneralUtility::makeInstance(IconFactory::class);

        $this->processFilename = Environment::getVarPath() . '/lock/tx_crawler.proc';

        /** @var ExtensionConfigurationProvider $configurationProvider */
        $configurationProvider = GeneralUtility::makeInstance(ExtensionConfigurationProvider::class);
        $settings = $configurationProvider->getExtensionConfiguration();
        $this->extensionSettings = is_array($settings) ? $settings : [];

        // Set defaults. Individual settings may be absent, so read them with a
        // fallback instead of triggering "undefined index" notices.
        if (MathUtility::convertToPositiveInteger($this->extensionSettings['countInARun'] ?? 0) === 0) {
            $this->extensionSettings['countInARun'] = 100;
        }

        $this->extensionSettings['processLimit'] = MathUtility::forceIntegerInRange(
            $this->extensionSettings['processLimit'] ?? 1,
            1,
            99,
            1
        );
        $this->maximumUrlsToCompile = MathUtility::forceIntegerInRange(
            $this->extensionSettings['maxCompileUrls'] ?? 10000,
            1,
            1000000000,
            10000
        );
    }
300
301
    /**
     * Returns the backend user, caching $GLOBALS['BE_USER'] on first access.
     *
     * @return BackendUserAuthentication
     */
    private function getBackendUser()
    {
        // Make sure the _cli_ user is loaded
        Bootstrap::initializeBackendAuthentication();

        $this->backendUser = $this->backendUser ?? $GLOBALS['BE_USER'];

        return $this->backendUser;
    }
313
314
    /**
     * Replaces the extension settings (unserialized pendant of
     * $TYPO3_CONF_VARS['EXT']['extConf']['crawler']) held by this controller.
     *
     * @param array $extensionSettings
     * @return void
     */
    public function setExtensionSettings(array $extensionSettings): void
    {
        $this->extensionSettings = $extensionSettings;
    }
324
325
    /**
     * Check if the given page should be crawled
     *
     * The checks run in order: hidden page, disallowed doktype, doktype excluded
     * through configuration, and finally the "pageVeto" hooks. The first check
     * that matches decides the result.
     *
     * @param array $pageRow
     * @return false|string false if the page should be crawled (not excluded), otherwise the skip message
     */
    public function checkIfPageShouldBeSkipped(array $pageRow)
    {
        // Hidden pages are skipped unless crawling them is enabled in the settings
        if (!$this->extensionSettings['crawlHiddenPages'] && $pageRow['hidden']) {
            return 'Because page is hidden';
        }

        if (GeneralUtility::inList('3,4', $pageRow['doktype']) || $pageRow['doktype'] >= 199) {
            return 'Because doktype is not allowed';
        }

        $excludedDoktypes = $GLOBALS['TYPO3_CONF_VARS']['EXTCONF']['crawler']['excludeDoktype'] ?? [];
        foreach ($excludedDoktypes as $key => $doktypeList) {
            if (GeneralUtility::inList($doktypeList, $pageRow['doktype'])) {
                return 'Doktype was excluded by "' . $key . '"';
            }
        }

        // veto hook
        $vetoHooks = $GLOBALS['TYPO3_CONF_VARS']['EXTCONF']['crawler']['pageVeto'] ?? [];
        foreach ($vetoHooks as $key => $func) {
            $params = [
                'pageRow' => $pageRow,
            ];
            // expects "false" if page is ok and "true" or a skipMessage if this page should _not_ be crawled
            $veto = GeneralUtility::callUserFunction($func, $params, $this);
            if ($veto === false) {
                continue;
            }

            // no need to execute other hooks if a previous one return a veto
            return is_string($veto) ? $veto : 'Veto from hook "' . htmlspecialchars($key) . '"';
        }

        return false;
    }
384
385
    /**
     * Wrapper method for getUrlsForPageId()
     * It returns an array of configurations and no urls!
     *
     * When the page is to be skipped, an empty array is returned and the reason
     * is written to $skipMessage; otherwise $skipMessage is reset to ''.
     *
     * @param array $pageRow Page record with at least dok-type and uid columns.
     * @param string $skipMessage
     * @return array
     * @see getUrlsForPageId()
     */
    public function getUrlsForPageRow(array $pageRow, &$skipMessage = '')
    {
        $skipReason = $this->checkIfPageShouldBeSkipped($pageRow);

        if ($skipReason !== false) {
            $skipMessage = $skipReason;

            return [];
        }

        $configurations = $this->getUrlsForPageId($pageRow['uid']);
        $skipMessage = '';

        return $configurations;
    }
408
409
    /**
     * This method is used to count if there are ANY unprocessed queue entries
     * of a given page_id and the configuration which matches a given hash.
     * If there is none, we can skip an inner detail check
     *
     * @param  int $uid
     * @param  string $configurationHash
     * @return boolean true if no queue entry with exec_time = 0 matches
     */
    protected function noUnprocessedQueueEntriesForPageWithConfigurationHashExist($uid, $configurationHash)
    {
        $queryBuilder = GeneralUtility::makeInstance(ConnectionPool::class)->getQueryBuilderForTable($this->tableName);
        $expr = $queryBuilder->expr();

        $unprocessedCount = $queryBuilder
            ->count('*')
            ->from($this->tableName)
            ->where(
                $expr->eq('page_id', (int)$uid),
                $expr->eq('configuration_hash', $queryBuilder->createNamedParameter($configurationHash)),
                $expr->eq('exec_time', 0)
            )
            ->execute()
            ->fetchColumn();

        return !$unprocessedCount;
    }
440
441
    /**
     * Creates a list of URLs from input array (and submits them to queue if asked for)
     * See Web > Info module script + "indexed_search"'s crawler hook-client using this!
     *
     * @param array $vv Information about URLs from pageRow to crawl.
     * @param array $pageRow Page row
     * @param int $scheduledTime Unix time to schedule indexing to, typically time()
     * @param int $reqMinute Number of requests per minute (creates the interleave between requests);
     *                       a value of 0 or less disables the interleave
     * @param bool $submitCrawlUrls If set, submits the URLs to queue
     * @param bool $downloadCrawlUrls If set (and $submitCrawlUrls is false) will fill $downloadUrls with entries
     * @param array $duplicateTrack Passed by reference; holds one key per URL so duplicates are not crawled twice
     * @param array $downloadUrls Passed by reference; filled with URLs for download if the flag is set
     * @param array $incomingProcInstructions Array of processing instructions
     * @return string List of URLs (meant for display in backend module)
     */
    public function urlListFromUrlArray(
        array $vv,
        array $pageRow,
        $scheduledTime,
        $reqMinute,
        $submitCrawlUrls,
        $downloadCrawlUrls,
        array &$duplicateTrack,
        array &$downloadUrls,
        array $incomingProcInstructions
    ) {
        if (!is_array($vv['URLs'] ?? null)) {
            return 'ERROR - no URL generated';
        }
        $urlLog = [];
        $pageId = (int)$pageRow['uid'];
        $configurationHash = $this->getConfigurationHash($vv);
        $skipInnerCheck = $this->noUnprocessedQueueEntriesForPageWithConfigurationHashExist($pageId, $configurationHash);

        // These sub-configuration keys are optional; fall back to an empty string
        $procInstrFilter = $vv['subCfg']['procInstrFilter'] ?? '';
        $userGroups = $vv['subCfg']['userGroups'] ?? '';

        // Seconds between two consecutive requests. A non-positive rate would
        // divide by zero, so no interleave is applied in that case.
        $secondsPerRequest = $reqMinute > 0 ? 60 / $reqMinute : 0;

        foreach ($vv['URLs'] as $urlQuery) {
            if (!$this->drawURLs_PIfilter($procInstrFilter, $incomingProcInstructions)) {
                continue;
            }
            $url = (string)$this->getUrlFromPageAndQueryParameters(
                $pageId,
                $urlQuery,
                $vv['subCfg']['baseUrl'] ?? null,
                $vv['subCfg']['force_ssl'] ?? 0
            );

            // Create key by which to determine unique-ness:
            $uKey = $url . '|' . $userGroups . '|' . $procInstrFilter;

            if (isset($duplicateTrack[$uKey])) {
                // if the url key is registered just display it and do not resubmit it
                $urlLog[] = '<em><span class="text-muted">' . htmlspecialchars($url) . '</span></em>';
            } else {
                // Scheduled time, rounded down to the full minute. round() and floor()
                // return floats, so cast back to an integer timestamp.
                $schTime = $scheduledTime + round(count($duplicateTrack) * $secondsPerRequest);
                $schTime = (int)(floor($schTime / 60) * 60);
                $formattedDate = BackendUtility::datetime($schTime);
                $this->urlList[] = '[' . $formattedDate . '] ' . $url;
                $urlList = '[' . $formattedDate . '] ' . htmlspecialchars($url);

                // Submit for crawling!
                if ($submitCrawlUrls) {
                    $added = $this->addUrl(
                        $pageId,
                        $url,
                        $vv['subCfg'],
                        $scheduledTime,
                        $configurationHash,
                        $skipInnerCheck
                    );
                    if ($added === false) {
                        $urlList .= ' (URL already existed)';
                    }
                } elseif ($downloadCrawlUrls) {
                    $downloadUrls[$url] = $url;
                }
                $urlLog[] = $urlList;
            }
            $duplicateTrack[$uKey] = true;
        }

        return implode('<br>', $urlLog);
    }
524
525
    /**
     * Returns true if input processing instruction is among registered ones.
     * An empty list of incoming processing instructions accepts everything.
     *
     * @param string $piString PI to test
     * @param array $incomingProcInstructions Processing instructions
     * @return boolean
     */
    public function drawURLs_PIfilter($piString, array $incomingProcInstructions)
    {
        if ($incomingProcInstructions === []) {
            return true;
        }

        foreach ($incomingProcInstructions as $procInstruction) {
            if (GeneralUtility::inList($piString, $procInstruction)) {
                return true;
            }
        }

        return false;
    }
545
546 3
    /**
     * Returns the page TSconfig for the given page, or for the mount point page
     * when $this->MP is set, after passing it through the registered
     * "getPageTSconfigForId" hooks.
     *
     * @param int $id Page ID
     * @return array
     */
    public function getPageTSconfigForId($id)
    {
        if (!$this->MP) {
            $pageTSconfig = BackendUtility::getPagesTSconfig($id);
        } else {
            // Only the part after the dash of the MP value is used as page id
            [, $mountPointId] = explode('-', (string)$this->MP);
            $pageTSconfig = BackendUtility::getPagesTSconfig((int)$mountPointId);
        }

        // Call a hook to alter configuration; the hook registry is optional
        $hooks = $GLOBALS['TYPO3_CONF_VARS']['EXTCONF']['crawler']['getPageTSconfigForId'] ?? null;
        if (is_array($hooks)) {
            $params = [
                'pageId' => $id,
                'pageTSConfig' => &$pageTSconfig,
            ];
            foreach ($hooks as $userFunc) {
                GeneralUtility::callUserFunction($userFunc, $params, $this);
            }
        }
        return $pageTSconfig;
    }
567
568
    /**
     * This methods returns an array of configurations.
     * And no urls!
     *
     * Configurations are collected from the page TSconfig "paramSets" first and
     * then from tx_crawler_configuration records up the rootline; a record never
     * overwrites a paramSet with the same name. Finally the "processUrls" hooks
     * may alter the result.
     *
     * @param integer $pageId Page ID
     * @return array
     */
    public function getUrlsForPageId($pageId)
    {
        // Get page TSconfig for page ID
        $pageTSconfig = $this->getPageTSconfigForId($pageId);

        $res = [];

        // Fetch Crawler Configuration from pageTSconfig
        $crawlerCfg = $pageTSconfig['tx_crawler.']['crawlerCfg.']['paramSets.'] ?? [];
        foreach ($crawlerCfg as $key => $values) {
            if (!is_array($values)) {
                continue;
            }
            $key = str_replace('.', '', $key);
            // Sub configuration for a single configuration string:
            $subCfg = (array)$crawlerCfg[$key . '.'];
            $subCfg['key'] = $key;

            // "procInstrFilter" and "pidsOnly" are optional. Normalise them to strings
            // first: string functions reject null under strict_types.
            $procInstrFilter = (string)($subCfg['procInstrFilter'] ?? '');
            if ($procInstrFilter !== '') {
                $subCfg['procInstrFilter'] = implode(',', GeneralUtility::trimExplode(',', $procInstrFilter));
            }
            $pidsOnly = (string)($subCfg['pidsOnly'] ?? '');
            $pidOnlyList = implode(',', GeneralUtility::trimExplode(',', $pidsOnly, true));

            // process configuration if it is not page-specific or if the specific page is the current page:
            if ($pidsOnly !== '' && !GeneralUtility::inList($pidOnlyList, $pageId)) {
                continue;
            }

            // Explode, process etc.:
            $res[$key] = [];
            $res[$key]['subCfg'] = $subCfg;
            $res[$key]['paramParsed'] = GeneralUtility::explodeUrl2Array($crawlerCfg[$key] ?? '');
            $res[$key]['paramExpanded'] = $this->expandParameters($res[$key]['paramParsed'], $pageId);
            $res[$key]['origin'] = 'pagets';

            // recognize MP value
            if (!$this->MP) {
                $res[$key]['URLs'] = $this->compileUrls($res[$key]['paramExpanded'], ['?id=' . $pageId]);
            } else {
                $res[$key]['URLs'] = $this->compileUrls($res[$key]['paramExpanded'], ['?id=' . $pageId . '&MP=' . $this->MP]);
            }
        }

        // Get configuration from tx_crawler_configuration records up the rootline
        $crawlerConfigurations = $this->configurationRepository->getCrawlerConfigurationRecordsFromRootLine($pageId);
        foreach ($crawlerConfigurations as $configurationRecord) {
            // check access to the configuration record
            $hasAccess = empty($configurationRecord['begroups'])
                || $this->getBackendUser()->isAdmin()
                || $this->hasGroupAccess($this->getBackendUser()->user['usergroup_cached_list'], $configurationRecord['begroups']);
            if (!$hasAccess) {
                continue;
            }

            $pidsOnly = (string)($configurationRecord['pidsonly'] ?? '');
            $pidOnlyList = implode(',', GeneralUtility::trimExplode(',', $pidsOnly, true));

            // process configuration if it is not page-specific or if the specific page is the current page:
            if ($pidsOnly !== '' && !GeneralUtility::inList($pidOnlyList, $pageId)) {
                continue;
            }

            $key = $configurationRecord['name'];

            // don't overwrite previously defined paramSets
            if (isset($res[$key])) {
                continue;
            }

            /* @var $TSparserObject \TYPO3\CMS\Core\TypoScript\Parser\TypoScriptParser */
            $TSparserObject = GeneralUtility::makeInstance(TypoScriptParser::class);
            $TSparserObject->parse($configurationRecord['processing_instruction_parameters_ts']);

            $subCfg = [
                'procInstrFilter' => $configurationRecord['processing_instruction_filter'],
                'procInstrParams.' => $TSparserObject->setup,
                'baseUrl' => $configurationRecord['base_url'],
                'force_ssl' => (int)$configurationRecord['force_ssl'],
                'userGroups' => $configurationRecord['fegroups'],
                'exclude' => $configurationRecord['exclude'],
                'key' => $key,
            ];

            // NOTE(review): comparison is deliberately left non-strict, $pageId may arrive as int or numeric string
            if (!in_array($pageId, $this->expandExcludeString($subCfg['exclude']))) {
                $res[$key] = [];
                $res[$key]['subCfg'] = $subCfg;
                $res[$key]['paramParsed'] = GeneralUtility::explodeUrl2Array($configurationRecord['configuration']);
                $res[$key]['paramExpanded'] = $this->expandParameters($res[$key]['paramParsed'], $pageId);
                $res[$key]['URLs'] = $this->compileUrls($res[$key]['paramExpanded'], ['?id=' . $pageId]);
                $res[$key]['origin'] = 'tx_crawler_configuration_' . $configurationRecord['uid'];
            }
        }

        foreach ($GLOBALS['TYPO3_CONF_VARS']['EXTCONF']['crawler']['processUrls'] ?? [] as $func) {
            $params = [
                'res' => &$res,
            ];
            GeneralUtility::callUserFunction($func, $params, $this);
        }
        return $res;
    }
667
668
    /**
     * Find all configurations of subpages of a page
     *
     * Collects the paramSet names from the page TSconfig of the root page and
     * the names of all tx_crawler_configuration records stored on the rootline
     * pages or on pages of the tree below the root page.
     *
     * @param int $rootid
     * @param $depth
     * @return array
     *
     * TODO: Write Functional Tests
     */
    public function getConfigurationsForBranch(int $rootid, $depth)
    {
        $configurationsForBranch = [];

        $pageTSconfig = $this->getPageTSconfigForId($rootid);
        $paramSets = $pageTSconfig['tx_crawler.']['crawlerCfg.']['paramSets.'] ?? [];
        foreach ($paramSets as $setName => $setConfiguration) {
            if (is_array($setConfiguration)) {
                // strip the trailing dot of TypoScript array keys
                $configurationsForBranch[] = substr($setName, -1) === '.' ? substr($setName, 0, -1) : $setName;
            }
        }

        $pids = [];
        foreach (BackendUtility::BEgetRootLine($rootid) as $rootLinePage) {
            $pids[] = $rootLinePage['uid'];
        }

        /* @var PageTreeView $tree */
        $tree = GeneralUtility::makeInstance(PageTreeView::class);
        $permsClause = $this->getBackendUser()->getPagePermsClause(Permission::PAGE_SHOW);
        $initClause = empty($permsClause) ? '' : ('AND ' . $permsClause);
        $tree->init($initClause);
        $tree->getTree($rootid, $depth, '');
        foreach ($tree->tree as $treeNode) {
            $pids[] = $treeNode['row']['uid'];
        }

        $queryBuilder = $this->getQueryBuilder('tx_crawler_configuration');
        $pidConstraint = $queryBuilder->expr()->in(
            'pid',
            $queryBuilder->createNamedParameter($pids, Connection::PARAM_INT_ARRAY)
        );
        $statement = $queryBuilder
            ->select('name')
            ->from('tx_crawler_configuration')
            ->where($pidConstraint)
            ->execute();

        while ($row = $statement->fetch()) {
            $configurationsForBranch[] = $row['name'];
        }

        return $configurationsForBranch;
    }
716
717
    /**
     * Get querybuilder for given table from the connection pool
     *
     * @param string $table
     * @return \TYPO3\CMS\Core\Database\Query\QueryBuilder
     */
    private function getQueryBuilder(string $table)
    {
        $connectionPool = GeneralUtility::makeInstance(ConnectionPool::class);

        return $connectionPool->getQueryBuilderForTable($table);
    }
727
728
    /**
     * Decides whether a user, identified by its group list, may access an item.
     * (e.g. the group list of the currently logged in user from $GLOBALS['TSFE']->gr_list)
     *
     * An empty access list grants access to everybody. Otherwise every group UID of
     * the user is looked up in the access list via GeneralUtility::inList(); the first
     * hit grants access.
     *
     * @see \TYPO3\CMS\Frontend\Page\PageRepository::getMultipleGroupsWhereClause()
     * @param  string $groupList    Comma-separated list of (fe_)group UIDs from a user
     * @param  string $accessList   Comma-separated list of (fe_)group UIDs of the item to access
     * @return bool                 TRUE if the access list is empty or contains at least one of the user's group UIDs
     */
    public function hasGroupAccess($groupList, $accessList)
    {
        $accessGranted = empty($accessList);

        if (!$accessGranted) {
            $userGroupUids = GeneralUtility::intExplode(',', $groupList);
            foreach ($userGroupUids as $userGroupUid) {
                if (GeneralUtility::inList($accessList, $userGroupUid)) {
                    $accessGranted = true;
                    break;
                }
            }
        }

        return $accessGranted;
    }
749
750
    /**
     * Will expand the parameters configuration to individual values. This follows a certain syntax of the value of each parameter.
     * Syntax of values:
     * - Basically: If the value is wrapped in [...] it will be expanded according to the following syntax, otherwise the value is taken literally
     * - Configuration is split by "|" and the parts are processed individually and finally added together
     * - For each configuration part:
     *         - "[int]-[int]" = Integer range, will be expanded to all values in between, values included, starting from low to high (max. 1000). Example "1-34" or "-40--30"
     *         - "_TABLE:[TCA table name];[_PID:[optional page id, default is current page]];[_ENABLELANG:1]" = Look up of table records from PID, filtering out deleted records. Example "_TABLE:tt_content; _PID:123"
     *           _ENABLELANG:1 picks only original records without their language overlays
     *           Further optional sub parameters read by the code:
     *             _RECURSIVE:[depth]  also looks up records on sub pages of the PID (page list from QueryGenerator::getTreeList())
     *             _PIDFIELD:[field]   field compared against the page id(s), default "pid"
     *             _FIELD:[field]      field whose values are returned, default "uid"; other fields must exist in the TCA columns of the table
     *             _WHERE:[sql]        additional condition, handed to the query builder as is
     *             _ADDTABLE:[table]   additional FROM part, handed to the query builder as is
     *         - Default: Literal value
     *
     * After every processed part the hook
     * $GLOBALS['TYPO3_CONF_VARS']['SC_OPTIONS']['crawler/class.tx_crawler_lib.php']['expandParameters']
     * is called (if registered) and may modify the parameter array by reference.
     *
     * NOTE(review): _WHERE and _ADDTABLE end up unescaped in the SQL statement; this
     * presumably relies on the configuration being written by trusted users only - confirm.
     *
     * @param array $paramArray Array with key (GET var name) and values (value of GET var which is configuration for expansion)
     * @param integer $pid Current page ID
     * @return array The input array, sorted by key, where every value has been replaced by the list of its expanded values
     *
     * TODO: Write Functional Tests
     */
    public function expandParameters($paramArray, $pid)
    {
        // Traverse parameter names:
        foreach ($paramArray as $p => $v) {
            $v = trim((string) $v);

            // Not wrapped in square brackets: the literal value is the only value.
            if (substr($v, 0, 1) !== '[' || substr($v, -1) !== ']') {
                $paramArray[$p] = [$v];
                continue;
            }

            // Strip the brackets and start with an empty value list for this parameter.
            $v = substr($v, 1, -1);
            $paramArray[$p] = [];

            // Explode parts and traverse them:
            foreach (explode('|', $v) as $pV) {
                $trimmedPart = trim($pV);

                if (preg_match('/^(-?[0-9]+)\s*-\s*(-?[0-9]+)$/', $trimmedPart, $reg)) {
                    // Integer range (fx. 1-34 or -40--30 // reads minus 40 to minus 30).
                    // Swap if first is larger than last:
                    if ($reg[1] > $reg[2]) {
                        [$reg[1], $reg[2]] = [$reg[2], $reg[1]];
                    }

                    // Traverse range, add values. The brake limits the size of the range!
                    // The loop deliberately starts with the matched string: the first value
                    // is added as matched, all following values are integers.
                    $runAwayBrake = 1000;
                    for ($a = $reg[1]; $a <= $reg[2]; $a++) {
                        $paramArray[$p][] = $a;
                        $runAwayBrake--;
                        if ($runAwayBrake <= 0) {
                            break;
                        }
                    }
                } elseif (substr($trimmedPart, 0, 7) === '_TABLE:') {
                    // Parse "key:value" sub parameters, separated by ";". A sub part
                    // without ":" yields a NULL value instead of an undefined offset.
                    $subpartParams = [];
                    foreach (GeneralUtility::trimExplode(';', $pV) as $spV) {
                        [$pKey, $pVal] = array_pad(GeneralUtility::trimExplode(':', $spV), 2, null);
                        $subpartParams[$pKey] = $pVal;
                    }
                    $tableName = $subpartParams['_TABLE'] ?? '';

                    // Only tables known to TCA are looked up:
                    if (isset($GLOBALS['TCA'][$tableName])) {
                        $lookUpPid = isset($subpartParams['_PID']) ? (int) $subpartParams['_PID'] : (int) $pid;
                        $recursiveDepth = isset($subpartParams['_RECURSIVE']) ? (int) $subpartParams['_RECURSIVE'] : 0;
                        $pidField = isset($subpartParams['_PIDFIELD']) ? trim($subpartParams['_PIDFIELD']) : 'pid';
                        $where = $subpartParams['_WHERE'] ?? '';
                        $addTable = $subpartParams['_ADDTABLE'] ?? '';
                        $fieldName = ($subpartParams['_FIELD'] ?? '') ?: 'uid';

                        // Any field other than "uid" has to be configured in TCA.
                        if ($fieldName === 'uid' || !empty($GLOBALS['TCA'][$tableName]['columns'][$fieldName])) {
                            $queryBuilder = $this->getQueryBuilder($tableName);

                            if ($recursiveDepth > 0) {
                                /** @var \TYPO3\CMS\Core\Database\QueryGenerator $queryGenerator */
                                $queryGenerator = GeneralUtility::makeInstance(\TYPO3\CMS\Core\Database\QueryGenerator::class);
                                $pidList = $queryGenerator->getTreeList($lookUpPid, $recursiveDepth, 0, 1);
                                $pidArray = GeneralUtility::intExplode(',', $pidList);
                            } else {
                                $pidArray = [(string) $lookUpPid];
                            }

                            // Only the "deleted" restriction stays active for the lookup.
                            $queryBuilder->getRestrictions()
                                ->removeAll()
                                ->add(GeneralUtility::makeInstance(DeletedRestriction::class));

                            $queryBuilder
                                ->select($fieldName)
                                ->from($tableName)
                                ->where(
                                    $queryBuilder->expr()->in($pidField, $queryBuilder->createNamedParameter($pidArray, Connection::PARAM_INT_ARRAY)),
                                    $where
                                );
                            if (!empty($addTable)) {
                                // TODO: Check if this works as intended!
                                $queryBuilder->add('from', $addTable);
                            }

                            // _ENABLELANG: keep only records whose translation pointer is <= 0.
                            $transOrigPointerField = $GLOBALS['TCA'][$tableName]['ctrl']['transOrigPointerField'] ?? '';
                            if (!empty($subpartParams['_ENABLELANG']) && $transOrigPointerField) {
                                $queryBuilder->andWhere(
                                    $queryBuilder->expr()->lte(
                                        $transOrigPointerField,
                                        0
                                    )
                                );
                            }

                            $statement = $queryBuilder->execute();

                            // Index by field value so that every value is added only once.
                            $rows = [];
                            while ($row = $statement->fetch()) {
                                $rows[$row[$fieldName]] = $row;
                            }
                            $paramArray[$p] = array_merge($paramArray[$p], array_keys($rows));
                        }
                    }
                } else {
                    // Just add value:
                    $paramArray[$p][] = $pV;
                }

                // Hook for processing own expandParameters place holder
                $expandHooks = $GLOBALS['TYPO3_CONF_VARS']['SC_OPTIONS']['crawler/class.tx_crawler_lib.php']['expandParameters'] ?? null;
                if (is_array($expandHooks)) {
                    $_params = [
                        'pObj' => &$this,
                        'paramArray' => &$paramArray,
                        'currentKey' => $p,
                        'currentValue' => $pV,
                        'pid' => $pid,
                    ];
                    foreach ($expandHooks as $_funcRef) {
                        GeneralUtility::callUserFunction($_funcRef, $_params, $this);
                    }
                }
            }

            // Make unique set of values and sort the parameter array by key:
            $paramArray[$p] = array_unique($paramArray[$p]);
            ksort($paramArray);
        }

        return $paramArray;
    }
900
901
    /**
     * Compiling URLs from parameter array (output of expandParameters())
     * The number of URLs will be the multiplication of the number of parameter values for each key
     *
     * The method works recursively: it takes the first parameter off $paramArray, appends
     * "&name=value" for each of its values to every URL in $urls and calls itself with the
     * remaining parameters. Empty values leave the URL unchanged.
     *
     * Compilation of a level stops as soon as the number of compiled URLs exceeds
     * $this->maximumUrlsToCompile, so a level holds at most one URL more than that limit.
     *
     * @param array $paramArray Output of expandParameters(): Array with keys (GET var names) and for each an array of values
     * @param array $urls URLs accumulated in this array (for recursion)
     * @return array
     */
    public function compileUrls($paramArray, array $urls)
    {
        if (empty($paramArray)) {
            return $urls;
        }

        // Shift the first parameter (name and value set) off the stack:
        reset($paramArray);
        $varName = (string) key($paramArray);
        $valueSet = array_shift($paramArray);

        // Traverse value set:
        $newUrls = [];
        foreach ($urls as $url) {
            foreach ($valueSet as $val) {
                $value = (string) $val;
                $newUrls[] = $value === ''
                    ? $url
                    : $url . '&' . rawurlencode($varName) . '=' . rawurlencode($value);

                // Leave BOTH loops: a plain "break" would only end the inner loop and
                // every further base URL would still add another entry.
                if (count($newUrls) > $this->maximumUrlsToCompile) {
                    break 2;
                }
            }
        }

        return $this->compileUrls($paramArray, $newUrls);
    }
932
933
    /************************************
934
     *
935
     * Crawler log
936
     *
937
     ************************************/
938
939
    /**
     * Return array of records from crawler queue for input page ID
     *
     * In flush mode nothing is selected; the matching queue entries are deleted
     * through flushQueue() and an empty array is returned.
     *
     * @param integer $id Page ID for which to look up log entries.
     * @param string $filter Filter: "pending" => all that is not yet run (exec_time = 0), "finished" => all complete ones (exec_time > 0), any other value => all entries
     * @param boolean $doFlush If TRUE, then entries selected are DELETED(!) instead of returned!
     * @param boolean $doFullFlush If TRUE (together with $doFlush) the flush is done with the condition "1=1" instead of the page/filter condition
     * @param integer $itemsPerPage Limit the amount of entries per page, default is 10; values <= 0 disable the limit
     * @return array
     */
    public function getLogEntriesForPageId($id, $filter = '', $doFlush = false, $doFullFlush = false, $itemsPerPage = 10)
    {
        $queryBuilder = $this->getQueryBuilder($this->tableName);
        $queryBuilder
            ->select('*')
            ->from($this->tableName)
            ->where(
                $queryBuilder->expr()->eq('page_id', $queryBuilder->createNamedParameter($id, \PDO::PARAM_INT))
            )
            ->orderBy('scheduled', 'DESC');

        // The flush needs the same filter as a stand-alone condition. It is collected in
        // an AND composite so that the individual parts are joined with "AND".
        $expressionBuilder = GeneralUtility::makeInstance(ConnectionPool::class)
            ->getConnectionForTable($this->tableName)
            ->getExpressionBuilder();
        $flushCondition = $expressionBuilder->andX();

        switch ($filter) {
            case 'pending':
                $queryBuilder->andWhere($queryBuilder->expr()->eq('exec_time', 0));
                $flushCondition->add($expressionBuilder->eq('exec_time', 0));
                break;
            case 'finished':
                $queryBuilder->andWhere($queryBuilder->expr()->gt('exec_time', 0));
                $flushCondition->add($expressionBuilder->gt('exec_time', 0));
                break;
        }

        // FIXME: Write unit test that ensures that the right records are deleted.
        if ($doFlush) {
            $flushCondition->add($expressionBuilder->eq('page_id', (int)$id));
            $this->flushQueue($doFullFlush ? '1=1' : (string) $flushCondition);
            return [];
        }

        if ($itemsPerPage > 0) {
            $queryBuilder->setMaxResults((int)$itemsPerPage);
        }

        return $queryBuilder->execute()->fetchAll();
    }
993
994
    /**
     * Return array of records from crawler queue for input set ID
     *
     * In flush mode nothing is selected; the matching queue entries are deleted
     * through flushQueue() and an empty array is returned.
     *
     * @param integer $set_id Set ID for which to look up log entries.
     * @param string $filter Filter: "pending" => all that is not yet run (exec_time = 0), "finished" => all complete ones (exec_time > 0), any other value => all entries
     * @param boolean $doFlush If TRUE, then entries selected are DELETED(!) instead of returned!
     * @param boolean $doFullFlush If TRUE (together with $doFlush) flushQueue() is called without a condition instead of the set/filter condition
     * @param integer $itemsPerPage Limit the amount of entries per page, default is 10; values <= 0 disable the limit
     * @return array
     */
    public function getLogEntriesForSetId($set_id, $filter = '', $doFlush = false, $doFullFlush = false, $itemsPerPage = 10)
    {
        $queryBuilder = $this->getQueryBuilder($this->tableName);
        $queryBuilder
            ->select('*')
            ->from($this->tableName)
            ->where(
                $queryBuilder->expr()->eq('set_id', $queryBuilder->createNamedParameter($set_id, \PDO::PARAM_INT))
            )
            ->orderBy('scheduled', 'DESC');

        // The flush needs the same filter as a stand-alone condition. It is collected in
        // an AND composite so that the individual parts are joined with "AND".
        $expressionBuilder = GeneralUtility::makeInstance(ConnectionPool::class)
            ->getConnectionForTable($this->tableName)
            ->getExpressionBuilder();
        $flushCondition = $expressionBuilder->andX();

        // FIXME: Write Unit tests for Filters
        switch ($filter) {
            case 'pending':
                $queryBuilder->andWhere($queryBuilder->expr()->eq('exec_time', 0));
                $flushCondition->add($expressionBuilder->eq('exec_time', 0));
                break;
            case 'finished':
                $queryBuilder->andWhere($queryBuilder->expr()->gt('exec_time', 0));
                $flushCondition->add($expressionBuilder->gt('exec_time', 0));
                break;
        }

        // FIXME: Write unit test that ensures that the right records are deleted.
        if ($doFlush) {
            $flushCondition->add($expressionBuilder->eq('set_id', (int)$set_id));
            $this->flushQueue($doFullFlush ? '' : (string) $flushCondition);
            return [];
        }

        if ($itemsPerPage > 0) {
            $queryBuilder->setMaxResults((int)$itemsPerPage);
        }

        return $queryBuilder->execute()->fetchAll();
    }
1047
1048
    /**
     * Removes queue entries
     *
     * If an observer is registered for the "queueEntryFlush" event, the entries about
     * to be removed are posted to the event dispatcher first, grouped by set_id.
     *
     * @param string $where SQL related filter for the entries which should be removed; an empty filter removes all entries ("1=1")
     * @return void
     */
    protected function flushQueue($where = ''): void
    {
        $realWhere = strlen((string) $where) > 0 ? $where : '1=1';

        if (EventDispatcher::getInstance()->hasObserver('queueEntryFlush')) {
            // selectLiteral() is required here: select() quotes its arguments as
            // identifiers, which would turn "DISTINCT set_id" into a column name.
            $groups = $this->getQueryBuilder($this->tableName)
                ->selectLiteral('DISTINCT set_id')
                ->from($this->tableName)
                ->where($realWhere)
                ->execute()
                ->fetchAll();

            foreach ($groups as $group) {
                // A separate builder per statement avoids carrying over query parts.
                $subSetQueryBuilder = $this->getQueryBuilder($this->tableName);
                // NOTE(review): "uid" is selected here while other queries on the queue
                // table use "qid" as key - confirm that the column exists.
                $subSet = $subSetQueryBuilder
                    ->select('uid', 'set_id')
                    ->from($this->tableName)
                    ->where(
                        $realWhere,
                        $subSetQueryBuilder->expr()->eq(
                            'set_id',
                            $subSetQueryBuilder->createNamedParameter((int)$group['set_id'], \PDO::PARAM_INT)
                        )
                    )
                    ->execute()
                    ->fetchAll();
                EventDispatcher::getInstance()->post('queueEntryFlush', $group['set_id'], $subSet);
            }
        }

        $this->getQueryBuilder($this->tableName)
            ->delete($this->tableName)
            ->where($realWhere)
            ->execute();
    }
1088
1089
    /**
     * Adds a call back entry to the queue table (called from hooks typically, see indexed search class "class.crawler.php")
     *
     * The call back reference is stored in the serialized parameters under the key
     * "_CALLBACKOBJ". Anything but an array given as $params is replaced by an empty array.
     *
     * @param integer $setId Set ID
     * @param array $params Parameters to pass to call back function
     * @param string $callBack Call back object reference, eg. 'EXT:indexed_search/class.crawler.php:&tx_indexedsearch_crawler'
     * @param integer $page_id Page ID to attach it to
     * @param integer $schedule Time at which to activate; 0 means "current time"
     * @return void
     */
    public function addQueueEntry_callBack($setId, $params, $callBack, $page_id = 0, $schedule = 0): void
    {
        $callBackParameters = is_array($params) ? $params : [];
        $callBackParameters['_CALLBACKOBJ'] = $callBack;

        $queueConnection = GeneralUtility::makeInstance(ConnectionPool::class)
            ->getConnectionForTable('tx_crawler_queue');

        // Fall back to the current time if no (or a zero) schedule time is given.
        $scheduledTime = (int)$schedule;
        if (!$scheduledTime) {
            $scheduledTime = $this->getCurrentTime();
        }

        $queueRecord = [
            'page_id' => (int)$page_id,
            'parameters' => serialize($callBackParameters),
            'scheduled' => $scheduledTime,
            'exec_time' => 0,
            'set_id' => (int)$setId,
            'result_data' => '',
        ];

        $queueConnection->insert('tx_crawler_queue', $queueRecord);
    }
1119
1120
    /************************************
1121
     *
1122
     * URL setting
1123
     *
1124
     ************************************/
1125
1126
    /**
     * Setting a URL for crawling:
     *
     * Builds the queue record for the URL and either registers it internally only
     * (if $this->registerQueueEntriesInternallyOnly is set) or inserts it into the
     * queue table, unless an equal entry is found by getDuplicateRowsIfExist().
     * The events "urlAddedToQueue" / "duplicateUrlInQueue" are posted accordingly.
     *
     * Keys read from $subCfg (all optional): "userGroups", "procInstrFilter",
     * "procInstrParams." and "key".
     *
     * @param integer $id Page ID
     * @param string $url Complete URL
     * @param array $subCfg Sub configuration array (from TS config)
     * @param integer $tstamp Scheduled-time
     * @param string $configurationHash (optional) configuration hash
     * @param bool $skipInnerDuplicationCheck (optional) skip inner duplication check
     * @return bool TRUE only if a new entry was inserted into the queue table
     */
    public function addUrl(
        $id,
        $url,
        array $subCfg,
        $tstamp,
        $configurationHash = '',
        $skipInnerDuplicationCheck = false
    ) {
        // Creating parameters:
        $parameters = [
            'url' => $url,
        ];

        // fe user group simulation:
        $uGs = implode(
            ',',
            array_unique(GeneralUtility::intExplode(',', (string)($subCfg['userGroups'] ?? ''), true))
        );
        if ($uGs) {
            $parameters['feUserGroupList'] = $uGs;
        }

        // Setting processing instructions
        $parameters['procInstructions'] = GeneralUtility::trimExplode(',', (string)($subCfg['procInstrFilter'] ?? ''));
        if (is_array($subCfg['procInstrParams.'] ?? null)) {
            $parameters['procInstrParams'] = $subCfg['procInstrParams.'];
        }

        // Compile value array:
        $parameters_serialized = serialize($parameters);
        $fieldArray = [
            'page_id' => (int)$id,
            'parameters' => $parameters_serialized,
            'parameters_hash' => GeneralUtility::shortMD5($parameters_serialized),
            'configuration_hash' => $configurationHash,
            'scheduled' => $tstamp,
            'exec_time' => 0,
            'set_id' => (int)$this->setID,
            'result_data' => '',
            'configuration' => $subCfg['key'] ?? null,
        ];

        if ($this->registerQueueEntriesInternallyOnly) {
            // the entries will only be registered and not stored to the database
            $this->queueEntries[] = $fieldArray;
            return false;
        }

        // check if there is already an equal entry (unless the check is skipped)
        $rows = $skipInnerDuplicationCheck ? [] : $this->getDuplicateRowsIfExist($tstamp, $fieldArray);
        if (!empty($rows)) {
            EventDispatcher::getInstance()->post('duplicateUrlInQueue', $this->setID, ['rows' => $rows, 'fieldArray' => $fieldArray]);
            return false;
        }

        $connectionForCrawlerQueue = GeneralUtility::makeInstance(ConnectionPool::class)->getConnectionForTable('tx_crawler_queue');
        $connectionForCrawlerQueue->insert(
            'tx_crawler_queue',
            $fieldArray
        );
        $uid = $connectionForCrawlerQueue->lastInsertId('tx_crawler_queue', 'qid');
        EventDispatcher::getInstance()->post('urlAddedToQueue', $this->setID, ['uid' => $uid, 'fieldArray' => $fieldArray]);

        return true;
    }
1205
1206
    /**
     * Looks up queue entries that count as duplicates of the given entry, i.e. entries for
     * the same page and parameters hash which have neither an exec_time nor a process_id.
     *
     * The schedule time is matched depending on the given timestamp:
     * - timestamp in the future: the queued entry needs exactly the same schedule time
     * - timestamp now or in the past: the queued entry has to be scheduled now or in the
     *   past; with the extension setting "enableTimeslot" entries scheduled within
     *   100 seconds around the current time match as well
     *
     * @param int $tstamp
     * @param array $fieldArray Queue record; "page_id" and "parameters_hash" are used
     *
     * @return array List of the "qid" values of the duplicates found
     */
    protected function getDuplicateRowsIfExist($tstamp, $fieldArray)
    {
        $now = $this->getCurrentTime();

        $queryBuilder = GeneralUtility::makeInstance(ConnectionPool::class)->getQueryBuilderForTable($this->tableName);
        $queryBuilder
            ->select('qid')
            ->from('tx_crawler_queue');

        if ($tstamp > $now) {
            // entry with a timestamp in the future need to have the same schedule time
            $queryBuilder->where(
                $queryBuilder->expr()->eq('scheduled', $tstamp)
            );
        } elseif ($tstamp <= $now) {
            // this entry is scheduled with "now"
            $scheduledUntilNow = $queryBuilder->expr()->lte('scheduled', $now);
            if ($this->extensionSettings['enableTimeslot']) {
                $slotBegin = $now - 100;
                $slotEnd = $now + 100;
                $queryBuilder
                    ->where('scheduled BETWEEN ' . $slotBegin . ' AND ' . $slotEnd)
                    ->orWhere($scheduledUntilNow);
            } else {
                $queryBuilder->where($scheduledUntilNow);
            }
        }

        // Only entries that are neither executed nor assigned to a process are duplicates.
        $queryBuilder->andWhere('NOT exec_time');
        $queryBuilder->andWhere('NOT process_id');

        $pageIdParameter = $queryBuilder->createNamedParameter($fieldArray['page_id'], \PDO::PARAM_INT);
        $queryBuilder->andWhere($queryBuilder->expr()->eq('page_id', $pageIdParameter));

        $hashParameter = $queryBuilder->createNamedParameter($fieldArray['parameters_hash'], \PDO::PARAM_STR);
        $queryBuilder->andWhere($queryBuilder->expr()->eq('parameters_hash', $hashParameter));

        return array_column($queryBuilder->execute()->fetchAll(), 'qid');
    }
1267
1268
    /**
     * Returns the current system time as reported by time()
     *
     * @return int Current Unix timestamp
     */
    public function getCurrentTime()
    {
        $now = time();

        return $now;
    }
1277
1278
    /************************************
1279
     *
1280
     * URL reading
1281
     *
1282
     ************************************/
1283
1284
    /**
     * Read URL for single queue entry
     *
     * Loads the queue record, locks it by setting exec_time, lets the queue executor
     * process it and stores the serialized result in the record. The signals
     * SIGNNAL_QUEUEITEM_PREPROCESS and SIGNNAL_QUEUEITEM_POSTPROCESS are emitted before
     * the execution and before the result is written.
     *
     * @param integer $queueId
     * @param boolean $force If set, will process even if exec_time has been set!
     * @return integer 0 by default (also if no matching queue record was found); the bit
     *                 self::CLI_STATUS_POLLABLE_PROCESSED is set if a processing instruction
     *                 registered in $GLOBALS['TYPO3_CONF_VARS']['EXTCONF']['crawler']['pollSuccess']
     *                 was part of the result and reported success
     */
    public function readUrl($queueId, $force = false)
    {
        $queryBuilder = GeneralUtility::makeInstance(ConnectionPool::class)->getQueryBuilderForTable($this->tableName);
        $ret = 0;
        $this->logger->debug('crawler-readurl start ' . microtime(true));

        // Get entry:
        $queryBuilder
            ->select('*')
            ->from('tx_crawler_queue')
            ->where(
                $queryBuilder->expr()->eq('qid', $queryBuilder->createNamedParameter($queueId, \PDO::PARAM_INT))
            );
        if (!$force) {
            $queryBuilder
                ->andWhere('exec_time = 0')
                ->andWhere('process_scheduled > 0');
        }
        $queueRec = $queryBuilder->execute()->fetch();

        // Nothing to process: return the documented integer instead of NULL.
        if (!is_array($queueRec)) {
            return $ret;
        }

        SignalSlotUtility::emitSignal(
            __CLASS__,
            SignalSlotUtility::SIGNNAL_QUEUEITEM_PREPROCESS,
            [$queueId, &$queueRec]
        );

        // Set exec_time to lock record:
        $field_array = ['exec_time' => $this->getCurrentTime()];

        if (isset($this->processID)) {
            // if multiprocessing is used we need to store the id of the process which has handled this entry
            $field_array['process_id_completed'] = $this->processID;
        }

        GeneralUtility::makeInstance(ConnectionPool::class)->getConnectionForTable('tx_crawler_queue')
            ->update(
                'tx_crawler_queue',
                $field_array,
                ['qid' => (int)$queueId]
            );

        $result = $this->queueExecutor->executeQueueItem($queueRec, $this);

        // The executor result is only unserialized if it is an array with string content.
        // Anything else is treated as "no result data"; under strict_types a non-string
        // content would make unserialize() throw a TypeError.
        // NOTE(review): unserialize() is applied to content delivered by the processed
        // request - confirm that this source is trusted or restrict "allowed_classes".
        $resultData = [];
        if (is_array($result) && is_string($result['content'] ?? null)) {
            $unserializedContent = unserialize($result['content']);
            if (is_array($unserializedContent)) {
                $resultData = $unserializedContent;
            }
        }

        // atm there's no need to point to specific pollable extensions
        $pollables = $GLOBALS['TYPO3_CONF_VARS']['EXTCONF']['crawler']['pollSuccess'] ?? null;
        $procInstructions = $resultData['parameters']['procInstructions'] ?? null;
        if (is_array($pollables) && is_array($procInstructions)) {
            foreach ($pollables as $pollable) {
                // only check the success value if the instruction is running
                // it is important to name the pollSuccess key same as the procInstructions key
                if (in_array($pollable, $procInstructions) && !empty($resultData['success'][$pollable])) {
                    $ret |= self::CLI_STATUS_POLLABLE_PROCESSED;
                }
            }
        }

        // Set result in log which also denotes the end of the processing of this entry.
        $field_array = ['result_data' => serialize($result)];

        SignalSlotUtility::emitSignal(
            __CLASS__,
            SignalSlotUtility::SIGNNAL_QUEUEITEM_POSTPROCESS,
            [$queueId, &$field_array]
        );

        GeneralUtility::makeInstance(ConnectionPool::class)->getConnectionForTable('tx_crawler_queue')
            ->update(
                'tx_crawler_queue',
                $field_array,
                ['qid' => (int)$queueId]
            );

        $this->logger->debug('crawler-readurl stop ' . microtime(true));
        return $ret;
    }
1374
1375
    /**
     * Read URL for not-yet-inserted log-entry
     *
     * The record is inserted into the queue table with exec_time set to the current
     * time, processed by the queue executor and finally updated with the serialized
     * result. The signal SIGNNAL_QUEUEITEM_POSTPROCESS is emitted before the update.
     *
     * @param array $field_array Queue field array,
     *
     * @return array|bool|string The result as returned by the queue executor
     */
    public function readUrlFromArray($field_array)
    {
        // Set exec_time to lock record:
        $field_array['exec_time'] = $this->getCurrentTime();

        $queueConnection = GeneralUtility::makeInstance(ConnectionPool::class)->getConnectionForTable($this->tableName);
        $queueConnection->insert($this->tableName, $field_array);

        // The executor gets the record including its new queue id.
        $queueId = $queueConnection->lastInsertId($this->tableName, 'qid');
        $field_array['qid'] = $queueId;

        $result = $this->queueExecutor->executeQueueItem($field_array, $this);

        // Set result in log which also denotes the end of the processing of this entry.
        $resultFields = ['result_data' => serialize($result)];

        SignalSlotUtility::emitSignal(
            __CLASS__,
            SignalSlotUtility::SIGNNAL_QUEUEITEM_POSTPROCESS,
            [$queueId, &$resultFields]
        );

        $queueConnection->update($this->tableName, $resultFields, ['qid' => $queueId]);

        return $result;
    }
1412
1413
    /*****************************
1414
     *
1415
     * Compiling URLs to crawl - tools
1416
     *
1417
     *****************************/
1418
1419
    /**
     * Builds the page tree starting at the given page and renders the crawler rows for every page in it.
     *
     * The arguments are stored on the controller first, because drawURLs_addRowsForPage()
     * reads them from there. For mount point pages the rows of the mounted subtree are
     * rendered in addition to the row of the mount point page itself.
     *
     * @param int $id Root page id to start from.
     * @param int $depth Depth of tree, 0=only id-page, 1= on sublevel, 99 = infinite
     * @param int $scheduledTime Unix Time when the URL is timed to be visited when put in queue
     * @param int $reqMinute Number of requests per minute (creates the interleave between requests)
     * @param bool $submitCrawlUrls If set, submits the URLs to queue in database (real crawling)
     * @param bool $downloadCrawlUrls If set (and submitcrawlUrls is false) will fill $downloadUrls with entries)
     * @param array $incomingProcInstructions Array of processing instructions
     * @param array $configurationSelection Array of configuration keys
     * @return string Concatenated output of drawURLs_addRowsForPage() for all pages
     */
    public function getPageTreeAndUrls(
        $id,
        $depth,
        $scheduledTime,
        $reqMinute,
        $submitCrawlUrls,
        $downloadCrawlUrls,
        array $incomingProcInstructions,
        array $configurationSelection
    ) {
        $this->scheduledTime = $scheduledTime;
        $this->reqMinute = $reqMinute;
        $this->submitCrawlUrls = $submitCrawlUrls;
        $this->downloadCrawlUrls = $downloadCrawlUrls;
        $this->incomingProcInstructions = $incomingProcInstructions;
        $this->incomingConfigurationSelection = $configurationSelection;

        $this->duplicateTrack = [];
        $this->downloadUrls = [];

        // Drawing tree:
        /* @var PageTreeView $tree */
        $tree = GeneralUtility::makeInstance(PageTreeView::class);
        $perms_clause = $this->getBackendUser()->getPagePermsClause(Permission::PAGE_SHOW);
        $tree->init('AND ' . $perms_clause);

        $pageInfo = BackendUtility::readPageAccess($id, $perms_clause);
        if (is_array($pageInfo)) {
            // Set root row:
            $tree->tree[] = [
                'row' => $pageInfo,
                'HTML' => $this->iconFactory->getIconForRecord('pages', $pageInfo, Icon::SIZE_SMALL),
            ];
        }

        // Get branch beneath:
        if ($depth) {
            $tree->getTree($id, $depth, '');
        }

        // Traverse page tree:
        $code = '';

        foreach ($tree->tree as $data) {
            $this->MP = false;

            // recognize mount points
            if ((int)$data['row']['doktype'] === PageRepository::DOKTYPE_MOUNTPOINT) {
                // The lookup targets "pages", so the builder is requested for that table
                $queryBuilder = GeneralUtility::makeInstance(ConnectionPool::class)->getQueryBuilderForTable('pages');
                $queryBuilder->getRestrictions()->removeAll()->add(GeneralUtility::makeInstance(DeletedRestriction::class));
                $mountPage = $queryBuilder
                    ->select('*')
                    ->from('pages')
                    ->where(
                        $queryBuilder->expr()->eq('uid', $queryBuilder->createNamedParameter($data['row']['uid'], \PDO::PARAM_INT))
                    )
                    ->execute()
                    ->fetch();

                // Without a record there is nothing mounted; the page is then handled like a regular page
                if (is_array($mountPage)) {
                    // fetch mounted pages
                    $this->MP = $mountPage['mount_pid'] . '-' . $data['row']['uid'];

                    $mountTree = GeneralUtility::makeInstance(PageTreeView::class);
                    $mountTree->init('AND ' . $perms_clause);
                    $mountTree->getTree($mountPage['mount_pid'], $depth);

                    foreach ($mountTree->tree as $mountData) {
                        $code .= $this->drawURLs_addRowsForPage(
                            $mountData['row'],
                            $mountData['HTML'] . BackendUtility::getRecordTitle('pages', $mountData['row'], true)
                        );
                    }

                    // replace page when mount_pid_ol is enabled
                    if ($mountPage['mount_pid_ol']) {
                        $data['row']['uid'] = $mountPage['mount_pid'];
                    } else {
                        // if the mount_pid_ol is not set the MP must not be used for the mountpoint page
                        $this->MP = false;
                    }
                }
            }

            $code .= $this->drawURLs_addRowsForPage(
                $data['row'],
                $data['HTML'] . BackendUtility::getRecordTitle('pages', $data['row'], true)
            );
        }

        return $code;
    }
1521
1522
    /**
     * Expands an exclude string into the list of page ids it covers.
     *
     * The string is a comma separated list of entries of the form "<pid>" (only that page),
     * "<pid>+<depth>" (page plus <depth> levels of its subtree) or "<pid>+" (page plus 99 levels).
     * Expanded strings and loaded subtrees are memoized in static caches.
     *
     * @param string $excludeString Exclude string
     * @return array Unique page ids; keys are preserved by array_unique() and therefore not consecutive
     */
    public function expandExcludeString($excludeString)
    {
        // internal static caches;
        static $expandedExcludeStringCache = [];
        static $treeCache = [];

        $cacheKey = (string)$excludeString;
        // isset() instead of empty(): an empty result is a valid cache entry as well
        if (isset($expandedExcludeStringCache[$cacheKey])) {
            return $expandedExcludeStringCache[$cacheKey];
        }

        $pidList = [];

        if (!empty($excludeString)) {
            /** @var PageTreeView $tree */
            $tree = GeneralUtility::makeInstance(PageTreeView::class);
            $tree->init('AND ' . $this->getBackendUser()->getPagePermsClause(Permission::PAGE_SHOW));

            $excludeParts = GeneralUtility::trimExplode(',', $excludeString);

            foreach ($excludeParts as $excludePart) {
                // Pad to two elements: entries without "+" have no depth part
                [$pid, $depth] = array_pad(GeneralUtility::trimExplode('+', $excludePart), 2, '');

                // default is "page only" = "depth=0"
                if (empty($depth)) {
                    $depth = (stristr($excludePart, '+')) ? 99 : 0;
                }
                $depth = (int)$depth;

                $pidList[] = $pid;

                if ($depth > 0) {
                    if (empty($treeCache[$pid][$depth])) {
                        $tree->reset();
                        $tree->getTree($pid, $depth);
                        $treeCache[$pid][$depth] = $tree->tree;
                    }

                    foreach ($treeCache[$pid][$depth] as $data) {
                        $pidList[] = $data['row']['uid'];
                    }
                }
            }
        }

        $expandedExcludeStringCache[$cacheKey] = array_unique($pidList);

        return $expandedExcludeStringCache[$cacheKey];
    }
1573
1574
    /**
     * Create the rows for display of the page tree
     * For each page a number of rows are shown displaying GET variable configuration
     *
     * Configurations not contained in $this->incomingConfigurationSelection are dropped when a
     * selection is set. A page listed in a configuration's "exclude" setting gets a short
     * "excluded" row for that configuration instead of the URL list.
     *
     * @param array $pageRow Page row
     * @param string $pageTitleAndIcon Page icon and title for row
     * @return string HTML <tr> content (one or more)
     */
    public function drawURLs_addRowsForPage(array $pageRow, $pageTitleAndIcon)
    {
        $skipMessage = '';

        // Get list of configurations
        $configurations = $this->getUrlsForPageRow($pageRow, $skipMessage);

        if (!empty($this->incomingConfigurationSelection)) {
            // remove configuration that does not match the current selection
            foreach ($configurations as $confKey => $confArray) {
                // loose comparison on purpose: numeric configuration keys arrive as integers
                if (!in_array($confKey, $this->incomingConfigurationSelection)) {
                    unset($configurations[$confKey]);
                }
            }
        }

        if (empty($configurations)) {
            $message = !empty($skipMessage) ? ' (' . $skipMessage . ')' : '';

            // Compile row:
            return '
                <tr>
                    <td>' . $pageTitleAndIcon . '</td>
                    <td colspan="6"><em>No entries</em>' . $message . '</td>
                </tr>';
        }

        // Traverse parameter combinations:
        $content = '';
        $isFirstRow = true;
        foreach ($configurations as $confKey => $confArray) {
            // Title column: rendered once, spanning the rows of all configurations
            $titleClm = $isFirstRow
                ? '<td rowspan="' . count($configurations) . '">' . $pageTitleAndIcon . '</td>'
                : '';
            $isFirstRow = false;

            // Array keys may be integers; htmlspecialchars() requires a string under strict_types
            $confKeyLabel = htmlspecialchars((string)$confKey);
            $subCfg = is_array($confArray['subCfg'] ?? null) ? $confArray['subCfg'] : [];

            // loose comparison on purpose: the list mixes ids from configuration strings and page rows
            if (in_array($pageRow['uid'], $this->expandExcludeString($subCfg['exclude'] ?? ''))) {
                $content .= '<tr>
                            ' . $titleClm . '
                            <td>' . $confKeyLabel . '</td>
                            <td colspan="5"><em>No entries</em> (Page is excluded in this configuration)</td>
                        </tr>';
                continue;
            }

            // URL list:
            $urlList = $this->urlListFromUrlArray(
                $confArray,
                $pageRow,
                $this->scheduledTime,
                $this->reqMinute,
                $this->submitCrawlUrls,
                $this->downloadCrawlUrls,
                $this->duplicateTrack,
                $this->downloadUrls,
                $this->incomingProcInstructions // if empty the urls won't be filtered by processing instructions
            );

            $parsedParameters = nl2br(htmlspecialchars(rawurldecode(trim(str_replace(
                '&',
                chr(10) . '&',
                GeneralUtility::implodeArrayForUrl('', $confArray['paramParsed'] ?? [])
            )))));

            // Compile row:
            $content .= '
                        <tr>
                            ' . $titleClm . '
                            <td>' . $confKeyLabel . '</td>
                            <td>' . $parsedParameters . '</td>
                            <td>' . $this->drawURLs_renderExpandedParameters($confArray['paramExpanded'] ?? []) . '</td>
                            <td nowrap="nowrap">' . $urlList . '</td>
                            <td nowrap="nowrap">' . $this->drawURLs_renderOptionValues($subCfg) . '</td>
                            <td nowrap="nowrap">' . DebugUtility::viewArray($subCfg['procInstrParams.'] ?? null) . '</td>
                        </tr>';
        }

        return $content;
    }

    /**
     * Renders the expanded GET parameter values of one configuration as a table, followed by the combination count.
     *
     * @param array $paramExpanded Map of GET variable name to the list of its expanded values
     * @return string HTML
     */
    private function drawURLs_renderExpandedParameters(array $paramExpanded): string
    {
        $rows = '';
        $valueCounts = [];
        foreach ($paramExpanded as $gVar => $gVal) {
            $valueCount = count($gVal);
            $valueCounts[] = $valueCount;
            $rows .= '
                            <tr>
                                <td>' . htmlspecialchars('&' . $gVar . '=') . '<br/>' .
                                '(' . $valueCount . ')' .
                                '</td>
                                <td nowrap="nowrap">' . nl2br(htmlspecialchars(implode(chr(10), $gVal))) . '</td>
                            </tr>
                        ';
        }

        // array_product() of an empty list is 1, which matches "no parameters = one combination"
        return '<table>' . $rows . '</table>'
            . 'Comb: ' . implode('*', $valueCounts) . '=' . array_product($valueCounts);
    }

    /**
     * Renders the "userGroups" and "procInstrFilter" options of a configuration, one per line.
     *
     * @param array $subCfg The "subCfg" part of a configuration
     * @return string HTML, empty when neither option is set
     */
    private function drawURLs_renderOptionValues(array $subCfg): string
    {
        $optionValues = '';
        if (!empty($subCfg['userGroups'])) {
            $optionValues .= 'User Groups: ' . htmlspecialchars((string)$subCfg['userGroups']) . '<br/>';
        }
        if (!empty($subCfg['procInstrFilter'])) {
            $optionValues .= 'ProcInstr: ' . htmlspecialchars((string)$subCfg['procInstrFilter']) . '<br/>';
        }

        return $optionValues;
    }
1688
1689
    /*****************************
1690
     *
1691
     * CLI functions
1692
     *
1693
     *****************************/
1694
1695
    /**
     * Running the functionality of the CLI (crawling URLs from queue)
     *
     * Runs the CLI hooks, purges old executed entries if "purgeQueueDays" is set, reserves up to
     * $countInARun due queue entries for the current process and executes them one by one.
     *
     * @param int $countInARun Maximum number of queue entries to fetch for this run
     * @param int $sleepTime Pause after each processed entry (passed to usleep(), i.e. microseconds)
     * @param int $sleepAfterFinish Pause after the run (passed to sleep(), i.e. seconds)
     * @return int Bitmask of self::CLI_STATUS_* flags
     */
    public function CLI_run($countInARun, $sleepTime, $sleepAfterFinish)
    {
        $result = 0;
        $counter = 0;

        // First, run hooks:
        $this->CLI_runHooks();

        // Clean up the queue
        $purgeQueueDays = (int)($this->extensionSettings['purgeQueueDays'] ?? 0);
        if ($purgeQueueDays > 0) {
            $purgeDate = $this->getCurrentTime() - 24 * 60 * 60 * $purgeQueueDays;

            $queryBuilderDelete = GeneralUtility::makeInstance(ConnectionPool::class)->getQueryBuilderForTable($this->tableName);
            $del = $queryBuilderDelete
                ->delete($this->tableName)
                ->where(
                    'exec_time != 0 AND exec_time < ' . (int)$purgeDate
                )->execute();

            if ($del === false) {
                $this->logger->info(
                    'Records could not be deleted.'
                );
            }
        }

        // Select entries:
        //TODO Shouldn't this reside within the transaction?
        $queryBuilderSelect = GeneralUtility::makeInstance(ConnectionPool::class)->getQueryBuilderForTable($this->tableName);
        $rows = $queryBuilderSelect
            ->select('qid', 'scheduled')
            ->from($this->tableName)
            ->where(
                $queryBuilderSelect->expr()->eq('exec_time', 0),
                $queryBuilderSelect->expr()->eq('process_scheduled', 0),
                $queryBuilderSelect->expr()->lte('scheduled', $this->getCurrentTime())
            )
            ->orderBy('scheduled')
            ->addOrderBy('qid')
            ->setMaxResults($countInARun)
            ->execute()
            ->fetchAll();

        if (empty($rows)) {
            $this->CLI_debug("Nothing within queue which needs to be processed (" . $this->CLI_buildProcessId() . ")");

            return $result;
        }

        // The ids end up verbatim in an IN() list, so make sure they are integers
        $quidList = array_map('intval', array_column($rows, 'qid'));

        $processId = $this->CLI_buildProcessId();

        // reserve queue entries for process
        // NOTE: transaction handling (BEGIN / COMMIT / ROLLBACK) around the reservation is currently disabled
        //TODO make sure we're not taking assigned queue-entires

        //save the number of assigned queue entrys to determine who many have been processed later
        $queryBuilderUpdate = GeneralUtility::makeInstance(ConnectionPool::class)->getQueryBuilderForTable($this->tableName);
        $numberOfAffectedRows = $queryBuilderUpdate
            ->update($this->tableName)
            ->where(
                $queryBuilderUpdate->expr()->in('qid', $quidList)
            )
            ->set('process_scheduled', $this->getCurrentTime())
            ->set('process_id', $processId)
            ->execute();

        GeneralUtility::makeInstance(ConnectionPool::class)->getConnectionForTable('tx_crawler_process')
            ->update(
                'tx_crawler_process',
                ['assigned_items_count' => (int)$numberOfAffectedRows],
                ['process_id' => $processId]
            );

        if ((int)$numberOfAffectedRows !== count($quidList)) {
            $this->CLI_debug("Nothing processed due to multi-process collision (" . $this->CLI_buildProcessId() . ")");

            return ($result | self::CLI_STATUS_ABORTED);
        }

        foreach ($rows as $r) {
            $result |= $this->readUrl($r['qid']);

            $counter++;
            usleep((int)$sleepTime); // Just to relax the system

            // if during the start and the current read url the cli has been disable we need to return from the function
            // mark the process NOT as ended.
            if ($this->getDisabled()) {
                return ($result | self::CLI_STATUS_ABORTED);
            }

            if (!$this->processRepository->isProcessActive($this->CLI_buildProcessId())) {
                $this->CLI_debug("conflict / timeout (" . $this->CLI_buildProcessId() . ")");

                //TODO might need an additional returncode
                $result |= self::CLI_STATUS_ABORTED;
                break; //possible timeout
            }
        }

        sleep((int)$sleepAfterFinish);

        $msg = 'Rows: ' . $counter;
        $this->CLI_debug($msg . " (" . $this->CLI_buildProcessId() . ")");

        if ($counter > 0) {
            $result |= self::CLI_STATUS_PROCESSED;
        }

        return $result;
    }
1821
1822
    /**
     * Instantiates every class registered in
     * $GLOBALS['TYPO3_CONF_VARS']['EXTCONF']['crawler']['cli_hooks'] and calls its
     * crawler_init() method with this controller.
     *
     * @return void
     */
    public function CLI_runHooks(): void
    {
        $registeredHooks = $GLOBALS['TYPO3_CONF_VARS']['EXTCONF']['crawler']['cli_hooks'] ?? [];

        foreach ($registeredHooks as $hookClassName) {
            $hook = GeneralUtility::makeInstance($hookClassName);
            if (!is_object($hook)) {
                continue;
            }

            $hook->crawler_init($this);
        }
    }
1836
1837
    /**
     * Try to acquire a new process with the given id
     * also performs some auto-cleanup for orphan processes
     * @todo preemption might not be the most elegant way to clean up
     *
     * Active processes whose ttl lies in the past are treated as orphans: they do not count
     * against the "processLimit" setting and are released at the end of this call.
     *
     * @param string $id identification string for the process
     * @return bool TRUE if the process was registered, FALSE if the process limit is reached
     *              or no system process id is available
     */
    public function CLI_checkAndAcquireNewProcess($id)
    {
        $systemProcessId = getmypid();
        // getmypid() returns false on failure
        if ($systemProcessId === false || $systemProcessId < 1) {
            return false;
        }

        $connectionPool = GeneralUtility::makeInstance(ConnectionPool::class);
        $processLimit = (int)$this->extensionSettings['processLimit'];
        $currentTime = $this->getCurrentTime();

        // NOTE: transaction handling (BEGIN) around the check is currently disabled

        // The query reads tx_crawler_process, so the builder is requested for that table
        $statement = $connectionPool->getQueryBuilderForTable('tx_crawler_process')
            ->select('process_id', 'ttl')
            ->from('tx_crawler_process')
            ->where(
                'active = 1 AND deleted = 0'
            )
            ->execute();

        $processCount = 0;
        $orphanProcesses = [];
        while ($row = $statement->fetch()) {
            if ($row['ttl'] < $currentTime) {
                $orphanProcesses[] = $row['process_id'];
            } else {
                $processCount++;
            }
        }

        // if there are less than allowed active processes then add a new one
        $ret = $processCount < $processLimit;
        if ($ret) {
            $this->CLI_debug("add process " . $this->CLI_buildProcessId() . " (" . ($processCount + 1) . "/" . $processLimit . ")");

            $connectionPool->getConnectionForTable('tx_crawler_process')->insert(
                'tx_crawler_process',
                [
                    'process_id' => $id,
                    'active' => 1,
                    'ttl' => $currentTime + (int)$this->extensionSettings['processMaxRunTime'],
                    'system_process_id' => $systemProcessId,
                ]
            );
        } else {
            $this->CLI_debug("Processlimit reached (" . ($processCount) . "/" . $processLimit . ")");
        }

        $this->processRepository->deleteProcessesMarkedAsDeleted();
        $this->CLI_releaseProcesses($orphanProcesses, true); // maybe this should be somehow included into the current lock

        return $ret;
    }
1901
1902
    /**
     * Release a process and the required resources
     *
     * Runs four independent UPDATE statements, each with its own query builder so that
     * SET parts and parameters of one statement cannot leak into the next one.
     *
     * @param  mixed $releaseIds string with a single process-id or array with multiple process-ids
     * @param  bool  $withinLock show whether the DB-actions are included within an existing lock;
     *                           currently without effect because transaction handling is disabled
     * @return bool FALSE if there was nothing to release, TRUE otherwise
     */
    public function CLI_releaseProcesses($releaseIds, $withinLock = false)
    {
        if (!is_array($releaseIds)) {
            $releaseIds = [$releaseIds];
        }

        if (empty($releaseIds)) {
            return false;   //nothing to release
        }

        $connectionPool = GeneralUtility::makeInstance(ConnectionPool::class);

        // NOTE: BEGIN / COMMIT around these statements (when !$withinLock) is currently disabled

        // some kind of 2nd chance algo - this way you need at least 2 processes to have a real cleanup
        // this ensures that a single process can't mess up the entire process table

        // detach all queue entries that are still assigned to processes which are not active
        $queueBuilder = $connectionPool->getQueryBuilderForTable($this->tableName);
        $queueBuilder
            ->update($this->tableName, 'q')
            ->where(
                'q.process_id IN(SELECT p.process_id FROM tx_crawler_process as p WHERE p.active = 0)'
            )
            ->set('q.process_scheduled', 0)
            ->set('q.process_id', '')
            ->execute();

        // FIXME: Not entirely sure that this is equivalent to the previous version.
        // Previous version for reference: it marked inactive processes WITHOUT waiting queue entries as deleted:
        //   UPDATE tx_crawler_process SET deleted = 1, system_process_id = 0
        //   WHERE active=0 AND deleted=0
        //   AND NOT EXISTS (
        //       SELECT * FROM tx_crawler_queue
        //       WHERE tx_crawler_queue.process_id = tx_crawler_process.process_id
        //       AND tx_crawler_queue.exec_time = 0
        //   )
        $processBuilder = $connectionPool->getQueryBuilderForTable('tx_crawler_process');
        $processBuilder
            ->update('tx_crawler_process')
            ->where(
                $processBuilder->expr()->eq('active', 0),
                'process_id IN(SELECT q.process_id FROM tx_crawler_queue as q WHERE q.exec_time = 0)'
            )
            ->set('system_process_id', 0)
            ->execute();

        // mark all requested processes as non-active
        // NOTE(review): with the formerly shared builder this statement also carried over
        // "system_process_id = 0" from the statement above; it now only sets "active" as described.
        $deactivateBuilder = $connectionPool->getQueryBuilderForTable('tx_crawler_process');
        $deactivateBuilder
            ->update('tx_crawler_process')
            ->where(
                'NOT EXISTS (
                SELECT * FROM tx_crawler_queue
                    WHERE tx_crawler_queue.process_id = tx_crawler_process.process_id
                    AND tx_crawler_queue.exec_time = 0
                )',
                $deactivateBuilder->expr()->in('process_id', $deactivateBuilder->createNamedParameter($releaseIds, Connection::PARAM_STR_ARRAY)),
                $deactivateBuilder->expr()->eq('deleted', 0)
            )
            ->set('active', 0)
            ->execute();

        // hand the not yet executed queue entries of the requested processes back to the queue
        $releaseBuilder = $connectionPool->getQueryBuilderForTable($this->tableName);
        $releaseBuilder
            ->update($this->tableName)
            ->where(
                $releaseBuilder->expr()->eq('exec_time', 0),
                $releaseBuilder->expr()->in('process_id', $releaseBuilder->createNamedParameter($releaseIds, Connection::PARAM_STR_ARRAY))
            )
            ->set('process_scheduled', 0)
            ->set('process_id', '')
            ->execute();

        return true;
    }
1996
1997
    /**
     * Returns the id of the current process.
     *
     * The id is generated from the current microtime on first use and stored in
     * $this->processID, so later calls return the same value.
     *
     * @return string  the ID
     */
    public function CLI_buildProcessId()
    {
        if ($this->processID) {
            return $this->processID;
        }

        $this->processID = GeneralUtility::shortMD5(microtime(true));

        return $this->processID;
    }
2009
2010
    /**
     * Echoes a message followed by a line break and flushes the output,
     * but only when the "processDebug" extension setting is enabled.
     *
     * @param  string $msg  the message
     */
    public function CLI_debug($msg): void
    {
        $debugEnabled = (int)$this->extensionSettings['processDebug'] !== 0;
        if (!$debugEnabled) {
            return;
        }

        echo $msg . "\n";
        flush();
    }
2022
2023
    /**
     * Cleans up entries that stayed for too long in the queue. These are:
     * - processed entries older than the "cleanUpProcessedAge" setting (in days)
     * - entries scheduled longer ago than the "cleanUpScheduledAge" setting (in days)
     *
     * @return void
     */
    public function cleanUpOldQueueEntries(): void
    {
        $secondsPerDay = 86400; // 24*60*60 Seconds in 24 hours

        // Cast explicitly: multiplying a non-numeric string fails on PHP 8, and only whole
        // seconds should end up in the SQL condition below.
        $processedAgeInSeconds = (int)round((float)$this->extensionSettings['cleanUpProcessedAge'] * $secondsPerDay);
        $scheduledAgeInSeconds = (int)round((float)$this->extensionSettings['cleanUpScheduledAge'] * $secondsPerDay);

        $now = time();
        $condition = '(exec_time<>0 AND exec_time<' . ($now - $processedAgeInSeconds) . ') OR scheduled<=' . ($now - $scheduledAgeInSeconds);
        $this->flushQueue($condition);
    }
2039
2040
    /**
     * Calculates the md5 hash of the serialized configuration array.
     * The "paramExpanded" and "URLs" entries are left out of the hash.
     *
     * @param array $configuration
     *
     * @return string
     */
    protected function getConfigurationHash(array $configuration)
    {
        $hashRelevantConfiguration = array_diff_key(
            $configuration,
            ['paramExpanded' => true, 'URLs' => true]
        );

        return md5(serialize($hashRelevantConfiguration));
    }
2053
2054
    /**
     * Build a URL from a Page and the Query String. If the page has a Site configuration, it can be built by using
     * the Site instance.
     *
     * With a Site, the "id" parameter is dropped, "L" is handed to the router as "_language" and host, scheme
     * and port are taken from $alternativeBaseUrl if one is given. Without a Site, an index.php URL
     * including a cHash is built on top of $alternativeBaseUrl or TYPO3_SITE_URL.
     *
     * @param int $pageId
     * @param string $queryString
     * @param string|null $alternativeBaseUrl
     * @param int $httpsOrHttp see tx_crawler_configuration.force_ssl; -1 forces http, 1 forces https,
     *                         any other value keeps the scheme
     * @return UriInterface
     * @throws \TYPO3\CMS\Core\Exception\SiteNotFoundException
     * @throws \TYPO3\CMS\Core\Routing\InvalidRouteArgumentsException
     */
    protected function getUrlFromPageAndQueryParameters(int $pageId, string $queryString, ?string $alternativeBaseUrl, int $httpsOrHttp): UriInterface
    {
        $site = GeneralUtility::makeInstance(SiteMatcher::class)->matchByPageId($pageId);
        if ($site instanceof Site) {
            $queryString = ltrim($queryString, '?&');
            $queryParts = [];
            parse_str($queryString, $queryParts);
            unset($queryParts['id']);
            // workaround as long as we don't have native language support in crawler configurations
            // NOTE(review): the language object that used to be looked up here was never used;
            // resolving "_language" is presumably done by the router - confirm.
            if (isset($queryParts['L'])) {
                $queryParts['_language'] = $queryParts['L'];
                unset($queryParts['L']);
            }
            $url = $site->getRouter()->generateUri($pageId, $queryParts);
            if (!empty($alternativeBaseUrl)) {
                $alternativeBaseUri = new Uri($alternativeBaseUrl);
                $url = $url
                    ->withHost($alternativeBaseUri->getHost())
                    ->withScheme($alternativeBaseUri->getScheme())
                    ->withPort($alternativeBaseUri->getPort());
            }
        } else {
            // Technically this is not possible with site handling, but kept for backwards-compatibility reasons
            // Once EXT:crawler is v10-only compatible, this should be removed completely
            $baseUrl = ($alternativeBaseUrl ?: GeneralUtility::getIndpEnv('TYPO3_SITE_URL'));
            $cacheHashCalculator = GeneralUtility::makeInstance(CacheHashCalculator::class);
            $queryString .= '&cHash=' . $cacheHashCalculator->generateForParameters($queryString);
            $url = new Uri(rtrim($baseUrl, '/') . '/index.php' . $queryString);
        }

        if ($httpsOrHttp === -1) {
            $url = $url->withScheme('http');
        } elseif ($httpsOrHttp === 1) {
            $url = $url->withScheme('https');
        }

        return $url;
    }
2107
}
2108