Passed
Push — typo3v9 ( 5cd459...4bbe3c )
by Tomas Norre
06:20
created

CrawlerController::getUrlsForPageId()   C

Complexity

Conditions 16
Paths 96

Size

Total Lines 93
Code Lines 51

Duplication

Lines 0
Ratio 0 %

Code Coverage

Tests 46
CRAP Score 16.0585

Importance

Changes 3
Bugs 0 Features 0
Metric Value
cc 16
eloc 51
c 3
b 0
f 0
nc 96
nop 1
dl 0
loc 93
ccs 46
cts 49
cp 0.9388
crap 16.0585
rs 5.5666

How to fix   Long Method    Complexity   

Long Method

Small methods make your code easier to understand, in particular if combined with a good name. Besides, if your method is small, finding a good name is usually much easier.

For example, if you find yourself adding comments to a method's body, this is usually a good sign to extract the commented part to a new method, and use the comment as a starting point when coming up with a good name for this new method.

Commonly applied refactorings include:

1
<?php
2
3
declare(strict_types=1);
4
5
namespace AOE\Crawler\Controller;
6
7
/***************************************************************
8
 *  Copyright notice
9
 *
10
 *  (c) 2019 AOE GmbH <[email protected]>
11
 *
12
 *  All rights reserved
13
 *
14
 *  This script is part of the TYPO3 project. The TYPO3 project is
15
 *  free software; you can redistribute it and/or modify
16
 *  it under the terms of the GNU General Public License as published by
17
 *  the Free Software Foundation; either version 3 of the License, or
18
 *  (at your option) any later version.
19
 *
20
 *  The GNU General Public License can be found at
21
 *  http://www.gnu.org/copyleft/gpl.html.
22
 *
23
 *  This script is distributed in the hope that it will be useful,
24
 *  but WITHOUT ANY WARRANTY; without even the implied warranty of
25
 *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
26
 *  GNU General Public License for more details.
27
 *
28
 *  This copyright notice MUST APPEAR in all copies of the script!
29
 ***************************************************************/
30
31
use AOE\Crawler\Configuration\ExtensionConfigurationProvider;
32
use AOE\Crawler\Domain\Repository\ConfigurationRepository;
33
use AOE\Crawler\Domain\Repository\ProcessRepository;
34
use AOE\Crawler\Domain\Repository\QueueRepository;
35
use AOE\Crawler\Event\EventDispatcher;
36
use AOE\Crawler\QueueExecutor;
37
use AOE\Crawler\Utility\SignalSlotUtility;
38
use Psr\Http\Message\UriInterface;
39
use Psr\Log\LoggerAwareInterface;
40
use Psr\Log\LoggerAwareTrait;
41
use TYPO3\CMS\Backend\Tree\View\PageTreeView;
42
use TYPO3\CMS\Backend\Utility\BackendUtility;
43
use TYPO3\CMS\Core\Authentication\BackendUserAuthentication;
44
use TYPO3\CMS\Core\Core\Bootstrap;
45
use TYPO3\CMS\Core\Core\Environment;
46
use TYPO3\CMS\Core\Database\Connection;
47
use TYPO3\CMS\Core\Database\ConnectionPool;
48
use TYPO3\CMS\Core\Database\Query\Restriction\DeletedRestriction;
49
use TYPO3\CMS\Core\Http\Uri;
50
use TYPO3\CMS\Core\Imaging\Icon;
51
use TYPO3\CMS\Core\Imaging\IconFactory;
52
use TYPO3\CMS\Core\Routing\SiteMatcher;
53
use TYPO3\CMS\Core\Site\Entity\Site;
54
use TYPO3\CMS\Core\Type\Bitmask\Permission;
55
use TYPO3\CMS\Core\TypoScript\Parser\TypoScriptParser;
56
use TYPO3\CMS\Core\Utility\DebugUtility;
57
use TYPO3\CMS\Core\Utility\GeneralUtility;
58
use TYPO3\CMS\Core\Utility\MathUtility;
59
use TYPO3\CMS\Extbase\Object\ObjectManager;
60
use TYPO3\CMS\Frontend\Page\CacheHashCalculator;
61
use TYPO3\CMS\Frontend\Page\PageRepository;
62
63
/**
64
 * Class CrawlerController
65
 *
66
 * @package AOE\Crawler\Controller
67
 */
68
class CrawlerController implements LoggerAwareInterface
69
{
70
    use LoggerAwareTrait;
71
72
    public const CLI_STATUS_NOTHING_PROCCESSED = 0;
73
    public const CLI_STATUS_REMAIN = 1; //queue not empty
74
    public const CLI_STATUS_PROCESSED = 2; //(some) queue items where processed
75
    public const CLI_STATUS_ABORTED = 4; //instance didn't finish
76
    public const CLI_STATUS_POLLABLE_PROCESSED = 8;
77
78
    /**
79
     * @var integer
80
     */
81
    public $setID = 0;
82
83
    /**
84
     * @var string
85
     */
86
    public $processID = '';
87
88
    /**
89
     * @var array
90
     */
91
    public $duplicateTrack = [];
92
93
    /**
94
     * @var array
95
     */
96
    public $downloadUrls = [];
97
98
    /**
99
     * @var array
100
     */
101
    public $incomingProcInstructions = [];
102
103
    /**
104
     * @var array
105
     */
106
    public $incomingConfigurationSelection = [];
107
108
    /**
109
     * @var bool
110
     */
111
    public $registerQueueEntriesInternallyOnly = false;
112
113
    /**
114
     * @var array
115
     */
116
    public $queueEntries = [];
117
118
    /**
119
     * @var array
120
     */
121
    public $urlList = [];
122
123
    /**
124
     * @var array
125
     */
126
    public $extensionSettings = [];
127
128
    /**
129
     * Mount Point
130
     *
131
     * @var bool
132
     * Todo: Check what this is used for and adjust the type hint or code, as bool doesn't match the current code.
133
     */
134
    public $MP = false;
135
136
    /**
137
     * @var string
138
     */
139
    protected $processFilename;
140
141
    /**
142
     * Holds the internal access mode can be 'gui','cli' or 'cli_im'
143
     *
144
     * @var string
145
     */
146
    protected $accessMode;
147
148
    /**
149
     * @var BackendUserAuthentication|null
150
     */
151
    private $backendUser;
152
153
    /**
154
     * @var integer
155
     */
156
    private $scheduledTime = 0;
157
158
    /**
159
     * @var integer
160
     */
161
    private $reqMinute = 0;
162
163
    /**
164
     * @var bool
165
     */
166
    private $submitCrawlUrls = false;
167
168
    /**
169
     * @var bool
170
     */
171
    private $downloadCrawlUrls = false;
172
173
    /**
174
     * @var QueueRepository
175
     */
176
    protected $queueRepository;
177
178
    /**
179
     * @var ProcessRepository
180
     */
181
    protected $processRepository;
182
183
    /**
184
     * @var ConfigurationRepository
185
     */
186
    protected $configurationRepository;
187
188
    /**
189
     * @var string
190
     */
191
    protected $tableName = 'tx_crawler_queue';
192
193
    /**
194
     * @var QueueExecutor
195
     */
196
    protected $queueExecutor;
197
198
    /**
199
     * @var int
200
     */
201
    protected $maximumUrlsToCompile = 10000;
202
203
    /**
204
     * @var IconFactory
205
     */
206
    protected $iconFactory;
207
208
    /**
209
     * Returns the current access mode; can be 'gui', 'cli' or 'cli_im'
210
     *
211
     * @return string
212
     */
213 1
    public function getAccessMode()
    {
        // Hand back whatever mode was last stored via setAccessMode().
        $currentMode = $this->accessMode;

        return $currentMode;
    }
217
218
    /**
219
     * @param string $accessMode
220
     */
221 1
    public function setAccessMode($accessMode): void
    {
        // Stored verbatim; no validation against the known modes happens here.
        $this->accessMode = $accessMode;
    }
225
226
    /**
227
     * Set disabled status to prevent processes from being processed
228
     *
229
     * @param bool $disabled (optional, defaults to true)
230
     * @return void
231
     */
232 2
    public function setDisabled($disabled = true): void
    {
        $lockFile = $this->processFilename;

        // Disabling: the mere existence of the (empty) file is the flag.
        if ($disabled) {
            GeneralUtility::writeFile($lockFile, '');

            return;
        }

        // Enabling: remove the flag file, but only if it is actually there.
        if (is_file($lockFile)) {
            unlink($lockFile);
        }
    }
242
243
    /**
244
     * Get disable status
245
     *
246
     * @return bool true if disabled
247
     */
248 2
    public function getDisabled()
    {
        // The crawler counts as disabled while the flag file exists on disk.
        $flagFileExists = is_file($this->processFilename);

        return $flagFileExists;
    }
252
253
    /**
254
     * @param string $filenameWithPath
255
     *
256
     * @return void
257
     */
258 3
    public function setProcessFilename($filenameWithPath): void
    {
        // Path of the flag file inspected by getDisabled() / setDisabled().
        $this->processFilename = $filenameWithPath;
    }
262
263
    /**
264
     * @return string
265
     */
266 1
    public function getProcessFilename()
    {
        // Path of the flag file used to mark the crawler as disabled.
        $flagFilePath = $this->processFilename;

        return $flagFilePath;
    }
270
271
    /************************************
272
     *
273
     * Getting URLs based on Page TSconfig
274
     *
275
     ************************************/
276
277 40
    /**
     * Resolves the repositories, queue executor and icon factory, sets the
     * default flag-file path and loads the extension settings, normalising
     * the numeric limits (countInARun, processLimit, maxCompileUrls).
     */
    public function __construct()
    {
        $objectManager = GeneralUtility::makeInstance(ObjectManager::class);
        $this->queueRepository = $objectManager->get(QueueRepository::class);
        $this->processRepository = $objectManager->get(ProcessRepository::class);
        $this->configurationRepository = $objectManager->get(ConfigurationRepository::class);
        $this->queueExecutor = $objectManager->get(QueueExecutor::class);
        $this->iconFactory = GeneralUtility::makeInstance(IconFactory::class);

        $this->processFilename = Environment::getVarPath() . '/lock/tx_crawler.proc';

        /** @var ExtensionConfigurationProvider $configurationProvider */
        $configurationProvider = GeneralUtility::makeInstance(ExtensionConfigurationProvider::class);
        $settings = $configurationProvider->getExtensionConfiguration();
        $this->extensionSettings = is_array($settings) ? $settings : [];

        // Set defaults. The `?? 0` fallbacks avoid undefined-index notices when
        // the extension configuration is incomplete; 0 makes each helper fall
        // back to the same default the missing value produced before.
        if (MathUtility::convertToPositiveInteger($this->extensionSettings['countInARun'] ?? 0) === 0) {
            $this->extensionSettings['countInARun'] = 100;
        }

        $this->extensionSettings['processLimit'] = MathUtility::forceIntegerInRange(
            $this->extensionSettings['processLimit'] ?? 0,
            1,
            99,
            1
        );
        $this->maximumUrlsToCompile = MathUtility::forceIntegerInRange(
            $this->extensionSettings['maxCompileUrls'] ?? 0,
            1,
            1000000000,
            10000
        );
    }
301
302
    /**
303
     * @return BackendUserAuthentication
304
     */
305 1
    private function getBackendUser()
    {
        // Make sure the _cli_ user is loaded
        Bootstrap::initializeBackendAuthentication();

        // Keep an already cached user; otherwise take the global one.
        $this->backendUser = $this->backendUser ?? $GLOBALS['BE_USER'];

        return $this->backendUser;
    }
314
315
    /**
316
     * Sets the extension settings (unserialized counterpart of $TYPO3_CONF_VARS['EXT']['extConf']['crawler']).
317
     *
318
     * @param array $extensionSettings
319
     * @return void
320
     */
321 12
    public function setExtensionSettings(array $extensionSettings): void
    {
        // Replaces the settings wholesale; the defaults applied in the
        // constructor are NOT re-applied to the new array.
        $this->extensionSettings = $extensionSettings;
    }
325
326
    /**
327
     * Check if the given page should be crawled
328
     *
329
     * @param array $pageRow
330
     * @return false|string false if the page should be crawled (not excluded), true / skipMessage if it should be skipped
331
     */
332 8
    public function checkIfPageShouldBeSkipped(array $pageRow)
    {
        // Hidden pages are only crawled when explicitly enabled in the settings.
        // empty() keeps the truthiness semantics while tolerating missing keys.
        if (empty($this->extensionSettings['crawlHiddenPages']) && !empty($pageRow['hidden'])) {
            return 'Because page is hidden';
        }

        // Doktypes 3 and 4 as well as everything from 199 upwards are never crawled.
        if (GeneralUtility::inList('3,4', $pageRow['doktype']) || $pageRow['doktype'] >= 199) {
            return 'Because doktype is not allowed';
        }

        // Doktype exclusion lists registered by other extensions.
        foreach ($GLOBALS['TYPO3_CONF_VARS']['EXTCONF']['crawler']['excludeDoktype'] ?? [] as $key => $doktypeList) {
            if (GeneralUtility::inList($doktypeList, $pageRow['doktype'])) {
                return 'Doktype was excluded by "' . $key . '"';
            }
        }

        // Veto hooks: each expects to return false if the page is ok, and true
        // or a skip message if the page should _not_ be crawled. The first veto
        // wins; later hooks are not executed.
        foreach ($GLOBALS['TYPO3_CONF_VARS']['EXTCONF']['crawler']['pageVeto'] ?? [] as $key => $func) {
            $params = [
                'pageRow' => $pageRow,
            ];
            $veto = GeneralUtility::callUserFunction($func, $params, $this);
            if ($veto === false) {
                continue;
            }

            if (is_string($veto)) {
                return $veto;
            }

            // Hooks may be registered under numeric keys; under strict_types
            // htmlspecialchars() rejects an int, hence the explicit cast.
            return 'Veto from hook "' . htmlspecialchars((string)$key) . '"';
        }

        return false;
    }
385
386
    /**
387
     * Wrapper method for getUrlsForPageId()
388
     * It returns an array of configurations and no urls!
389
     *
390
     * @param array $pageRow Page record with at least dok-type and uid columns.
391
     * @param string $skipMessage
392
     * @return array
393
     * @see getUrlsForPageId()
394
     */
395 4
    public function getUrlsForPageRow(array $pageRow, &$skipMessage = '')
    {
        $message = $this->checkIfPageShouldBeSkipped($pageRow);

        // Page is skipped: report the reason through the by-reference
        // argument and return no configurations.
        if ($message !== false) {
            $skipMessage = $message;

            return [];
        }

        // getUrlsForPageId() declares an int parameter and this file runs with
        // strict_types, so a uid delivered as string by the database layer
        // must be cast explicitly to avoid a TypeError.
        $res = $this->getUrlsForPageId((int)$pageRow['uid']);
        $skipMessage = '';

        return $res;
    }
409
410
    /**
411
     * This method is used to count if there are ANY unprocessed queue entries
412
     * of a given page_id and the configuration which matches a given hash.
413
     * If there are none, we can skip an inner detail check
414
     *
415
     * @param int $uid
416
     * @param string $configurationHash
417
     * @return boolean
418
     */
419 5
    protected function noUnprocessedQueueEntriesForPageWithConfigurationHashExist($uid, $configurationHash)
    {
        $table = $this->tableName;
        $query = GeneralUtility::makeInstance(ConnectionPool::class)->getQueryBuilderForTable($table);

        // Count queue rows of this page + configuration that have not been
        // executed yet (exec_time = 0).
        $pendingEntries = $query
            ->count('*')
            ->from($table)
            ->where(
                $query->expr()->eq('page_id', (int)$uid),
                $query->expr()->eq('configuration_hash', $query->createNamedParameter($configurationHash)),
                $query->expr()->eq('exec_time', 0)
            )
            ->execute()
            ->fetchColumn();

        // Any non-zero count means unprocessed entries exist.
        return !$pendingEntries;
    }
441
442
    /**
443
     * Creates a list of URLs from input array (and submits them to queue if asked for)
444
     * See Web > Info module script + "indexed_search"'s crawler hook-client using this!
445
     *
446
     * @param array $vv Information about URLs from pageRow to crawl.
447
     * @param array $pageRow Page row
448
     * @param int $scheduledTime Unix time to schedule indexing to, typically time()
449
     * @param int $reqMinute Number of requests per minute (creates the interleave between requests)
450
     * @param bool $submitCrawlUrls If set, submits the URLs to queue
451
     * @param bool $downloadCrawlUrls If set (and $submitCrawlUrls is false), will fill $downloadUrls with entries
452
     * @param array $duplicateTrack Array which is passed by reference and contains a key per URL to ensure we will not crawl duplicates
453
     * @param array $downloadUrls Array which will be filled with URLS for download if flag is set.
454
     * @param array $incomingProcInstructions Array of processing instructions
455
     * @return string List of URLs (meant for display in backend module)
456
     */
457 2
    public function urlListFromUrlArray(
        array $vv,
        array $pageRow,
        $scheduledTime,
        $reqMinute,
        $submitCrawlUrls,
        $downloadCrawlUrls,
        array &$duplicateTrack,
        array &$downloadUrls,
        array $incomingProcInstructions
    ) {
        if (!is_array($vv['URLs'] ?? null)) {
            return 'ERROR - no URL generated';
        }
        $urlLog = [];
        $pageId = (int)$pageRow['uid'];
        $configurationHash = $this->getConfigurationHash($vv);
        $skipInnerCheck = $this->noUnprocessedQueueEntriesForPageWithConfigurationHashExist($pageId, $configurationHash);

        // Not every configuration source sets these keys; fall back to the
        // empty string that a missing value concatenated to before.
        $procInstrFilter = $vv['subCfg']['procInstrFilter'] ?? '';
        $userGroups = $vv['subCfg']['userGroups'] ?? '';

        // The processing-instruction filter does not depend on the individual
        // URL, so it is evaluated once. A failing filter yields an empty list,
        // exactly as skipping every URL in the loop did.
        if (!$this->drawURLs_PIfilter($procInstrFilter, $incomingProcInstructions)) {
            return '';
        }

        // Interleave between two requests in seconds. A rate of 0 would
        // otherwise divide by zero; in that case no interleave is applied.
        $requestsPerMinute = (float)$reqMinute;
        $secondsPerRequest = $requestsPerMinute !== 0.0 ? 60 / $requestsPerMinute : 0.0;

        foreach ($vv['URLs'] as $urlQuery) {
            $url = (string)$this->getUrlFromPageAndQueryParameters(
                $pageId,
                $urlQuery,
                $vv['subCfg']['baseUrl'] ?? null,
                $vv['subCfg']['force_ssl'] ?? 0
            );

            // Create key by which to determine unique-ness:
            $uKey = $url . '|' . $userGroups . '|' . $procInstrFilter;

            if (isset($duplicateTrack[$uKey])) {
                // If the URL key is already registered, just display it and do not resubmit it
                $urlLog[] = '<em><span class="text-muted">' . htmlspecialchars($url) . '</span></em>';
            } else {
                // Scheduled time, spread by the number of URLs seen so far and
                // rounded down to the full minute:
                $schTime = $scheduledTime + round(count($duplicateTrack) * $secondsPerRequest);
                $schTime = intval($schTime / 60) * 60;
                $formattedDate = BackendUtility::datetime($schTime);
                $this->urlList[] = '[' . $formattedDate . '] ' . $url;
                $urlList = '[' . $formattedDate . '] ' . htmlspecialchars($url);

                // Submit for crawling!
                if ($submitCrawlUrls) {
                    $added = $this->addUrl(
                        $pageId,
                        $url,
                        $vv['subCfg'],
                        $scheduledTime,
                        $configurationHash,
                        $skipInnerCheck
                    );
                    if ($added === false) {
                        $urlList .= ' (URL already existed)';
                    }
                } elseif ($downloadCrawlUrls) {
                    $downloadUrls[$url] = $url;
                }
                $urlLog[] = $urlList;
            }
            $duplicateTrack[$uKey] = true;
        }

        return implode('<br>', $urlLog);
    }
524
525
    /**
526
     * Returns true if input processing instruction is among registered ones.
527
     *
528
     * @param string $piString PI to test
529
     * @param array $incomingProcInstructions Processing instructions
530
     * @return boolean
531
     */
532 5
    public function drawURLs_PIfilter($piString, array $incomingProcInstructions)
    {
        // Without any incoming instructions nothing is filtered out.
        if ($incomingProcInstructions === []) {
            return true;
        }

        // Otherwise at least one incoming instruction must be part of the
        // comma-separated $piString list.
        $matched = false;
        foreach ($incomingProcInstructions as $instruction) {
            if (GeneralUtility::inList($piString, $instruction)) {
                $matched = true;
                break;
            }
        }

        return $matched;
    }
545
546 3
    /**
     * Returns the page TSconfig for the given page, or for the page taken from
     * the mount point value when $this->MP is set, after passing it through the
     * registered 'getPageTSconfigForId' hooks.
     *
     * @param int $id Page uid
     * @return array Page TSconfig, possibly altered by hooks (they receive it by reference)
     */
    public function getPageTSconfigForId($id): array
    {
        if (!$this->MP) {
            $pageTSconfig = BackendUtility::getPagesTSconfig($id);
        } else {
            // NOTE(review): $this->MP is documented as bool but is split like a
            // string here; presumably it carries a "<pid>-<pid>" mount point
            // value at runtime - confirm. The casts keep this branch from
            // raising a TypeError under strict_types; a value without "-"
            // resolves to page 0.
            $mountPointParts = explode('-', (string)$this->MP);
            $mountPointId = (int)($mountPointParts[1] ?? 0);
            $pageTSconfig = BackendUtility::getPagesTSconfig($mountPointId);
        }

        // Call a hook to alter configuration. `?? null` mirrors the other hook
        // lookups in this class and avoids a notice when none is registered.
        $hooks = $GLOBALS['TYPO3_CONF_VARS']['EXTCONF']['crawler']['getPageTSconfigForId'] ?? null;
        if (is_array($hooks)) {
            $params = [
                'pageId' => $id,
                'pageTSConfig' => &$pageTSconfig,
            ];
            foreach ($hooks as $userFunc) {
                GeneralUtility::callUserFunction($userFunc, $params, $this);
            }
        }

        return $pageTSconfig;
    }
568
569
    /**
570
     * This method returns an array of configurations.
571
     * Adds no urls!
572
     */
573 2
    public function getUrlsForPageId(int $pageId): array
    {
        // Get page TSconfig for page ID
        $pageTSconfig = $this->getPageTSconfigForId($pageId);

        $res = [];

        // --- 1) Configurations from page TSconfig (tx_crawler.crawlerCfg.paramSets) ---
        $crawlerCfg = $pageTSconfig['tx_crawler.']['crawlerCfg.']['paramSets.'] ?? [];
        foreach ($crawlerCfg as $key => $values) {
            // Only the "<name>." sub-arrays describe a configuration.
            if (!is_array($values)) {
                continue;
            }
            $key = str_replace('.', '', (string)$key);

            // Sub configuration for a single configuration string:
            $subCfg = (array)($crawlerCfg[$key . '.'] ?? []);
            $subCfg['key'] = $key;

            // A paramSet without procInstrFilter must not fail: under
            // strict_types strcmp(null, '') raises a TypeError, so the value
            // is normalised to a string before it is inspected.
            $procInstrFilter = (string)($subCfg['procInstrFilter'] ?? '');
            if ($procInstrFilter !== '') {
                $subCfg['procInstrFilter'] = implode(',', GeneralUtility::trimExplode(',', $procInstrFilter));
            }

            // TODO: Check if $pidOnlyList can be kept as Array instead of imploded
            $pidsOnly = (string)($subCfg['pidsOnly'] ?? '');
            $pidOnlyList = implode(',', GeneralUtility::trimExplode(',', $pidsOnly, true));

            // Process configuration only if it is not page-specific or if the
            // specific page is the current page:
            if ($pidsOnly !== '' && !GeneralUtility::inList($pidOnlyList, (string)$pageId)) {
                continue;
            }

            // Explode, process etc.:
            $res[$key] = [];
            $res[$key]['subCfg'] = $subCfg;
            $res[$key]['paramParsed'] = GeneralUtility::explodeUrl2Array($crawlerCfg[$key] ?? '');
            $res[$key]['paramExpanded'] = $this->expandParameters($res[$key]['paramParsed'], $pageId);
            $res[$key]['origin'] = 'pagets';

            // Recognize MP value
            $baseQuery = '?id=' . $pageId;
            if ($this->MP) {
                $baseQuery .= '&MP=' . $this->MP;
            }
            $res[$key]['URLs'] = $this->compileUrls($res[$key]['paramExpanded'], [$baseQuery]);
        }

        // --- 2) Configurations from tx_crawler_configuration records up the rootline ---
        $crawlerConfigurations = $this->configurationRepository->getCrawlerConfigurationRecordsFromRootLine($pageId);
        foreach ($crawlerConfigurations as $configurationRecord) {
            // Check access to the configuration record
            $hasAccess = empty($configurationRecord['begroups'])
                || $this->getBackendUser()->isAdmin()
                || $this->hasGroupAccess(
                    $this->getBackendUser()->user['usergroup_cached_list'],
                    $configurationRecord['begroups']
                );
            if (!$hasAccess) {
                continue;
            }

            // TODO: Check if $pidOnlyList can be kept as Array instead of imploded
            $pidsOnly = (string)($configurationRecord['pidsonly'] ?? '');
            $pidOnlyList = implode(',', GeneralUtility::trimExplode(',', $pidsOnly, true));

            // Process configuration only if it is not page-specific or if the
            // specific page is the current page:
            if ($pidsOnly !== '' && !GeneralUtility::inList($pidOnlyList, (string)$pageId)) {
                continue;
            }

            $key = $configurationRecord['name'];

            // Don't overwrite previously defined paramSets
            if (isset($res[$key])) {
                continue;
            }

            /* @var $TSparserObject \TYPO3\CMS\Core\TypoScript\Parser\TypoScriptParser */
            $TSparserObject = GeneralUtility::makeInstance(TypoScriptParser::class);
            $TSparserObject->parse($configurationRecord['processing_instruction_parameters_ts']);

            $subCfg = [
                'procInstrFilter' => $configurationRecord['processing_instruction_filter'],
                'procInstrParams.' => $TSparserObject->setup,
                'baseUrl' => $configurationRecord['base_url'],
                'force_ssl' => (int)$configurationRecord['force_ssl'],
                'userGroups' => $configurationRecord['fegroups'],
                'exclude' => $configurationRecord['exclude'],
                'key' => $key,
            ];

            // NOTE(review): loose in_array() kept on purpose - the element type
            // returned by expandExcludeString() is not visible from here.
            if (in_array($pageId, $this->expandExcludeString($subCfg['exclude']))) {
                continue;
            }

            $res[$key] = [];
            $res[$key]['subCfg'] = $subCfg;
            $res[$key]['paramParsed'] = GeneralUtility::explodeUrl2Array($configurationRecord['configuration']);
            $res[$key]['paramExpanded'] = $this->expandParameters($res[$key]['paramParsed'], $pageId);
            $res[$key]['URLs'] = $this->compileUrls($res[$key]['paramExpanded'], ['?id=' . $pageId]);
            $res[$key]['origin'] = 'tx_crawler_configuration_' . $configurationRecord['uid'];
        }

        // --- 3) Let registered hooks post-process the result (passed by reference) ---
        foreach ($GLOBALS['TYPO3_CONF_VARS']['EXTCONF']['crawler']['processUrls'] ?? [] as $func) {
            $params = [
                'res' => &$res,
            ];
            GeneralUtility::callUserFunction($func, $params, $this);
        }

        return $res;
    }
667
668
    /**
669
     * Find all configurations of subpages of a page
670
     * TODO: Write Functional Tests
671
     */
672 1
    public function getConfigurationsForBranch(int $rootid, int $depth): array
    {
        $names = [];

        // 1) paramSets defined in the page TSconfig of the branch root.
        $tsConfig = $this->getPageTSconfigForId($rootid);
        $paramSets = $tsConfig['tx_crawler.']['crawlerCfg.']['paramSets.'] ?? [];
        foreach ($paramSets as $setKey => $setValue) {
            if (is_array($setValue)) {
                // Strip the trailing dot of the TypoScript sub-array key.
                if (substr($setKey, -1) == '.') {
                    $names[] = substr($setKey, 0, -1);
                } else {
                    $names[] = $setKey;
                }
            }
        }

        // 2) Collect the page uids of the rootline ...
        $pageIds = [];
        foreach (BackendUtility::BEgetRootLine($rootid) as $rootLinePage) {
            $pageIds[] = $rootLinePage['uid'];
        }

        // ... and of the sub tree the backend user is allowed to see.
        /* @var PageTreeView $pageTree */
        $pageTree = GeneralUtility::makeInstance(PageTreeView::class);
        $permissionClause = $this->getBackendUser()->getPagePermsClause(Permission::PAGE_SHOW);
        $pageTree->init(empty($permissionClause) ? '' : ('AND ' . $permissionClause));
        $pageTree->getTree($rootid, $depth, '');
        foreach ($pageTree->tree as $treeNode) {
            $pageIds[] = $treeNode['row']['uid'];
        }

        // 3) Names of all configuration records stored on any of those pages.
        $query = $this->getQueryBuilder('tx_crawler_configuration');
        $pidConstraint = $query->expr()->in(
            'pid',
            $query->createNamedParameter($pageIds, Connection::PARAM_INT_ARRAY)
        );
        $statement = $query
            ->select('name')
            ->from('tx_crawler_configuration')
            ->where($pidConstraint)
            ->execute();

        while ($record = $statement->fetch()) {
            $names[] = $record['name'];
        }

        return $names;
    }
711
712
    /**
713
     * Get querybuilder for given table
714
     *
715
     * @param string $table
716
     * @return \TYPO3\CMS\Core\Database\Query\QueryBuilder
717
     */
718 18
    private function getQueryBuilder(string $table)
    {
        // A query builder is requested from the connection pool on each call.
        $connectionPool = GeneralUtility::makeInstance(ConnectionPool::class);

        return $connectionPool->getQueryBuilderForTable($table);
    }
722
723
    /**
724
     * Check if a user has access to an item
725
     * (e.g. get the group list of the current logged in user from $GLOBALS['TSFE']->gr_list)
726
     *
727
     * @param string $groupList Comma-separated list of (fe_)group UIDs from a user
728
     * @param string $accessList Comma-separated list of (fe_)group UIDs of the item to access
729
     * @return bool                 TRUE if at least one of the users group UIDs is in the access list or the access list is empty
730
     * @see \TYPO3\CMS\Frontend\Page\PageRepository::getMultipleGroupsWhereClause()
731
     */
732 3
    public function hasGroupAccess($groupList, $accessList)
    {
        // An empty access list means the item is not restricted at all.
        if (empty($accessList)) {
            return true;
        }

        // Access is granted as soon as one of the user's groups is listed.
        $granted = false;
        foreach (GeneralUtility::intExplode(',', $groupList) as $userGroupUid) {
            if (GeneralUtility::inList($accessList, $userGroupUid)) {
                $granted = true;
                break;
            }
        }

        return $granted;
    }
744
745
    /**
     * Expands the parameter configuration into the individual values of each parameter.
     *
     * Syntax of a parameter value:
     * - If the value is wrapped in [...] it is expanded according to the rules below, otherwise it is taken literally
     * - The content of the brackets is split by "|"; the parts are processed individually and their results are added together
     * - For each part:
     *         - "[int]-[int]" = Integer range, expanded to all values in between, both ends included, from low to high (max. 1000 values). Example "1-34" or "-40--30"
     *         - "_TABLE:[TCA table name];[_PID:[optional page id, default is current page]];[_ENABLELANG:1]" = Look up of table records from PID, filtering out deleted records. Example "_TABLE:tt_content; _PID:123"
     *           Further optional sub-parameters read here: _RECURSIVE (depth handed to QueryGenerator::getTreeList()),
     *           _PIDFIELD (field compared against the page ids, default "pid"), _FIELD (field whose values are collected, default "uid"),
     *           _WHERE (additional constraint, handed to the query as given) and _ADDTABLE (additional "from" part)
     *           _ENABLELANG:1 picks only original records without their language overlays
     *         - Default: Literal value
     *
     * After each part the hooks registered in
     * $GLOBALS['TYPO3_CONF_VARS']['SC_OPTIONS']['crawler/class.tx_crawler_lib.php']['expandParameters'] are called.
     *
     * @param array $paramArray Array with key (GET var name) and values (value of GET var which is configuration for expansion)
     * @param integer $pid Current page ID
     * @return array Same keys as the input, each holding the array of expanded values
     *
     * TODO: Write Functional Tests
     */
    public function expandParameters($paramArray, $pid)
    {
        // Traverse parameter names:
        foreach ($paramArray as $p => $v) {
            $v = trim($v);

            // Only values encapsulated in square brackets are expanded, everything else is a literal value.
            if (strpos($v, '[') !== 0 || substr($v, -1) !== ']') {
                // Set the literal value as only value in array:
                $paramArray[$p] = [$v];
                continue;
            }

            // Strip the brackets and reset the paramArray value as an array.
            $v = substr($v, 1, -1);
            $paramArray[$p] = [];

            // Explode parts and traverse them:
            $parts = explode('|', $v);
            foreach ($parts as $pV) {
                // Look for integer range: (fx. 1-34 or -40--30 // reads minus 40 to minus 30)
                if (preg_match('/^(-?[0-9]+)\s*-\s*(-?[0-9]+)$/', trim($pV), $reg)) {
                    $reg = $this->swapIfFirstIsLargerThanSecond($reg);

                    // Traverse range, add values:
                    $runAwayBrake = 1000; // Limit to size of range!
                    for ($a = $reg[1]; $a <= $reg[2]; $a++) {
                        $paramArray[$p][] = $a;
                        $runAwayBrake--;
                        if ($runAwayBrake <= 0) {
                            break;
                        }
                    }
                } elseif (strpos(trim($pV), '_TABLE:') === 0) {
                    // Parse parameters:
                    $subparts = GeneralUtility::trimExplode(';', $pV);
                    $subpartParams = [];
                    foreach ($subparts as $spV) {
                        // Split at the first colon only, so that a value containing ":" itself (e.g. in _WHERE)
                        // is kept complete; a sub-parameter without any value is stored as null.
                        $keyAndValue = GeneralUtility::trimExplode(':', $spV, false, 2);
                        $subpartParams[$keyAndValue[0]] = $keyAndValue[1] ?? null;
                    }

                    $tableName = $subpartParams['_TABLE'] ?? '';

                    // Table exists:
                    if (isset($GLOBALS['TCA'][$tableName])) {
                        $lookUpPid = isset($subpartParams['_PID']) ? intval($subpartParams['_PID']) : intval($pid);
                        $recursiveDepth = isset($subpartParams['_RECURSIVE']) ? intval($subpartParams['_RECURSIVE']) : 0;
                        $pidField = isset($subpartParams['_PIDFIELD']) ? trim($subpartParams['_PIDFIELD']) : 'pid';
                        $where = $subpartParams['_WHERE'] ?? '';
                        $addTable = $subpartParams['_ADDTABLE'] ?? '';

                        // _FIELD is optional; a missing or empty value falls back to the uid.
                        $fieldName = ($subpartParams['_FIELD'] ?? '') ?: 'uid';
                        if ($fieldName === 'uid' || !empty($GLOBALS['TCA'][$tableName]['columns'][$fieldName])) {
                            $queryBuilder = $this->getQueryBuilder($tableName);

                            if ($recursiveDepth > 0) {
                                /** @var \TYPO3\CMS\Core\Database\QueryGenerator $queryGenerator */
                                $queryGenerator = GeneralUtility::makeInstance(\TYPO3\CMS\Core\Database\QueryGenerator::class);
                                $pidList = $queryGenerator->getTreeList($lookUpPid, $recursiveDepth, 0, 1);
                                $pidArray = GeneralUtility::intExplode(',', $pidList);
                            } else {
                                $pidArray = [(string)$lookUpPid];
                            }

                            // Only deleted records are filtered out, all other default restrictions are dropped.
                            $queryBuilder->getRestrictions()
                                ->removeAll()
                                ->add(GeneralUtility::makeInstance(DeletedRestriction::class));

                            $queryBuilder
                                ->select($fieldName)
                                ->from($tableName)
                                ->where(
                                    $queryBuilder->expr()->in($pidField, $queryBuilder->createNamedParameter($pidArray, Connection::PARAM_INT_ARRAY)),
                                    $where
                                );

                            if (!empty($addTable)) {
                                // TODO: Check if this works as intended!
                                $queryBuilder->add('from', $addTable);
                            }

                            // Not every table is translatable, so the ctrl setting may be missing.
                            $transOrigPointerField = $GLOBALS['TCA'][$tableName]['ctrl']['transOrigPointerField'] ?? null;

                            if (!empty($subpartParams['_ENABLELANG']) && $transOrigPointerField) {
                                $queryBuilder->andWhere(
                                    $queryBuilder->expr()->lte(
                                        $transOrigPointerField,
                                        0
                                    )
                                );
                            }

                            $statement = $queryBuilder->execute();

                            // Index by the field value so that every value is collected only once.
                            $rows = [];
                            while ($row = $statement->fetch()) {
                                $rows[$row[$fieldName]] = $row;
                            }

                            $paramArray[$p] = array_merge($paramArray[$p], array_keys($rows));
                        }
                    }
                } else { // Just add value:
                    $paramArray[$p][] = $pV;
                }

                // Hook for processing own expandParameters place holder
                $hooks = $GLOBALS['TYPO3_CONF_VARS']['SC_OPTIONS']['crawler/class.tx_crawler_lib.php']['expandParameters'] ?? null;
                if (is_array($hooks)) {
                    $_params = [
                        'pObj' => &$this,
                        'paramArray' => &$paramArray,
                        'currentKey' => $p,
                        'currentValue' => $pV,
                        'pid' => $pid,
                    ];
                    foreach ($hooks as $_funcRef) {
                        GeneralUtility::callUserFunction($_funcRef, $_params, $this);
                    }
                }
            }

            // Make unique set of values and sort the parameters by their name:
            $paramArray[$p] = array_unique($paramArray[$p]);
            ksort($paramArray);
        }

        return $paramArray;
    }
890
891
    /**
     * Compiling URLs from parameter array (output of expandParameters())
     * The number of URLs will be the multiplication of the number of parameter values for each key,
     * limited to $this->maximumUrlsToCompile per recursion level.
     *
     * @param array $paramArray Output of expandParameters(): Array with keys (GET var names) and for each an array of values
     * @param array $urls URLs accumulated in this array (for recursion)
     * @return array
     */
    public function compileUrls($paramArray, array $urls)
    {
        if (empty($paramArray)) {
            return $urls;
        }

        // shift first off stack:
        reset($paramArray);
        // Numeric parameter names are turned into integer array keys by PHP, rawurlencode() needs a string.
        $varName = (string)key($paramArray);
        $valueSet = array_shift($paramArray);

        // Traverse value set:
        $newUrls = [];
        foreach ($urls as $url) {
            foreach ($valueSet as $val) {
                // Leave BOTH loops once the limit is reached; leaving only the inner loop would let every
                // remaining URL add one more entry and the limit would never be enforced.
                if (count($newUrls) >= $this->maximumUrlsToCompile) {
                    break 2;
                }

                // Empty values do not add a GET parameter, the URL is taken over unchanged.
                $newUrls[] = $url . (strcmp((string)$val, '') ? '&' . rawurlencode($varName) . '=' . rawurlencode((string)$val) : '');
            }
        }

        return $this->compileUrls($paramArray, $newUrls);
    }
922
923
    /************************************
924
     *
925
     * Crawler log
926
     *
927
     ************************************/
928
929
    /**
     * Returns the crawler queue records of a page - or removes them when a flush is requested.
     *
     * @param integer $id Page ID for which to look up log entries.
     * @param string $filter Filter: "all" => all entries, "pending" => all that is not yet run, "finished" => all complete ones
     * @param boolean $doFlush If TRUE, the selected entries are DELETED(!) instead of returned
     * @param boolean $doFullFlush If TRUE (together with $doFlush), the whole queue is flushed, not only the selected entries
     * @param integer $itemsPerPage Limit the amount of entries per page default is 10
     * @return array The matching queue records; always an empty array when $doFlush is set
     */
    public function getLogEntriesForPageId($id, $filter = '', $doFlush = false, $doFullFlush = false, $itemsPerPage = 10)
    {
        // Query used to read the entries.
        $queryBuilder = GeneralUtility::makeInstance(ConnectionPool::class)->getQueryBuilderForTable($this->tableName);
        $queryBuilder
            ->select('*')
            ->from($this->tableName)
            ->where(
                $queryBuilder->expr()->eq('page_id', $queryBuilder->createNamedParameter($id, \PDO::PARAM_INT))
            )
            ->orderBy('scheduled', 'DESC');

        // The same constraints are collected a second time as an AND-combined expression,
        // which is what flushQueue() receives as its where clause.
        $expressionBuilder = GeneralUtility::makeInstance(ConnectionPool::class)
            ->getConnectionForTable($this->tableName)
            ->getExpressionBuilder();
        $query = $expressionBuilder->andX();

        switch ($filter) {
            case 'pending':
                $queryBuilder->andWhere($queryBuilder->expr()->eq('exec_time', 0));
                $query->add($expressionBuilder->eq('exec_time', 0));
                break;
            case 'finished':
                $queryBuilder->andWhere($queryBuilder->expr()->gt('exec_time', 0));
                $query->add($expressionBuilder->gt('exec_time', 0));
                break;
        }

        // FIXME: Write unit test that ensures that the right records are deleted.
        if ($doFlush) {
            $flushConstraints = $query->add($expressionBuilder->eq('page_id', (int)$id));
            $this->flushQueue($doFullFlush ? '1=1' : $flushConstraints);

            return [];
        }

        if ($itemsPerPage > 0) {
            $queryBuilder->setMaxResults((int)$itemsPerPage);
        }

        return $queryBuilder->execute()->fetchAll();
    }
983
984
    /**
     * Returns the crawler queue records of a set - or removes them when a flush is requested.
     *
     * @param int $set_id Set ID for which to look up log entries.
     * @param string $filter Filter: "all" => all entries, "pending" => all that is not yet run, "finished" => all complete ones
     * @param bool $doFlush If TRUE, the selected entries are DELETED(!) instead of returned
     * @param bool $doFullFlush If TRUE (together with $doFlush), the whole queue is flushed, not only the selected entries
     * @param int $itemsPerPage Limit the amount of entries per page default is 10
     * @return array The matching queue records; always an empty array when $doFlush is set
     */
    public function getLogEntriesForSetId(int $set_id, string $filter = '', bool $doFlush = false, bool $doFullFlush = false, int $itemsPerPage = 10)
    {
        // Query used to read the entries.
        $queryBuilder = GeneralUtility::makeInstance(ConnectionPool::class)->getQueryBuilderForTable($this->tableName);
        $queryBuilder
            ->select('*')
            ->from($this->tableName)
            ->where(
                $queryBuilder->expr()->eq('set_id', $queryBuilder->createNamedParameter($set_id, \PDO::PARAM_INT))
            )
            ->orderBy('scheduled', 'DESC');

        // The same constraints are collected a second time as an AND-combined expression,
        // which is what flushQueue() receives as its where clause.
        $expressionBuilder = GeneralUtility::makeInstance(ConnectionPool::class)
            ->getConnectionForTable($this->tableName)
            ->getExpressionBuilder();
        $query = $expressionBuilder->andX();

        // FIXME: Write Unit tests for Filters
        if ($filter === 'pending') {
            $queryBuilder->andWhere($queryBuilder->expr()->eq('exec_time', 0));
            $query->add($expressionBuilder->eq('exec_time', 0));
        } elseif ($filter === 'finished') {
            $queryBuilder->andWhere($queryBuilder->expr()->gt('exec_time', 0));
            $query->add($expressionBuilder->gt('exec_time', 0));
        }

        // FIXME: Write unit test that ensures that the right records are deleted.
        if ($doFlush) {
            $flushConstraints = $query->add($expressionBuilder->eq('set_id', (int)$set_id));
            // An empty where clause makes flushQueue() remove every entry.
            $this->flushQueue($doFullFlush ? '' : $flushConstraints);

            return [];
        }

        if ($itemsPerPage > 0) {
            $queryBuilder->setMaxResults((int)$itemsPerPage);
        }

        return $queryBuilder->execute()->fetchAll();
    }
1037
1038
    /**
     * Removes queue entries
     *
     * If an observer is registered for the "queueEntryFlush" event, the event is posted once per affected
     * set_id - together with the entries of that set matching the filter - before the entries are deleted.
     *
     * @param string $where SQL related filter for the entries which should be removed; an empty value removes all entries
     * @return void
     */
    protected function flushQueue($where = ''): void
    {
        // An empty filter means "flush everything".
        $realWhere = strlen((string)$where) > 0 ? $where : '1=1';

        if (EventDispatcher::getInstance()->hasObserver('queueEntryFlush')) {
            // QueryBuilder::select() quotes each argument as ONE identifier, so "DISTINCT set_id" cannot be
            // handed to it; grouping by set_id yields the same distinct list of set ids.
            $groups = $this->getQueryBuilder($this->tableName)
                ->select('set_id')
                ->from($this->tableName)
                ->where($realWhere)
                ->groupBy('set_id')
                ->execute()
                ->fetchAll();

            foreach ($groups as $group) {
                // A fresh query builder per statement keeps the parts of the previous query out of this one.
                $subSetQueryBuilder = $this->getQueryBuilder($this->tableName);
                // NOTE(review): the other methods address the rows of this table by "qid" - confirm that
                // the "uid" column selected here exists.
                $subSet = $subSetQueryBuilder
                    ->select('uid', 'set_id')
                    ->from($this->tableName)
                    ->where(
                        $realWhere,
                        $subSetQueryBuilder->expr()->eq(
                            'set_id',
                            $subSetQueryBuilder->createNamedParameter((int)$group['set_id'], \PDO::PARAM_INT)
                        )
                    )
                    ->execute()
                    ->fetchAll();
                EventDispatcher::getInstance()->post('queueEntryFlush', $group['set_id'], $subSet);
            }
        }

        $this->getQueryBuilder($this->tableName)
            ->delete($this->tableName)
            ->where($realWhere)
            ->execute();
    }
1078
1079
    /**
     * Adds a call back entry to the queue (typically called from hooks, see indexed search class "class.crawler.php")
     *
     * @param integer $setId Set ID
     * @param array $params Parameters to pass to call back function; anything but an array is replaced by an empty array
     * @param string $callBack Call back object reference, eg. 'EXT:indexed_search/class.crawler.php:&tx_indexedsearch_crawler'
     * @param integer $page_id Page ID to attach it to
     * @param integer $schedule Time at which to activate; 0 schedules the entry for the current time
     * @return void
     */
    public function addQueueEntry_callBack($setId, $params, $callBack, $page_id = 0, $schedule = 0): void
    {
        $callBackParameters = is_array($params) ? $params : [];
        // The call back reference travels inside the serialized parameters of the queue entry.
        $callBackParameters['_CALLBACKOBJ'] = $callBack;

        $queueEntry = [
            'page_id' => (int)$page_id,
            'parameters' => serialize($callBackParameters),
            'scheduled' => (int)$schedule ?: $this->getCurrentTime(),
            'exec_time' => 0,
            'set_id' => (int)$setId,
            'result_data' => '',
        ];

        $connection = GeneralUtility::makeInstance(ConnectionPool::class)->getConnectionForTable('tx_crawler_queue');
        $connection->insert('tx_crawler_queue', $queueEntry);
    }
1109
1110
    /************************************
1111
     *
1112
     * URL setting
1113
     *
1114
     ************************************/
1115
1116
    /**
     * Setting a URL for crawling:
     *
     * Depending on $this->registerQueueEntriesInternallyOnly the entry is either only collected in
     * $this->queueEntries or written to the queue table, the latter unless an equal entry already exists.
     *
     * @param integer $id Page ID
     * @param string $url Complete URL
     * @param array $subCfg Sub configuration array (from TS config); the keys "userGroups", "procInstrFilter",
     *                      "procInstrParams." and "key" are read and are all optional
     * @param integer $tstamp Scheduled-time
     * @param string $configurationHash (optional) configuration hash
     * @param bool $skipInnerDuplicationCheck (optional) skip inner duplication check
     * @return bool TRUE only if a new entry has been written to the queue table
     */
    public function addUrl(
        $id,
        $url,
        array $subCfg,
        $tstamp,
        $configurationHash = '',
        $skipInnerDuplicationCheck = false
    ) {
        $urlAdded = false;
        $rows = [];

        // Creating parameters:
        $parameters = [
            'url' => $url,
        ];

        // fe user group simulation:
        $uGs = implode(',', array_unique(GeneralUtility::intExplode(',', $subCfg['userGroups'] ?? '', true)));
        if ($uGs) {
            $parameters['feUserGroupList'] = $uGs;
        }

        // Setting processing instructions
        $parameters['procInstructions'] = GeneralUtility::trimExplode(',', $subCfg['procInstrFilter'] ?? '');
        if (is_array($subCfg['procInstrParams.'] ?? null)) {
            $parameters['procInstrParams'] = $subCfg['procInstrParams.'];
        }

        // Compile value array:
        $parameters_serialized = serialize($parameters);
        $fieldArray = [
            'page_id' => (int)$id,
            'parameters' => $parameters_serialized,
            'parameters_hash' => GeneralUtility::shortMD5($parameters_serialized),
            'configuration_hash' => $configurationHash,
            'scheduled' => $tstamp,
            'exec_time' => 0,
            'set_id' => (int)$this->setID,
            'result_data' => '',
            'configuration' => $subCfg['key'] ?? null,
        ];

        if ($this->registerQueueEntriesInternallyOnly) {
            //the entries will only be registered and not stored to the database
            $this->queueEntries[] = $fieldArray;

            return $urlAdded;
        }

        if (!$skipInnerDuplicationCheck) {
            // check if there is already an equal entry
            $rows = $this->getDuplicateRowsIfExist($tstamp, $fieldArray);
        }

        if (empty($rows)) {
            $connectionForCrawlerQueue = GeneralUtility::makeInstance(ConnectionPool::class)->getConnectionForTable('tx_crawler_queue');
            $connectionForCrawlerQueue->insert(
                'tx_crawler_queue',
                $fieldArray
            );
            $uid = $connectionForCrawlerQueue->lastInsertId('tx_crawler_queue', 'qid');
            $rows[] = $uid;
            $urlAdded = true;
            EventDispatcher::getInstance()->post('urlAddedToQueue', strval($this->setID), ['uid' => $uid, 'fieldArray' => $fieldArray]);
        } else {
            EventDispatcher::getInstance()->post('duplicateUrlInQueue', strval($this->setID), ['rows' => $rows, 'fieldArray' => $fieldArray]);
        }

        return $urlAdded;
    }
1195
1196
    /**
     * Looks up not yet processed queue entries that duplicate the given entry, i.e. entries of the same page
     * with the same parameters hash.
     * For a timestamp that is not in the future, every such entry scheduled up to now counts as a duplicate
     * (with the "enableTimeslot" extension setting also entries scheduled within 100 seconds around now).
     * For a timestamp in the future, the entry has to be scheduled for exactly the same timestamp.
     *
     * @param int $tstamp
     * @param array $fieldArray Queue entry to check; "page_id" and "parameters_hash" are read
     *
     * @return array The qid values of the duplicates, empty if there are none
     */
    protected function getDuplicateRowsIfExist($tstamp, $fieldArray)
    {
        $now = $this->getCurrentTime();

        $queryBuilder = GeneralUtility::makeInstance(ConnectionPool::class)->getQueryBuilderForTable($this->tableName);
        $queryBuilder
            ->select('qid')
            ->from('tx_crawler_queue');

        if ($tstamp <= $now) {
            // The entry is scheduled with "now"
            if ($this->extensionSettings['enableTimeslot']) {
                $timeslotBegin = $now - 100;
                $timeslotEnd = $now + 100;
                $queryBuilder->where('scheduled BETWEEN ' . $timeslotBegin . ' AND ' . $timeslotEnd);
                $queryBuilder->orWhere($queryBuilder->expr()->lte('scheduled', $now));
            } else {
                $queryBuilder->where($queryBuilder->expr()->lte('scheduled', $now));
            }
        } elseif ($tstamp > $now) {
            // An entry with a timestamp in the future needs to have the same schedule time
            $queryBuilder->where($queryBuilder->expr()->eq('scheduled', $tstamp));
        }

        // Only entries that have neither been executed nor been assigned to a process are considered.
        $queryBuilder->andWhere('NOT exec_time');
        $queryBuilder->andWhere('NOT process_id');
        $queryBuilder->andWhere(
            $queryBuilder->expr()->eq('page_id', $queryBuilder->createNamedParameter($fieldArray['page_id'], \PDO::PARAM_INT))
        );
        $queryBuilder->andWhere(
            $queryBuilder->expr()->eq('parameters_hash', $queryBuilder->createNamedParameter($fieldArray['parameters_hash'], \PDO::PARAM_STR))
        );

        return array_column($queryBuilder->execute()->fetchAll(), 'qid');
    }
1256
1257
    /**
     * Returns the current system time as Unix timestamp
     *
     * @return int
     */
    public function getCurrentTime()
    {
        $currentTimestamp = time();

        return $currentTimestamp;
    }
1266
1267
    /************************************
1268
     *
1269
     * URL reading
1270
     *
1271
     ************************************/
1272
1273
    /**
     * Read URL for single queue entry
     *
     * The entry is locked by setting its exec_time, handed to the queue executor and finally updated with
     * the serialized result. Signals are emitted before the processing and before the result is stored.
     *
     * @param integer $queueId
     * @param boolean $force If set, will process even if exec_time has been set!
     * @return integer 0, or self::CLI_STATUS_POLLABLE_PROCESSED if a registered pollable instruction reported success.
     *                 0 is also returned when no matching queue entry was found.
     */
    public function readUrl($queueId, $force = false)
    {
        $queryBuilder = GeneralUtility::makeInstance(ConnectionPool::class)->getQueryBuilderForTable($this->tableName);
        $ret = 0;
        $this->logger->debug('crawler-readurl start ' . microtime(true));
        // Get entry:
        $queryBuilder
            ->select('*')
            ->from('tx_crawler_queue')
            ->where(
                $queryBuilder->expr()->eq('qid', $queryBuilder->createNamedParameter($queueId, \PDO::PARAM_INT))
            );
        if (!$force) {
            $queryBuilder
                ->andWhere('exec_time = 0')
                ->andWhere('process_scheduled > 0');
        }
        $queueRec = $queryBuilder->execute()->fetch();

        if (!is_array($queueRec)) {
            // Nothing to process: return the documented integer instead of null.
            return $ret;
        }

        SignalSlotUtility::emitSignal(
            __CLASS__,
            SignalSlotUtility::SIGNNAL_QUEUEITEM_PREPROCESS,
            [$queueId, &$queueRec]
        );

        // Set exec_time to lock record:
        $field_array = ['exec_time' => $this->getCurrentTime()];

        if (isset($this->processID)) {
            //if multiprocessing is used we need to store the id of the process which has handled this entry
            $field_array['process_id_completed'] = $this->processID;
        }

        GeneralUtility::makeInstance(ConnectionPool::class)->getConnectionForTable('tx_crawler_queue')
            ->update(
                'tx_crawler_queue',
                $field_array,
                ['qid' => (int)$queueId]
            );

        $result = $this->queueExecutor->executeQueueItem($queueRec, $this);

        // The executor result is only unserialized if it really carries a string content; with strict types
        // anything else handed to unserialize() would raise a TypeError.
        // NOTE(review): the content is produced while processing the crawled URL - confirm that it is
        // trustworthy enough for unserialize(), or restrict the allowed classes.
        $resultData = is_array($result) && is_string($result['content'] ?? null)
            ? unserialize($result['content'])
            : false;

        //atm there's no need to point to specific pollable extensions
        $pollables = $GLOBALS['TYPO3_CONF_VARS']['EXTCONF']['crawler']['pollSuccess'] ?? null;
        $procInstructions = is_array($resultData) ? ($resultData['parameters']['procInstructions'] ?? null) : null;
        if (is_array($pollables) && is_array($procInstructions)) {
            foreach ($pollables as $pollable) {
                // only check the success value if the instruction is running
                // it is important to name the pollSuccess key same as the procInstructions key
                if (in_array($pollable, $procInstructions) && !empty($resultData['success'][$pollable])) {
                    $ret |= self::CLI_STATUS_POLLABLE_PROCESSED;
                }
            }
        }

        // Set result in log which also denotes the end of the processing of this entry.
        $field_array = ['result_data' => serialize($result)];

        SignalSlotUtility::emitSignal(
            __CLASS__,
            SignalSlotUtility::SIGNNAL_QUEUEITEM_POSTPROCESS,
            [$queueId, &$field_array]
        );

        GeneralUtility::makeInstance(ConnectionPool::class)->getConnectionForTable('tx_crawler_queue')
            ->update(
                'tx_crawler_queue',
                $field_array,
                ['qid' => (int)$queueId]
            );

        $this->logger->debug('crawler-readurl stop ' . microtime(true));
        return $ret;
    }
1364
1365
    /**
     * Read URL for not-yet-inserted log-entry
     *
     * The entry is inserted into the queue table (already locked via exec_time), handed to the queue
     * executor and finally updated with the serialized result.
     *
     * @param array $field_array Queue field array,
     *
     * @return array|bool|string The result of the queue executor, returned unchanged - TODO confirm the exact type
     */
    public function readUrlFromArray($field_array)
    {
        // Set exec_time to lock record:
        $field_array['exec_time'] = $this->getCurrentTime();

        $queueConnection = GeneralUtility::makeInstance(ConnectionPool::class)->getConnectionForTable($this->tableName);
        $queueConnection->insert($this->tableName, $field_array);

        // The executor receives the entry including the id it just got assigned.
        $queueId = $queueConnection->lastInsertId($this->tableName, 'qid');
        $field_array['qid'] = $queueId;

        $result = $this->queueExecutor->executeQueueItem($field_array, $this);

        // Set result in log which also denotes the end of the processing of this entry.
        $resultFields = ['result_data' => serialize($result)];

        SignalSlotUtility::emitSignal(
            __CLASS__,
            SignalSlotUtility::SIGNNAL_QUEUEITEM_POSTPROCESS,
            [$queueId, &$resultFields]
        );

        $queueConnection->update($this->tableName, $resultFields, ['qid' => $queueId]);

        return $result;
    }
1402
1403
    /*****************************
1404
     *
1405
     * Compiling URLs to crawl - tools
1406
     *
1407
     *****************************/
1408
1409
    /**
     * Collects the page tree below a root page and renders the crawler URL rows for every page in it.
     *
     * The passed settings are stored on the instance first, because drawURLs_addRowsForPage()
     * reads them from there. For mount point pages the rows of the mounted subtree are added as well.
     *
     * @param int $id Root page id to start from.
     * @param int $depth Depth of tree, 0=only id-page, 1= on sublevel, 99 = infinite
     * @param int $scheduledTime Unix Time when the URL is timed to be visited when put in queue
     * @param int $reqMinute Number of requests per minute (creates the interleave between requests)
     * @param bool $submitCrawlUrls If set, submits the URLs to queue in database (real crawling)
     * @param bool $downloadCrawlUrls If set (and submitcrawlUrls is false) will fill $downloadUrls with entries)
     * @param array $incomingProcInstructions Array of processing instructions
     * @param array $configurationSelection Array of configuration keys
     * @return string Concatenated output of drawURLs_addRowsForPage() for all visited pages
     */
    public function getPageTreeAndUrls(
        $id,
        $depth,
        $scheduledTime,
        $reqMinute,
        $submitCrawlUrls,
        $downloadCrawlUrls,
        array $incomingProcInstructions,
        array $configurationSelection
    ) {
        $this->scheduledTime = $scheduledTime;
        $this->reqMinute = $reqMinute;
        $this->submitCrawlUrls = $submitCrawlUrls;
        $this->downloadCrawlUrls = $downloadCrawlUrls;
        $this->incomingProcInstructions = $incomingProcInstructions;
        $this->incomingConfigurationSelection = $configurationSelection;

        $this->duplicateTrack = [];
        $this->downloadUrls = [];

        // Drawing tree:
        /* @var PageTreeView $tree */
        $tree = GeneralUtility::makeInstance(PageTreeView::class);
        $perms_clause = $this->getBackendUser()->getPagePermsClause(Permission::PAGE_SHOW);
        $tree->init('AND ' . $perms_clause);

        $pageInfo = BackendUtility::readPageAccess($id, $perms_clause);
        if (is_array($pageInfo)) {
            // Set root row:
            $tree->tree[] = [
                'row' => $pageInfo,
                'HTML' => $this->iconFactory->getIconForRecord('pages', $pageInfo, Icon::SIZE_SMALL),
            ];
        }

        // Get branch beneath:
        if ($depth) {
            $tree->getTree($id, $depth, '');
        }

        // Traverse page tree:
        $code = '';

        foreach ($tree->tree as $data) {
            $this->MP = false;

            // recognize mount points
            if ($data['row']['doktype'] == PageRepository::DOKTYPE_MOUNTPOINT) {
                // The record is read from "pages", so the query builder has to be requested for that table
                $queryBuilder = GeneralUtility::makeInstance(ConnectionPool::class)->getQueryBuilderForTable('pages');
                $queryBuilder->getRestrictions()->removeAll()->add(GeneralUtility::makeInstance(DeletedRestriction::class));
                $mountPage = $queryBuilder
                    ->select('*')
                    ->from('pages')
                    ->where(
                        $queryBuilder->expr()->eq('uid', $queryBuilder->createNamedParameter($data['row']['uid'], \PDO::PARAM_INT))
                    )
                    ->setMaxResults(1)
                    ->execute()
                    ->fetch();

                // Without a record there is nothing to mount; the page is then handled like a regular page
                if (is_array($mountPage)) {
                    // fetch mounted pages
                    $this->MP = $mountPage['mount_pid'] . '-' . $data['row']['uid'];

                    $mountTree = GeneralUtility::makeInstance(PageTreeView::class);
                    $mountTree->init('AND ' . $perms_clause);
                    $mountTree->getTree($mountPage['mount_pid'], $depth);

                    foreach ($mountTree->tree as $mountData) {
                        $code .= $this->drawURLs_addRowsForPage(
                            $mountData['row'],
                            $mountData['HTML'] . BackendUtility::getRecordTitle('pages', $mountData['row'], true)
                        );
                    }

                    // replace page when mount_pid_ol is enabled
                    if ($mountPage['mount_pid_ol']) {
                        $data['row']['uid'] = $mountPage['mount_pid'];
                    } else {
                        // if the mount_pid_ol is not set the MP must not be used for the mountpoint page
                        $this->MP = false;
                    }
                }
            }

            $code .= $this->drawURLs_addRowsForPage(
                $data['row'],
                $data['HTML'] . BackendUtility::getRecordTitle('pages', $data['row'], true)
            );
        }

        return $code;
    }
1511
1512
    /**
     * Expands an exclude string into the list of page ids it covers.
     *
     * The string is a comma separated list of entries in the form "pid", "pid+" or "pid+depth".
     * "pid" covers only that page, "pid+" covers the page and its subtree down to depth 99,
     * "pid+depth" covers the page and its subtree down to the given depth.
     * Results and the fetched subtrees are kept in static caches for the lifetime of the request.
     *
     * @param string $excludeString Exclude string
     * @return array List of unique page ids
     */
    public function expandExcludeString($excludeString)
    {
        // internal static caches;
        static $expandedExcludeStringCache;
        static $treeCache;

        // isset() instead of empty(): an empty result is a valid cache entry, too
        if (!isset($expandedExcludeStringCache[$excludeString])) {
            $pidList = [];

            if (!empty($excludeString)) {
                /** @var PageTreeView $tree */
                $tree = GeneralUtility::makeInstance(PageTreeView::class);
                $tree->init('AND ' . $this->getBackendUser()->getPagePermsClause(Permission::PAGE_SHOW));

                $excludeParts = GeneralUtility::trimExplode(',', $excludeString);

                foreach ($excludeParts as $excludePart) {
                    // Pad to two elements so entries without "+" do not trigger an undefined offset
                    [$pid, $depth] = array_pad(GeneralUtility::trimExplode('+', $excludePart), 2, '');

                    // default is "page only" = "depth=0"
                    if (empty($depth)) {
                        $depth = strpos($excludePart, '+') !== false ? 99 : 0;
                    }

                    $pidList[] = $pid;

                    if ($depth > 0) {
                        if (empty($treeCache[$pid][$depth])) {
                            $tree->reset();
                            $tree->getTree($pid, $depth);
                            $treeCache[$pid][$depth] = $tree->tree;
                        }

                        foreach ($treeCache[$pid][$depth] as $data) {
                            $pidList[] = $data['row']['uid'];
                        }
                    }
                }
            }

            $expandedExcludeStringCache[$excludeString] = array_unique($pidList);
        }

        return $expandedExcludeStringCache[$excludeString];
    }
1563
1564
    /**
     * Create the rows for display of the page tree
     * For each page a number of rows are shown displaying GET variable configuration
     *
     * Only configurations contained in $this->incomingConfigurationSelection are rendered
     * (all of them if that selection is empty). Pages listed in a configuration's "exclude"
     * setting get a "No entries" row for that configuration.
     *
     * @param array $pageRow Page record, its "uid" is used for the exclude check
     * @param string $pageTitle Content of the title cell; inserted as is, so it has to be valid HTML already
     * @return string HTML table rows
     */
    public function drawURLs_addRowsForPage(array $pageRow, string $pageTitle): string
    {
        $skipMessage = '';

        // Get list of configurations
        // NOTE(review): $skipMessage is read further down, so it is presumably filled by reference here - confirm
        $configurations = $this->getUrlsForPageRow($pageRow, $skipMessage);

        if (!empty($this->incomingConfigurationSelection)) {
            // remove configuration that does not match the current selection
            foreach ($configurations as $confKey => $confArray) {
                if (!in_array($confKey, $this->incomingConfigurationSelection)) {
                    unset($configurations[$confKey]);
                }
            }
        }

        // Traverse parameter combinations:
        $c = 0;
        $content = '';
        if (!empty($configurations)) {
            $rowSpan = count($configurations);

            foreach ($configurations as $confKey => $confArray) {
                // Optional settings may be missing from the sub configuration
                $subCfg = $confArray['subCfg'] ?? [];
                // Numeric configuration names arrive as integer array keys; htmlspecialchars() needs a string
                $confKeyCell = htmlspecialchars((string)$confKey);

                // Title column:
                if (!$c) {
                    $titleClm = '<td rowspan="' . $rowSpan . '">' . $pageTitle . '</td>';
                } else {
                    $titleClm = '';
                }

                if (!in_array($pageRow['uid'], $this->expandExcludeString($subCfg['exclude'] ?? ''))) {

                    // URL list:
                    $urlList = $this->urlListFromUrlArray(
                        $confArray,
                        $pageRow,
                        $this->scheduledTime,
                        $this->reqMinute,
                        $this->submitCrawlUrls,
                        $this->downloadCrawlUrls,
                        $this->duplicateTrack,
                        $this->downloadUrls,
                        $this->incomingProcInstructions // if empty the urls won't be filtered by processing instructions
                    );

                    // Expanded parameters:
                    $paramExpanded = '';
                    $calcAccu = [];
                    $calcRes = 1;
                    foreach ($confArray['paramExpanded'] as $gVar => $gVal) {
                        $valueCount = count($gVal);
                        $paramExpanded .= '
                            <tr>
                                <td>' . htmlspecialchars('&' . $gVar . '=') . '<br/>' .
                            '(' . $valueCount . ')' .
                            '</td>
                                <td nowrap="nowrap">' . nl2br(htmlspecialchars(implode(chr(10), $gVal))) . '</td>
                            </tr>
                        ';
                        $calcRes *= $valueCount;
                        $calcAccu[] = $valueCount;
                    }
                    $paramExpanded = '<table>' . $paramExpanded . '</table>';
                    $paramExpanded .= 'Comb: ' . implode('*', $calcAccu) . '=' . $calcRes;

                    // Options
                    $optionValues = '';
                    if (!empty($subCfg['userGroups'])) {
                        $optionValues .= 'User Groups: ' . $subCfg['userGroups'] . '<br/>';
                    }
                    if (!empty($subCfg['procInstrFilter'])) {
                        $optionValues .= 'ProcInstr: ' . $subCfg['procInstrFilter'] . '<br/>';
                    }

                    // Compile row:
                    $content .= '
                        <tr>
                            ' . $titleClm . '
                            <td>' . $confKeyCell . '</td>
                            <td>' . nl2br(htmlspecialchars(rawurldecode(trim(str_replace('&', chr(10) . '&', GeneralUtility::implodeArrayForUrl('', $confArray['paramParsed'])))))) . '</td>
                            <td>' . $paramExpanded . '</td>
                            <td nowrap="nowrap">' . $urlList . '</td>
                            <td nowrap="nowrap">' . $optionValues . '</td>
                            <td nowrap="nowrap">' . DebugUtility::viewArray($subCfg['procInstrParams.'] ?? null) . '</td>
                        </tr>';
                } else {
                    $content .= '<tr>
                            ' . $titleClm . '
                            <td>' . $confKeyCell . '</td>
                            <td colspan="5"><em>No entries</em> (Page is excluded in this configuration)</td>
                        </tr>';
                }

                $c++;
            }
        } else {
            $message = !empty($skipMessage) ? ' (' . $skipMessage . ')' : '';

            // Compile row:
            $content .= '
                <tr>
                    <td>' . $pageTitle . '</td>
                    <td colspan="6"><em>No entries</em>' . $message . '</td>
                </tr>';
        }

        return $content;
    }
1674
1675
    /*****************************
1676
     *
1677
     * CLI functions
1678
     *
1679
     *****************************/
1680
1681
    /**
     * Running the functionality of the CLI (crawling URLs from queue)
     *
     * Runs the CLI hooks, optionally purges old executed entries, reserves a batch of due queue
     * entries for the current process id and processes them one by one via readUrl().
     *
     * @param int $countInARun Maximum number of queue entries selected for this run
     * @param int $sleepTime Pause after each processed entry, passed to usleep()
     * @param int $sleepAfterFinish Pause after the whole batch, passed to sleep()
     * @return int Result flags collected from readUrl(), combined with the CLI_STATUS_* flags set here
     */
    public function CLI_run(int $countInARun, int $sleepTime, int $sleepAfterFinish): int
    {
        $result = 0;
        $counter = 0;

        // First, run hooks:
        $this->CLI_runHooks();

        // Clean up the queue
        $purgeQueueDays = (int)$this->extensionSettings['purgeQueueDays'];
        if ($purgeQueueDays > 0) {
            $purgeDate = $this->getCurrentTime() - 24 * 60 * 60 * $purgeQueueDays;

            $queryBuilderDelete = GeneralUtility::makeInstance(ConnectionPool::class)
                ->getQueryBuilderForTable($this->tableName);
            $del = $queryBuilderDelete
                ->delete($this->tableName)
                ->where('exec_time != 0 AND exec_time < ' . $purgeDate)
                ->execute();

            if ($del === false) {
                $this->logger->info('Records could not be deleted.');
            }
        }

        // Select entries:
        //TODO Shouldn't this reside within the transaction?
        $queryBuilderSelect = GeneralUtility::makeInstance(ConnectionPool::class)
            ->getQueryBuilderForTable($this->tableName);
        $selectExpr = $queryBuilderSelect->expr();
        $rows = $queryBuilderSelect
            ->select('qid', 'scheduled')
            ->from($this->tableName)
            ->where(
                $selectExpr->eq('exec_time', 0),
                $selectExpr->eq('process_scheduled', 0),
                $selectExpr->lte('scheduled', $this->getCurrentTime())
            )
            ->orderBy('scheduled')
            ->addOrderBy('qid')
            ->setMaxResults($countInARun)
            ->execute()
            ->fetchAll();

        if (empty($rows)) {
            $this->CLI_debug("Nothing within queue which needs to be processed (" . $this->CLI_buildProcessId() . ")");
        } else {
            $quidList = array_column($rows, 'qid');
            $processId = $this->CLI_buildProcessId();

            // reserve queue entries for process
            //TODO make sure we're not taking assigned queue-entires (a transaction used to wrap this part)

            // save the number of assigned queue entries to determine how many have been processed later
            $queryBuilderUpdate = GeneralUtility::makeInstance(ConnectionPool::class)
                ->getQueryBuilderForTable($this->tableName);
            $numberOfAffectedRows = $queryBuilderUpdate
                ->update($this->tableName)
                ->where(
                    $queryBuilderUpdate->expr()->in('qid', $quidList)
                )
                ->set('process_scheduled', $this->getCurrentTime())
                ->set('process_id', $processId)
                ->execute();

            GeneralUtility::makeInstance(ConnectionPool::class)
                ->getConnectionForTable('tx_crawler_process')
                ->update(
                    'tx_crawler_process',
                    ['assigned_items_count' => (int)$numberOfAffectedRows],
                    ['process_id' => $processId]
                );

            // Fewer reserved rows than selected ones means that another process interfered
            if ($numberOfAffectedRows != count($quidList)) {
                $this->CLI_debug("Nothing processed due to multi-process collision (" . $this->CLI_buildProcessId() . ")");
                return $result | self::CLI_STATUS_ABORTED;
            }

            foreach ($rows as $queueRow) {
                $result |= $this->readUrl($queueRow['qid']);

                $counter++;
                // Just to relax the system
                usleep($sleepTime);

                // if during the start and the current read url the cli has been disable we need to return from the function
                // mark the process NOT as ended.
                if ($this->getDisabled()) {
                    return $result | self::CLI_STATUS_ABORTED;
                }

                if (!$this->processRepository->isProcessActive($this->CLI_buildProcessId())) {
                    $this->CLI_debug("conflict / timeout (" . $this->CLI_buildProcessId() . ")");

                    //TODO might need an additional returncode
                    $result |= self::CLI_STATUS_ABORTED;
                    // possible timeout
                    break;
                }
            }

            sleep($sleepAfterFinish);

            $this->CLI_debug('Rows: ' . $counter . " (" . $this->CLI_buildProcessId() . ")");
        }

        if ($counter > 0) {
            $result |= self::CLI_STATUS_PROCESSED;
        }

        return $result;
    }
1802
1803
    /**
     * Activate hooks
     *
     * Instantiates every class registered in
     * $GLOBALS['TYPO3_CONF_VARS']['EXTCONF']['crawler']['cli_hooks'] and calls its
     * crawler_init() method with this controller.
     */
    public function CLI_runHooks(): void
    {
        $registeredHooks = $GLOBALS['TYPO3_CONF_VARS']['EXTCONF']['crawler']['cli_hooks'] ?? [];

        foreach ($registeredHooks as $hookReference) {
            $hook = GeneralUtility::makeInstance($hookReference);
            if (!is_object($hook)) {
                continue;
            }
            $hook->crawler_init($this);
        }
    }
1817
1818
    /**
     * Try to acquire a new process with the given id
     * also performs some auto-cleanup for orphan processes
     *
     * Active processes whose ttl has expired count as orphans: they do not count against the
     * process limit and are handed to CLI_releaseProcesses() at the end.
     *
     * @param string $id identification string for the process
     * @return bool TRUE if the process was registered, FALSE if the process limit is reached
     *              or no system process id could be determined
     * @todo preemption might not be the most elegant way to clean up
     */
    public function CLI_checkAndAcquireNewProcess($id)
    {
        $systemProcessId = getmypid();
        // getmypid() returns FALSE on error, which is covered by this comparison as well
        if ($systemProcessId < 1) {
            return false;
        }

        $ret = true;
        $processCount = 0;
        $orphanProcesses = [];
        $processLimit = (int)$this->extensionSettings['processLimit'];

        // The statement reads the process table, so the query builder is requested for that table
        $queryBuilder = GeneralUtility::makeInstance(ConnectionPool::class)
            ->getQueryBuilderForTable('tx_crawler_process');
        $statement = $queryBuilder
            ->select('process_id', 'ttl')
            ->from('tx_crawler_process')
            ->where(
                'active = 1 AND deleted = 0'
            )
            ->execute();

        $currentTime = $this->getCurrentTime();

        while ($row = $statement->fetch()) {
            if ($row['ttl'] < $currentTime) {
                $orphanProcesses[] = $row['process_id'];
            } else {
                $processCount++;
            }
        }

        // if there are less than allowed active processes then add a new one
        if ($processCount < $processLimit) {
            $this->CLI_debug("add process " . $this->CLI_buildProcessId() . " (" . ($processCount + 1) . "/" . $processLimit . ")");

            GeneralUtility::makeInstance(ConnectionPool::class)->getConnectionForTable('tx_crawler_process')->insert(
                'tx_crawler_process',
                [
                    'process_id' => $id,
                    'active' => 1,
                    'ttl' => $currentTime + (int)$this->extensionSettings['processMaxRunTime'],
                    'system_process_id' => $systemProcessId,
                ]
            );
        } else {
            $this->CLI_debug("Processlimit reached (" . ($processCount) . "/" . $processLimit . ")");
            $ret = false;
        }

        $this->processRepository->deleteProcessesMarkedAsDeleted();
        $this->CLI_releaseProcesses($orphanProcesses);

        return $ret;
    }
1881
1882
    /**
     * Release a process and the required resources
     *
     * Four UPDATE statements are executed on one shared query builder, in this order:
     * 1. queue entries assigned to inactive processes are detached (process_scheduled/process_id reset)
     * 2. inactive processes that still own unprocessed queue entries get system_process_id = 0
     * 3. the requested processes without unprocessed queue entries are set to active = 0
     * 4. unprocessed queue entries of the requested processes are detached
     *
     * @param mixed $releaseIds string with a single process-id or array with multiple process-ids
     * @return bool FALSE if there was nothing to release, TRUE otherwise
     */
    public function CLI_releaseProcesses($releaseIds)
    {
        $processIds = is_array($releaseIds) ? $releaseIds : [$releaseIds];

        if ($processIds === []) {
            // nothing to release
            return false;
        }

        // The builder is reused for all statements below, so each statement starts from the state
        // the previous one left behind unless a query part is reset explicitly.
        $builder = GeneralUtility::makeInstance(ConnectionPool::class)->getQueryBuilderForTable($this->tableName);

        // some kind of 2nd chance algo - this way you need at least 2 processes to have a real cleanup
        // this ensures that a single process can't mess up the entire process table

        // 1. detach queue entries from processes which are not active
        $builder->update($this->tableName, 'q');
        $builder->where(
            'q.process_id IN(SELECT p.process_id FROM tx_crawler_process as p WHERE p.active = 0)'
        );
        $builder->set('q.process_scheduled', 0);
        $builder->set('q.process_id', '');
        $builder->execute();

        // FIXME: Not entirely sure that this is equivalent to the previous version
        $builder->resetQueryPart('set');

        // 2. inactive processes which still own unprocessed queue entries
        $builder->update('tx_crawler_process');
        $builder->where(
            $builder->expr()->eq('active', 0),
            'process_id IN(SELECT q.process_id FROM tx_crawler_queue as q WHERE q.exec_time = 0)'
        );
        $builder->set('system_process_id', 0);
        $builder->execute();

        // 3. mark all requested processes as non-active
        // NOTE(review): the SET part is not reset after statement 2, so this statement appears to
        // write system_process_id = 0 as well - confirm whether that is intended
        $builder->update('tx_crawler_process');
        $builder->where(
                'NOT EXISTS (
                SELECT * FROM tx_crawler_queue
                    WHERE tx_crawler_queue.process_id = tx_crawler_process.process_id
                    AND tx_crawler_queue.exec_time = 0
                )',
            $builder->expr()->in('process_id', $builder->createNamedParameter($processIds, Connection::PARAM_STR_ARRAY)),
            $builder->expr()->eq('deleted', 0)
        );
        $builder->set('active', 0);
        $builder->execute();

        $builder->resetQueryPart('set');

        // 4. detach the unprocessed queue entries of the requested processes
        $builder->update($this->tableName);
        $builder->where(
            $builder->expr()->eq('exec_time', 0),
            $builder->expr()->in('process_id', $builder->createNamedParameter($processIds, Connection::PARAM_STR_ARRAY))
        );
        $builder->set('process_scheduled', 0);
        $builder->set('process_id', '');
        $builder->execute();

        return true;
    }
1953
1954
    /**
     * Create a unique Id for the current process
     *
     * The id is generated once from the current microtime and stored in $this->processID,
     * so later calls return the same value.
     *
     * @return string  the ID
     */
    public function CLI_buildProcessId()
    {
        if ($this->processID) {
            return $this->processID;
        }

        $this->processID = GeneralUtility::shortMD5(microtime(true));

        return $this->processID;
    }
1966
1967
    /**
     * Prints a message to the stdout (only if debug-mode is enabled)
     *
     * Debug mode is controlled by the extension setting "processDebug".
     *
     * @param string $msg the message
     */
    public function CLI_debug($msg): void
    {
        $debugEnabled = (int)$this->extensionSettings['processDebug'];
        if (!$debugEnabled) {
            return;
        }

        echo $msg . "\n";
        flush();
    }
1979
1980
    /**
     * Cleans up entries that stayed for too long in the queue. These are:
     * - processed entries (exec_time set) older than the extension setting "cleanUpProcessedAge"
     * - entries scheduled longer ago than the extension setting "cleanUpScheduledAge"
     *
     * Both settings are multiplied by 86400, i.e. they are presumably given in days.
     */
    public function cleanUpOldQueueEntries(): void
    {
        // 24*60*60 Seconds in 24 hours
        $secondsPerDay = 86400;
        $now = time();

        $processedBefore = $now - $this->extensionSettings['cleanUpProcessedAge'] * $secondsPerDay;
        $scheduledBefore = $now - $this->extensionSettings['cleanUpScheduledAge'] * $secondsPerDay;

        $this->flushQueue(
            '(exec_time<>0 AND exec_time<' . $processedBefore . ') OR scheduled<=' . $scheduledBefore
        );
    }
1996
1997
    /**
     * Returns a md5 hash generated from a serialized configuration array.
     *
     * The keys "paramExpanded" and "URLs" are removed before hashing, so they do not
     * influence the result.
     *
     * @param array $configuration
     *
     * @return string
     */
    protected function getConfigurationHash(array $configuration)
    {
        unset($configuration['paramExpanded'], $configuration['URLs']);

        $serializedConfiguration = serialize($configuration);

        return md5($serializedConfiguration);
    }
2010
2011
    /**
     * Build a URL from a Page and the Query String. If the page has a Site configuration, it can be built by using
     * the Site instance.
     *
     * With a Site, the "id" parameter is dropped, "L" is passed on as "_language" and host, scheme
     * and port are taken from $alternativeBaseUrl if one is given. Without a Site, an index.php URL
     * including a cHash is assembled from the base URL and the query string.
     *
     * @param int $pageId
     * @param string $queryString
     * @param string|null $alternativeBaseUrl
     * @param int $httpsOrHttp see tx_crawler_configuration.force_ssl; -1 forces http, 1 forces https,
     *                         any other value keeps the scheme
     * @return UriInterface
     * @throws \TYPO3\CMS\Core\Exception\SiteNotFoundException
     * @throws \TYPO3\CMS\Core\Routing\InvalidRouteArgumentsException
     */
    protected function getUrlFromPageAndQueryParameters(int $pageId, string $queryString, ?string $alternativeBaseUrl, int $httpsOrHttp): UriInterface
    {
        $site = GeneralUtility::makeInstance(SiteMatcher::class)->matchByPageId($pageId);
        if ($site instanceof Site) {
            $queryString = ltrim($queryString, '?&');
            $queryParts = [];
            parse_str($queryString, $queryParts);
            unset($queryParts['id']);
            // workaround as long as we don't have native language support in crawler configurations
            if (isset($queryParts['L'])) {
                $queryParts['_language'] = $queryParts['L'];
                unset($queryParts['L']);
            }
            $url = $site->getRouter()->generateUri($pageId, $queryParts);
            if (!empty($alternativeBaseUrl)) {
                $alternativeBaseUrl = new Uri($alternativeBaseUrl);
                $url = $url->withHost($alternativeBaseUrl->getHost());
                $url = $url->withScheme($alternativeBaseUrl->getScheme());
                $url = $url->withPort($alternativeBaseUrl->getPort());
            }
        } else {
            // Technically this is not possible with site handling, but kept for backwards-compatibility reasons
            // Once EXT:crawler is v10-only compatible, this should be removed completely
            $baseUrl = ($alternativeBaseUrl ?: GeneralUtility::getIndpEnv('TYPO3_SITE_URL'));
            $cacheHashCalculator = GeneralUtility::makeInstance(CacheHashCalculator::class);
            $queryString .= '&cHash=' . $cacheHashCalculator->generateForParameters($queryString);
            $url = rtrim($baseUrl, '/') . '/index.php' . $queryString;
            $url = new Uri($url);
        }

        if ($httpsOrHttp === -1) {
            $url = $url->withScheme('http');
        } elseif ($httpsOrHttp === 1) {
            $url = $url->withScheme('https');
        }

        return $url;
    }
2064
2065 1
    /**
     * Makes sure that the value at index 1 is not larger than the value at index 2.
     *
     * @param array $reg Array holding the two values to compare at index 1 and 2
     * @return array The array with both values exchanged if the first one was larger
     */
    protected function swapIfFirstIsLargerThanSecond(array $reg): array
    {
        if ($reg[1] <= $reg[2]) {
            return $reg;
        }

        // Swap if first is larger than last:
        [$reg[1], $reg[2]] = [$reg[2], $reg[1]];

        return $reg;
    }
2076
}
2077