Passed
Push — cleanup/crawlercontroller-cli-... ( 1c5632 )
by Tomas Norre
12:41 queued 02:22
created

CrawlerController::getUrlsForPageId()   C

Complexity

Conditions 16
Paths 96

Size

Total Lines 93
Code Lines 51

Duplication

Lines 0
Ratio 0 %

Code Coverage

Tests 46
CRAP Score 16.0585

Importance

Changes 3
Bugs 0 Features 0
Metric Value
cc 16
eloc 51
c 3
b 0
f 0
nc 96
nop 1
dl 0
loc 93
ccs 46
cts 49
cp 0.9388
crap 16.0585
rs 5.5666

How to fix   Long Method    Complexity   

Long Method

Small methods make your code easier to understand, in particular if combined with a good name. Besides, if your method is small, finding a good name is usually much easier.

For example, if you find yourself adding comments to a method's body, this is usually a good sign to extract the commented part to a new method, and use the comment as a starting point when coming up with a good name for this new method.

Commonly applied refactorings include:

1
<?php
2
3
declare(strict_types=1);
4
5
namespace AOE\Crawler\Controller;
6
7
/***************************************************************
8
 *  Copyright notice
9
 *
10
 *  (c) 2019 AOE GmbH <[email protected]>
11
 *
12
 *  All rights reserved
13
 *
14
 *  This script is part of the TYPO3 project. The TYPO3 project is
15
 *  free software; you can redistribute it and/or modify
16
 *  it under the terms of the GNU General Public License as published by
17
 *  the Free Software Foundation; either version 3 of the License, or
18
 *  (at your option) any later version.
19
 *
20
 *  The GNU General Public License can be found at
21
 *  http://www.gnu.org/copyleft/gpl.html.
22
 *
23
 *  This script is distributed in the hope that it will be useful,
24
 *  but WITHOUT ANY WARRANTY; without even the implied warranty of
25
 *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
26
 *  GNU General Public License for more details.
27
 *
28
 *  This copyright notice MUST APPEAR in all copies of the script!
29
 ***************************************************************/
30
31
use AOE\Crawler\Configuration\ExtensionConfigurationProvider;
32
use AOE\Crawler\Domain\Repository\ConfigurationRepository;
33
use AOE\Crawler\Domain\Repository\ProcessRepository;
34
use AOE\Crawler\Domain\Repository\QueueRepository;
35
use AOE\Crawler\Event\EventDispatcher;
36
use AOE\Crawler\QueueExecutor;
37
use AOE\Crawler\Utility\SignalSlotUtility;
38
use Psr\Http\Message\UriInterface;
39
use Psr\Log\LoggerAwareInterface;
40
use Psr\Log\LoggerAwareTrait;
41
use TYPO3\CMS\Backend\Tree\View\PageTreeView;
42
use TYPO3\CMS\Backend\Utility\BackendUtility;
43
use TYPO3\CMS\Core\Authentication\BackendUserAuthentication;
44
use TYPO3\CMS\Core\Core\Bootstrap;
45
use TYPO3\CMS\Core\Core\Environment;
46
use TYPO3\CMS\Core\Database\Connection;
47
use TYPO3\CMS\Core\Database\ConnectionPool;
48
use TYPO3\CMS\Core\Database\Query\Restriction\DeletedRestriction;
49
use TYPO3\CMS\Core\Http\Uri;
50
use TYPO3\CMS\Core\Imaging\Icon;
51
use TYPO3\CMS\Core\Imaging\IconFactory;
52
use TYPO3\CMS\Core\Routing\SiteMatcher;
53
use TYPO3\CMS\Core\Site\Entity\Site;
54
use TYPO3\CMS\Core\Type\Bitmask\Permission;
55
use TYPO3\CMS\Core\TypoScript\Parser\TypoScriptParser;
56
use TYPO3\CMS\Core\Utility\DebugUtility;
57
use TYPO3\CMS\Core\Utility\GeneralUtility;
58
use TYPO3\CMS\Core\Utility\MathUtility;
59
use TYPO3\CMS\Extbase\Object\ObjectManager;
60
use TYPO3\CMS\Frontend\Page\CacheHashCalculator;
61
use TYPO3\CMS\Frontend\Page\PageRepository;
62
63
/**
64
 * Class CrawlerController
65
 *
66
 * @package AOE\Crawler\Controller
67
 */
68
class CrawlerController implements LoggerAwareInterface
69
{
70
    use LoggerAwareTrait;
71
72
    public const CLI_STATUS_NOTHING_PROCCESSED = 0;
73
    public const CLI_STATUS_REMAIN = 1; //queue not empty
74
    public const CLI_STATUS_PROCESSED = 2; //(some) queue items where processed
75
    public const CLI_STATUS_ABORTED = 4; //instance didn't finish
76
    public const CLI_STATUS_POLLABLE_PROCESSED = 8;
77
78
    /**
79
     * @var integer
80
     */
81
    public $setID = 0;
82
83
    /**
84
     * @var string
85
     */
86
    public $processID = '';
87
88
    /**
89
     * @var array
90
     */
91
    public $duplicateTrack = [];
92
93
    /**
94
     * @var array
95
     */
96
    public $downloadUrls = [];
97
98
    /**
99
     * @var array
100
     */
101
    public $incomingProcInstructions = [];
102
103
    /**
104
     * @var array
105
     */
106
    public $incomingConfigurationSelection = [];
107
108
    /**
109
     * @var bool
110
     */
111
    public $registerQueueEntriesInternallyOnly = false;
112
113
    /**
114
     * @var array
115
     */
116
    public $queueEntries = [];
117
118
    /**
119
     * @var array
120
     */
121
    public $urlList = [];
122
123
    /**
124
     * @var array
125
     */
126
    public $extensionSettings = [];
127
128
    /**
129
     * Mount Point
130
     *
131
     * @var bool
132
     * Todo: Check what this is used for and adjust the type hint or code, as bool doesn't match the current code.
133
     */
134
    public $MP = false;
135
136
    /**
137
     * @var string
138
     */
139
    protected $processFilename;
140
141
    /**
142
     * Holds the internal access mode can be 'gui','cli' or 'cli_im'
143
     *
144
     * @var string
145
     */
146
    protected $accessMode;
147
148
    /**
149
     * @var BackendUserAuthentication|null
150
     */
151
    private $backendUser;
152
153
    /**
154
     * @var integer
155
     */
156
    private $scheduledTime = 0;
157
158
    /**
159
     * @var integer
160
     */
161
    private $reqMinute = 0;
162
163
    /**
164
     * @var bool
165
     */
166
    private $submitCrawlUrls = false;
167
168
    /**
169
     * @var bool
170
     */
171
    private $downloadCrawlUrls = false;
172
173
    /**
174
     * @var QueueRepository
175
     */
176
    protected $queueRepository;
177
178
    /**
179
     * @var ProcessRepository
180
     */
181
    protected $processRepository;
182
183
    /**
184
     * @var ConfigurationRepository
185
     */
186
    protected $configurationRepository;
187
188
    /**
189
     * @var string
190
     */
191
    protected $tableName = 'tx_crawler_queue';
192
193
    /**
194
     * @var QueueExecutor
195
     */
196
    protected $queueExecutor;
197
198
    /**
199
     * @var int
200
     */
201
    protected $maximumUrlsToCompile = 10000;
202
203
    /**
204
     * @var IconFactory
205
     */
206
    protected $iconFactory;
207
208
    /**
     * Returns the internal access mode.
     *
     * The property is documented as holding 'gui', 'cli' or 'cli_im'; it has no
     * default, so nothing is guaranteed before setAccessMode() has been called.
     *
     * @return string
     */
    public function getAccessMode()
    {
        return $this->accessMode;
    }
217
218
    /**
     * Stores the internal access mode.
     *
     * @param string $accessMode Documented values are 'gui', 'cli' or 'cli_im'; the value is not validated here
     */
    public function setAccessMode($accessMode): void
    {
        $this->accessMode = $accessMode;
    }
225
226
    /**
     * Set disabled status to prevent processes from being processed.
     *
     * The status is represented by the process file: it is written (empty)
     * when disabling and removed, if present, when enabling.
     *
     * @param bool $disabled (optional, defaults to true)
     * @return void
     */
    public function setDisabled($disabled = true): void
    {
        if ($disabled) {
            GeneralUtility::writeFile($this->processFilename, '');

            return;
        }

        // Enabling: only remove the marker file when it actually exists
        if (is_file($this->processFilename)) {
            unlink($this->processFilename);
        }
    }
242
243
    /**
     * Get disable status.
     *
     * @return bool true if disabled, i.e. the process file exists
     */
    public function getDisabled()
    {
        $processFileExists = is_file($this->processFilename);

        return $processFileExists;
    }
252
253
    /**
     * Overrides the path of the process file used by setDisabled()/getDisabled().
     *
     * @param string $filenameWithPath File name including its path
     *
     * @return void
     */
    public function setProcessFilename($filenameWithPath): void
    {
        $this->processFilename = $filenameWithPath;
    }
262
263
    /**
     * Returns the path of the process file used by setDisabled()/getDisabled().
     *
     * @return string
     */
    public function getProcessFilename()
    {
        return $this->processFilename;
    }
270
271
    /************************************
272
     *
273
     * Getting URLs based on Page TSconfig
274
     *
275
     ************************************/
276
277 48
    /**
     * Instantiates the repositories and helpers, sets the default process file
     * path and loads the extension settings, normalising the numeric limits.
     */
    public function __construct()
    {
        $objectManager = GeneralUtility::makeInstance(ObjectManager::class);
        $this->queueRepository = $objectManager->get(QueueRepository::class);
        $this->processRepository = $objectManager->get(ProcessRepository::class);
        $this->configurationRepository = $objectManager->get(ConfigurationRepository::class);
        $this->queueExecutor = $objectManager->get(QueueExecutor::class);
        $this->iconFactory = GeneralUtility::makeInstance(IconFactory::class);

        $this->processFilename = Environment::getVarPath() . '/lock/tx_crawler.proc';

        /** @var ExtensionConfigurationProvider $configurationProvider */
        $configurationProvider = GeneralUtility::makeInstance(ExtensionConfigurationProvider::class);
        $settings = $configurationProvider->getExtensionConfiguration();
        $this->extensionSettings = is_array($settings) ? $settings : [];

        // Set defaults. The "?? 0" fallbacks keep a missing setting from raising an
        // undefined-index notice; 0 is treated exactly like the absent value was.
        if (MathUtility::convertToPositiveInteger($this->extensionSettings['countInARun'] ?? 0) === 0) {
            $this->extensionSettings['countInARun'] = 100;
        }

        $this->extensionSettings['processLimit'] = MathUtility::forceIntegerInRange(
            $this->extensionSettings['processLimit'] ?? 0,
            1,
            99,
            1
        );
        $this->maximumUrlsToCompile = MathUtility::forceIntegerInRange(
            $this->extensionSettings['maxCompileUrls'] ?? 0,
            1,
            1000000000,
            10000
        );
    }
301
302
    /**
     * Returns the backend user, reading it from $GLOBALS['BE_USER'] on first use.
     *
     * @return BackendUserAuthentication
     */
    private function getBackendUser()
    {
        // Make sure the _cli_ user is loaded
        Bootstrap::initializeBackendAuthentication();

        // Keep an already resolved user, otherwise take the global one
        $this->backendUser = $this->backendUser ?? $GLOBALS['BE_USER'];

        return $this->backendUser;
    }
314
315
    /**
     * Sets the extensions settings (unserialized pendant of $TYPO3_CONF_VARS['EXT']['extConf']['crawler']).
     *
     * The given array replaces the current settings as a whole; the defaults
     * applied by the constructor are not re-applied here.
     *
     * @param array $extensionSettings
     * @return void
     */
    public function setExtensionSettings(array $extensionSettings): void
    {
        $this->extensionSettings = $extensionSettings;
    }
325
326
    /**
     * Check if the given page should be crawled.
     *
     * The checks run in this order and the first match wins:
     * hidden page, built-in disallowed doktypes, doktypes excluded through
     * EXTCONF['crawler']['excludeDoktype'], then the EXTCONF['crawler']['pageVeto'] hooks.
     *
     * @param array $pageRow Page record; 'doktype' is required, 'hidden' is optional
     * @return false|string false if the page should be crawled (not excluded), otherwise the skip message
     */
    public function checkIfPageShouldBeSkipped(array $pageRow)
    {
        // Hidden pages are skipped unless crawling them is enabled in the extension settings
        if (empty($this->extensionSettings['crawlHiddenPages']) && !empty($pageRow['hidden'])) {
            return 'Because page is hidden';
        }

        if (GeneralUtility::inList('3,4', $pageRow['doktype']) || $pageRow['doktype'] >= 199) {
            return 'Because doktype is not allowed';
        }

        foreach ($GLOBALS['TYPO3_CONF_VARS']['EXTCONF']['crawler']['excludeDoktype'] ?? [] as $key => $doktypeList) {
            if (GeneralUtility::inList($doktypeList, $pageRow['doktype'])) {
                return 'Doktype was excluded by "' . $key . '"';
            }
        }

        // veto hook
        foreach ($GLOBALS['TYPO3_CONF_VARS']['EXTCONF']['crawler']['pageVeto'] ?? [] as $key => $func) {
            $params = [
                'pageRow' => $pageRow,
            ];
            // expects "false" if page is ok and "true" or a skipMessage if this page should _not_ be crawled
            $veto = GeneralUtility::callUserFunction($func, $params, $this);
            if ($veto === false) {
                continue;
            }

            // no need to execute other hooks if a previous one returned a veto
            if (is_string($veto)) {
                return $veto;
            }

            // Hooks registered with "[] =" have integer keys; under strict_types
            // htmlspecialchars() needs an explicit string.
            return 'Veto from hook "' . htmlspecialchars((string)$key) . '"';
        }

        return false;
    }
385
386
    /**
     * Wrapper method for getUrlsForPageId()
     * It returns an array of configurations and no urls!
     *
     * @param array $pageRow Page record with at least dok-type and uid columns.
     * @param string $skipMessage Set by reference: the reason the page was skipped, or '' when it was not
     * @return array Configurations for the page, or an empty array when the page is skipped
     * @see getUrlsForPageId()
     */
    public function getUrlsForPageRow(array $pageRow, &$skipMessage = '')
    {
        $message = $this->checkIfPageShouldBeSkipped($pageRow);

        if ($message !== false) {
            $skipMessage = $message;

            return [];
        }

        // getUrlsForPageId() is declared with an int parameter and this file uses
        // strict_types, so a string uid from a page row must be cast explicitly.
        $configurations = $this->getUrlsForPageId((int)$pageRow['uid']);
        $skipMessage = '';

        return $configurations;
    }
409
410
    /**
     * Counts the queue entries of a given page_id whose configuration hash
     * matches and whose exec_time is 0, i.e. the unprocessed ones.
     * If there are none, we can skip an inner detail check.
     *
     * @param int $uid Page id
     * @param string $configurationHash
     * @return bool true if no such queue entry exists
     */
    protected function noUnprocessedQueueEntriesForPageWithConfigurationHashExist($uid, $configurationHash)
    {
        $queryBuilder = GeneralUtility::makeInstance(ConnectionPool::class)->getQueryBuilderForTable($this->tableName);
        $expressions = $queryBuilder->expr();

        $pendingEntryCount = $queryBuilder
            ->count('*')
            ->from($this->tableName)
            ->where(
                $expressions->eq('page_id', (int)$uid),
                $expressions->eq('configuration_hash', $queryBuilder->createNamedParameter($configurationHash)),
                $expressions->eq('exec_time', 0)
            )
            ->execute()
            ->fetchColumn();

        // Any truthy count means at least one unprocessed entry exists
        return !$pendingEntryCount;
    }
441
442
    /**
     * Creates a list of URLs from input array (and submits them to queue if asked for)
     * See Web > Info module script + "indexed_search"'s crawler hook-client using this!
     *
     * @param array $vv Information about URLs from pageRow to crawl.
     * @param array $pageRow Page row
     * @param int $scheduledTime Unix time to schedule indexing to, typically time()
     * @param int $reqMinute Number of requests per minute (creates the interleave between requests); values <= 0 disable the interleave
     * @param bool $submitCrawlUrls If set, submits the URLs to queue
     * @param bool $downloadCrawlUrls If set (and submitcrawlUrls is false) will fill $downloadUrls with entries)
     * @param array $duplicateTrack Array which is passed by reference and contains an id per url to make sure we will not crawl duplicates
     * @param array $downloadUrls Array which will be filled with URLS for download if flag is set.
     * @param array $incomingProcInstructions Array of processing instructions
     * @return string List of URLs (meant for display in backend module)
     */
    public function urlListFromUrlArray(
        array $vv,
        array $pageRow,
        $scheduledTime,
        $reqMinute,
        $submitCrawlUrls,
        $downloadCrawlUrls,
        array &$duplicateTrack,
        array &$downloadUrls,
        array $incomingProcInstructions
    ) {
        if (!is_array($vv['URLs'] ?? null)) {
            return 'ERROR - no URL generated';
        }
        $urlLog = [];
        $pageId = (int)$pageRow['uid'];
        $configurationHash = $this->getConfigurationHash($vv);
        $skipInnerCheck = $this->noUnprocessedQueueEntriesForPageWithConfigurationHashExist($pageId, $configurationHash);

        // Not every sub configuration carries these keys; fall back to an empty string
        $procInstrFilter = $vv['subCfg']['procInstrFilter'] ?? '';
        $userGroups = $vv['subCfg']['userGroups'] ?? '';

        // The filter result is the same for every URL of this configuration, so check it once
        if (!$this->drawURLs_PIfilter($procInstrFilter, $incomingProcInstructions)) {
            return implode('<br>', $urlLog);
        }

        // Seconds between two requests; a rate of zero (or less) must not cause a division by zero
        $secondsPerRequest = $reqMinute > 0 ? 60 / $reqMinute : 0;

        foreach ($vv['URLs'] as $urlQuery) {
            $url = (string)$this->getUrlFromPageAndQueryParameters(
                $pageId,
                $urlQuery,
                $vv['subCfg']['baseUrl'] ?? null,
                $vv['subCfg']['force_ssl'] ?? 0
            );

            // Create key by which to determine unique-ness:
            $uKey = $url . '|' . $userGroups . '|' . $procInstrFilter;

            if (isset($duplicateTrack[$uKey])) {
                // if the url key is registered just display it and do not resubmit it
                $urlLog[] = '<em><span class="text-muted">' . htmlspecialchars($url) . '</span></em>';
            } else {
                // Scheduled time, spread by the number of URLs seen so far and rounded down to a full minute:
                $schTime = $scheduledTime + round(count($duplicateTrack) * $secondsPerRequest);
                $schTime = intval($schTime / 60) * 60;
                $formattedDate = BackendUtility::datetime($schTime);
                $this->urlList[] = '[' . $formattedDate . '] ' . $url;
                $urlList = '[' . $formattedDate . '] ' . htmlspecialchars($url);

                // Submit for crawling!
                if ($submitCrawlUrls) {
                    // NOTE(review): the queue receives $scheduledTime while the displayed time is the
                    // interleaved $schTime - confirm whether that difference is intended.
                    $added = $this->addUrl(
                        $pageId,
                        $url,
                        $vv['subCfg'],
                        $scheduledTime,
                        $configurationHash,
                        $skipInnerCheck
                    );
                    if ($added === false) {
                        $urlList .= ' (URL already existed)';
                    }
                } elseif ($downloadCrawlUrls) {
                    $downloadUrls[$url] = $url;
                }
                $urlLog[] = $urlList;
            }
            $duplicateTrack[$uKey] = true;
        }

        return implode('<br>', $urlLog);
    }
524
525
    /**
     * Returns true if one of the incoming processing instructions is contained
     * in the given list, or if no incoming instructions were given at all.
     *
     * @param string $piString Comma-separated list of processing instructions to test against
     * @param array $incomingProcInstructions Processing instructions
     * @return bool
     */
    public function drawURLs_PIfilter($piString, array $incomingProcInstructions)
    {
        // Nothing to filter by: every configuration passes
        if ($incomingProcInstructions === []) {
            return true;
        }

        foreach ($incomingProcInstructions as $instruction) {
            if (GeneralUtility::inList($piString, $instruction)) {
                return true;
            }
        }

        return false;
    }
545
546 3
    /**
     * Loads the page TSconfig for a page and lets hooks alter it.
     *
     * When $this->MP is set, the TSconfig is loaded for the id found after the
     * '-' in $this->MP instead of for $id.
     *
     * @param int $id Page uid
     * @return array Page TSconfig, possibly altered by EXTCONF['crawler']['getPageTSconfigForId'] hooks
     */
    public function getPageTSconfigForId($id): array
    {
        if (!$this->MP) {
            $pageTSconfig = BackendUtility::getPagesTSconfig($id);
        } else {
            // $this->MP is annotated as bool but is split like a "<x>-<y>" string here.
            // Cast explicitly (strict_types) and tolerate a value without '-'.
            // NOTE(review): confirm the real type/format of $this->MP.
            $mountPointParts = explode('-', (string)$this->MP);
            $mountPointId = (int)($mountPointParts[1] ?? 0);
            $pageTSconfig = BackendUtility::getPagesTSconfig($mountPointId);
        }

        // Call a hook to alter configuration
        $hooks = $GLOBALS['TYPO3_CONF_VARS']['EXTCONF']['crawler']['getPageTSconfigForId'] ?? null;
        if (is_array($hooks)) {
            $params = [
                'pageId' => $id,
                'pageTSConfig' => &$pageTSconfig,
            ];
            foreach ($hooks as $userFunc) {
                GeneralUtility::callUserFunction($userFunc, $params, $this);
            }
        }

        return $pageTSconfig;
    }
568
569
    /**
     * This method returns an array of configurations.
     * Adds no urls!
     *
     * Configurations come from the page TSconfig paramSets first, then from the
     * crawler configuration records returned by the configuration repository for
     * the page's rootline (without overwriting a paramSet of the same name).
     * Finally the EXTCONF['crawler']['processUrls'] hooks may alter the result.
     *
     * @param int $pageId Page uid
     * @return array Configurations keyed by configuration name
     */
    public function getUrlsForPageId(int $pageId): array
    {
        $res = $this->collectPageTsConfigurationsForPage($pageId);
        $res = $this->addRecordConfigurationsForPage($res, $pageId);

        foreach ($GLOBALS['TYPO3_CONF_VARS']['EXTCONF']['crawler']['processUrls'] ?? [] as $func) {
            $params = [
                'res' => &$res,
            ];
            GeneralUtility::callUserFunction($func, $params, $this);
        }

        return $res;
    }

    /**
     * Builds the configurations defined as paramSets in the page TSconfig of the given page.
     *
     * @param int $pageId Page uid
     * @return array Configurations keyed by paramSet name
     */
    private function collectPageTsConfigurationsForPage(int $pageId): array
    {
        $res = [];
        $pageTSconfig = $this->getPageTSconfigForId($pageId);
        $crawlerCfg = $pageTSconfig['tx_crawler.']['crawlerCfg.']['paramSets.'] ?? [];

        foreach ($crawlerCfg as $key => $values) {
            // Only the "<name>." sub arrays describe a configuration
            if (!is_array($values)) {
                continue;
            }
            $key = str_replace('.', '', (string)$key);

            // Sub configuration for a single configuration string:
            $subCfg = $values;
            $subCfg['key'] = $key;

            // Normalise the filter list; a missing filter is left untouched
            $procInstrFilter = (string)($subCfg['procInstrFilter'] ?? '');
            if ($procInstrFilter !== '') {
                $subCfg['procInstrFilter'] = implode(',', GeneralUtility::trimExplode(',', $procInstrFilter));
            }

            // process configuration if it is not page-specific or if the specific page is the current page:
            if (!$this->isConfigurationApplicableToPage($subCfg['pidsOnly'] ?? '', $pageId)) {
                continue;
            }

            $paramParsed = GeneralUtility::explodeUrl2Array($crawlerCfg[$key] ?? '');
            $paramExpanded = $this->expandParameters($paramParsed, $pageId);

            // recognize MP value
            $urlBase = '?id=' . $pageId;
            if ($this->MP) {
                $urlBase .= '&MP=' . $this->MP;
            }

            $res[$key] = [
                'subCfg' => $subCfg,
                'paramParsed' => $paramParsed,
                'paramExpanded' => $paramExpanded,
                'origin' => 'pagets',
                'URLs' => $this->compileUrls($paramExpanded, [$urlBase]),
            ];
        }

        return $res;
    }

    /**
     * Adds the configurations from crawler configuration records for the given page.
     *
     * @param array $res Configurations collected so far; existing keys are never overwritten
     * @param int $pageId Page uid
     * @return array $res extended by the accessible, applicable and non-excluded record configurations
     */
    private function addRecordConfigurationsForPage(array $res, int $pageId): array
    {
        // Get configuration from tx_crawler_configuration records up the rootline
        $crawlerConfigurations = $this->configurationRepository->getCrawlerConfigurationRecordsFromRootLine($pageId);

        foreach ($crawlerConfigurations as $configurationRecord) {
            // check access to the configuration record
            $hasAccess = empty($configurationRecord['begroups'])
                || $this->getBackendUser()->isAdmin()
                || $this->hasGroupAccess(
                    $this->getBackendUser()->user['usergroup_cached_list'],
                    $configurationRecord['begroups']
                );
            if (!$hasAccess) {
                continue;
            }

            // process configuration if it is not page-specific or if the specific page is the current page:
            if (!$this->isConfigurationApplicableToPage($configurationRecord['pidsonly'] ?? '', $pageId)) {
                continue;
            }

            $key = $configurationRecord['name'];

            // don't overwrite previously defined paramSets
            if (isset($res[$key])) {
                continue;
            }

            /* @var $TSparserObject \TYPO3\CMS\Core\TypoScript\Parser\TypoScriptParser */
            $TSparserObject = GeneralUtility::makeInstance(TypoScriptParser::class);
            $TSparserObject->parse($configurationRecord['processing_instruction_parameters_ts']);

            $subCfg = [
                'procInstrFilter' => $configurationRecord['processing_instruction_filter'],
                'procInstrParams.' => $TSparserObject->setup,
                'baseUrl' => $configurationRecord['base_url'],
                'force_ssl' => (int)$configurationRecord['force_ssl'],
                'userGroups' => $configurationRecord['fegroups'],
                'exclude' => $configurationRecord['exclude'],
                'key' => $key,
            ];

            // NOTE(review): loose comparison kept because the element types returned by
            // expandExcludeString() are not visible from here.
            if (in_array($pageId, $this->expandExcludeString($subCfg['exclude']))) {
                continue;
            }

            $paramParsed = GeneralUtility::explodeUrl2Array($configurationRecord['configuration']);
            $paramExpanded = $this->expandParameters($paramParsed, $pageId);

            $res[$key] = [
                'subCfg' => $subCfg,
                'paramParsed' => $paramParsed,
                'paramExpanded' => $paramExpanded,
                'URLs' => $this->compileUrls($paramExpanded, ['?id=' . $pageId]),
                'origin' => 'tx_crawler_configuration_' . $configurationRecord['uid'],
            ];
        }

        return $res;
    }

    /**
     * Tells whether a configuration restricted by a "pids only" list applies to the given page.
     *
     * @param mixed $pidsOnly Comma-separated list of page uids; an empty value means "not page-specific"
     * @param int $pageId Page uid
     * @return bool true if the list is empty or contains the page uid
     */
    private function isConfigurationApplicableToPage($pidsOnly, int $pageId): bool
    {
        // Explicit cast: database/TSconfig values may be null, which string functions reject under strict_types
        $pidsOnly = (string)$pidsOnly;
        if ($pidsOnly === '') {
            return true;
        }

        // TODO: Check if $pidOnlyList can be kept as Array instead of imploded
        $pidOnlyList = implode(',', GeneralUtility::trimExplode(',', $pidsOnly, true));

        return (bool)GeneralUtility::inList($pidOnlyList, (string)$pageId);
    }
667
668
    /**
     * Find all configurations of subpages of a page
     *
     * Collects the paramSet names from the page TSconfig of $rootid plus the names
     * of all tx_crawler_configuration records stored on the rootline of $rootid
     * or on pages of the tree below it (down to $depth levels).
     *
     * TODO: Write Functional Tests
     *
     * @param int $rootid Uid of the page the branch starts at
     * @param int $depth Depth passed to the page tree
     * @return array List of configuration names
     */
    public function getConfigurationsForBranch(int $rootid, int $depth): array
    {
        $configurationsForBranch = [];

        // 1) paramSets from page TSconfig: only "<name>." sub arrays count, the trailing dot is stripped
        $pageTSconfig = $this->getPageTSconfigForId($rootid);
        foreach ($pageTSconfig['tx_crawler.']['crawlerCfg.']['paramSets.'] ?? [] as $setName => $setValue) {
            if (is_array($setValue)) {
                $configurationsForBranch[] = substr($setName, -1) == '.' ? substr($setName, 0, -1) : $setName;
            }
        }

        // 2) page uids of the rootline ...
        $pids = [];
        foreach (BackendUtility::BEgetRootLine($rootid) as $rootLinePage) {
            $pids[] = $rootLinePage['uid'];
        }

        // ... and of the pages below the root page that the backend user may see
        /* @var PageTreeView $tree */
        $tree = GeneralUtility::makeInstance(PageTreeView::class);
        $perms_clause = $this->getBackendUser()->getPagePermsClause(Permission::PAGE_SHOW);
        $tree->init(empty($perms_clause) ? '' : ('AND ' . $perms_clause));
        $tree->getTree($rootid, $depth, '');
        foreach ($tree->tree as $treeNode) {
            $pids[] = $treeNode['row']['uid'];
        }

        // 3) names of the configuration records stored on any of these pages
        $queryBuilder = $this->getQueryBuilder('tx_crawler_configuration');
        $pidConstraint = $queryBuilder->expr()->in(
            'pid',
            $queryBuilder->createNamedParameter($pids, Connection::PARAM_INT_ARRAY)
        );
        $statement = $queryBuilder
            ->select('name')
            ->from('tx_crawler_configuration')
            ->where($pidConstraint)
            ->execute();

        while ($row = $statement->fetch()) {
            $configurationsForBranch[] = $row['name'];
        }

        return $configurationsForBranch;
    }
711
712
    /**
     * Get querybuilder for given table
     *
     * @param string $table Table name
     * @return \TYPO3\CMS\Core\Database\Query\QueryBuilder
     */
    private function getQueryBuilder(string $table)
    {
        $connectionPool = GeneralUtility::makeInstance(ConnectionPool::class);

        return $connectionPool->getQueryBuilderForTable($table);
    }
722
723
    /**
     * Check if a user has access to an item
     * (e.g. get the group list of the current logged in user from $GLOBALS['TSFE']->gr_list)
     *
     * @param string $groupList Comma-separated list of (fe_)group UIDs from a user
     * @param string $accessList Comma-separated list of (fe_)group UIDs of the item to access
     * @return bool TRUE if at least one of the users group UIDs is in the access list or the access list is empty
     * @see \TYPO3\CMS\Frontend\Page\PageRepository::getMultipleGroupsWhereClause()
     */
    public function hasGroupAccess($groupList, $accessList)
    {
        // An item without access restriction is accessible for everybody
        if (empty($accessList)) {
            return true;
        }

        $userGroupUids = GeneralUtility::intExplode(',', $groupList);
        foreach ($userGroupUids as $userGroupUid) {
            if (GeneralUtility::inList($accessList, $userGroupUid)) {
                return true;
            }
        }

        return false;
    }
744
745
    /**
     * Expands the parameter configuration into individual values.
     *
     * Syntax of each value:
     * - A value wrapped in [...] is expanded according to the rules below; any other value is taken literally.
     * - The content of the brackets is split by "|"; every part is processed on its own and the results are combined.
     * - For each part:
     *         - "[int]-[int]" = Integer range, expanded to all values in between (both ends included, low to high,
     *           at most 1000 values). Example "1-34" or "-40--30"
     *         - "_TABLE:[TCA table name];[_PID:[page id, default is current page]];[_ENABLELANG:1]" = Look up of
     *           table records by PID, filtering out deleted records. Example "_TABLE:tt_content; _PID:123"
     *           Further sub parameters read by this method: _RECURSIVE (tree depth), _PIDFIELD (default "pid"),
     *           _FIELD (default "uid"), _WHERE (additional where clause) and _ADDTABLE (additional from part).
     *           _ENABLELANG:1 picks only original records without their language overlays
     *         - Default: Literal value
     *
     * After every part the hook registered in
     * $GLOBALS['TYPO3_CONF_VARS']['SC_OPTIONS']['crawler/class.tx_crawler_lib.php']['expandParameters'] is called.
     *
     * @param array $paramArray Array with key (GET var name) and values (value of GET var which is configuration for expansion)
     * @param integer $pid Current page ID
     * @return array The input array where every value has been replaced by the array of its expanded values
     *
     * TODO: Write Functional Tests
     */
    public function expandParameters($paramArray, $pid)
    {
        // Traverse parameter names:
        foreach ($paramArray as $p => $v) {
            $v = trim($v);

            // Not encapsulated in square brackets: the literal value is the only value.
            if (strpos($v, '[') !== 0 || substr($v, -1) !== ']') {
                $paramArray[$p] = [$v];
                continue;
            }

            // Find the value inside the brackets and reset the paramArray value as an array.
            $v = substr($v, 1, -1);
            $paramArray[$p] = [];

            // Explode parts and traverse them:
            foreach (explode('|', $v) as $pV) {
                // Look for integer range: (fx. 1-34 or -40--30 // reads minus 40 to minus 30)
                if (preg_match('/^(-?[0-9]+)\s*-\s*(-?[0-9]+)$/', trim($pV), $reg)) {
                    $reg = $this->swapIfFirstIsLargerThanSecond($reg);

                    // Traverse range, add values:
                    $runAwayBrake = 1000; // Limit to size of range!
                    for ($a = $reg[1]; $a <= $reg[2]; $a++) {
                        $paramArray[$p][] = $a;
                        $runAwayBrake--;
                        if ($runAwayBrake <= 0) {
                            break;
                        }
                    }
                } elseif (strpos(trim($pV), '_TABLE:') === 0) {
                    // Parse the "key:value" sub parameters, which are separated by ";":
                    $subpartParams = [];
                    foreach (GeneralUtility::trimExplode(';', $pV) as $spV) {
                        // Pad the result so a sub part without ":" yields a null value instead of an undefined offset.
                        [$pKey, $pVal] = array_pad(GeneralUtility::trimExplode(':', $spV), 2, null);
                        $subpartParams[$pKey] = $pVal;
                    }
                    $table = $subpartParams['_TABLE'];

                    // Table exists:
                    if (isset($GLOBALS['TCA'][$table])) {
                        $lookUpPid = isset($subpartParams['_PID']) ? (int)$subpartParams['_PID'] : (int)$pid;
                        $recursiveDepth = isset($subpartParams['_RECURSIVE']) ? (int)$subpartParams['_RECURSIVE'] : 0;
                        $pidField = isset($subpartParams['_PIDFIELD']) ? trim($subpartParams['_PIDFIELD']) : 'pid';
                        $where = $subpartParams['_WHERE'] ?? '';
                        $addTable = $subpartParams['_ADDTABLE'] ?? '';

                        // Optional keys are checked with empty() so that a missing key does not raise a notice.
                        $fieldName = !empty($subpartParams['_FIELD']) ? $subpartParams['_FIELD'] : 'uid';
                        if ($fieldName === 'uid' || !empty($GLOBALS['TCA'][$table]['columns'][$fieldName])) {
                            $queryBuilder = $this->getQueryBuilder($table);

                            if ($recursiveDepth > 0) {
                                /** @var \TYPO3\CMS\Core\Database\QueryGenerator $queryGenerator */
                                $queryGenerator = GeneralUtility::makeInstance(\TYPO3\CMS\Core\Database\QueryGenerator::class);
                                $pidList = $queryGenerator->getTreeList($lookUpPid, $recursiveDepth, 0, 1);
                                $pidArray = GeneralUtility::intExplode(',', $pidList);
                            } else {
                                $pidArray = [(string)$lookUpPid];
                            }

                            // Only deleted records are filtered out, all other default restrictions are dropped.
                            $queryBuilder->getRestrictions()
                                ->removeAll()
                                ->add(GeneralUtility::makeInstance(DeletedRestriction::class));

                            $queryBuilder
                                ->select($fieldName)
                                ->from($table)
                                ->where(
                                    $queryBuilder->expr()->in($pidField, $queryBuilder->createNamedParameter($pidArray, Connection::PARAM_INT_ARRAY)),
                                    $where
                                );

                            if (!empty($addTable)) {
                                // TODO: Check if this works as intended!
                                $queryBuilder->add('from', $addTable);
                            }

                            $transOrigPointerField = $GLOBALS['TCA'][$table]['ctrl']['transOrigPointerField'] ?? null;
                            if (!empty($subpartParams['_ENABLELANG']) && $transOrigPointerField) {
                                $queryBuilder->andWhere(
                                    $queryBuilder->expr()->lte(
                                        $transOrigPointerField,
                                        0
                                    )
                                );
                            }

                            $statement = $queryBuilder->execute();

                            // Collect the field values as array keys so that duplicates collapse.
                            $values = [];
                            while ($row = $statement->fetch()) {
                                $values[$row[$fieldName]] = true;
                            }
                            $paramArray[$p] = array_merge($paramArray[$p], array_keys($values));
                        }
                    }
                } else {
                    // Just add value:
                    $paramArray[$p][] = $pV;
                }

                // Hook for processing own expandParameters place holder
                $hooks = $GLOBALS['TYPO3_CONF_VARS']['SC_OPTIONS']['crawler/class.tx_crawler_lib.php']['expandParameters'] ?? null;
                if (is_array($hooks)) {
                    $_params = [
                        'pObj' => &$this,
                        'paramArray' => &$paramArray,
                        'currentKey' => $p,
                        'currentValue' => $pV,
                        'pid' => $pid,
                    ];
                    foreach ($hooks as $_funcRef) {
                        GeneralUtility::callUserFunction($_funcRef, $_params, $this);
                    }
                }
            }

            // Make the set of values unique; note that ksort() orders the whole parameter array by parameter name.
            $paramArray[$p] = array_unique($paramArray[$p]);
            ksort($paramArray);
        }

        return $paramArray;
    }
890
891
    /**
     * Compiling URLs from parameter array (output of expandParameters())
     * The number of URLs is the product of the number of parameter values for each key,
     * limited by $this->maximumUrlsToCompile per recursion level.
     *
     * The method consumes one parameter per call and recurses until the parameter array is empty.
     *
     * @param array $paramArray Output of expandParameters(): Array with keys (GET var names) and for each an array of values
     * @param array $urls URLs accumulated in this array (for recursion)
     * @return array
     */
    public function compileUrls($paramArray, array $urls)
    {
        if (empty($paramArray)) {
            return $urls;
        }

        // Shift the first parameter off the stack; the key has to be read before array_shift() drops it.
        reset($paramArray);
        // Cast: numeric GET var names are integer keys, which rawurlencode() rejects under strict types.
        $varName = (string)key($paramArray);
        $valueSet = array_shift($paramArray);

        // Traverse value set:
        $newUrls = [];
        foreach ($urls as $url) {
            foreach ($valueSet as $val) {
                // An empty value leaves the URL untouched instead of appending "name=".
                $newUrls[] = $url . (strcmp((string)$val, '') ? '&' . rawurlencode($varName) . '=' . rawurlencode((string)$val) : '');

                if (count($newUrls) > $this->maximumUrlsToCompile) {
                    // Limit reached: leave BOTH loops, otherwise every remaining base URL would still add an entry.
                    break 2;
                }
            }
        }

        return $this->compileUrls($paramArray, $newUrls);
    }
922
923
    /************************************
924
     *
925
     * Crawler log
926
     *
927
     ************************************/
928
929
    /**
     * Returns the crawler queue records of a page, or flushes queue entries instead.
     *
     * @param integer $id Page ID for which to look up log entries.
     * @param string $filter Filter: "all" => all entries, "pending" => all that is not yet run, "finished" => all complete ones
     * @param boolean $doFlush If TRUE, the entries are DELETED(!) instead of being returned; the page ID is ignored then
     * @param boolean $doFullFlush If TRUE (and $doFlush is set), the flush is done without the filter restriction
     * @param integer $itemsPerPage Limit the amount of entries per page default is 10
     * @return array The matching queue records, or an empty array after a flush
     */
    public function getLogEntriesForPageId($id, $filter = '', $doFlush = false, $doFullFlush = false, $itemsPerPage = 10)
    {
        $selectQuery = GeneralUtility::makeInstance(ConnectionPool::class)->getQueryBuilderForTable($this->tableName);
        $pageConstraint = $selectQuery->expr()->eq(
            'page_id',
            $selectQuery->createNamedParameter($id, \PDO::PARAM_INT)
        );
        $selectQuery
            ->select('*')
            ->from($this->tableName)
            ->where($pageConstraint)
            ->orderBy('scheduled', 'DESC');

        $expressions = GeneralUtility::makeInstance(ConnectionPool::class)
            ->getConnectionForTable($this->tableName)
            ->getExpressionBuilder();
        $flushConstraints = $expressions->andX();

        // The flush condition is a plain SQL string; the composite expression is cast to string when it is appended.
        $flushCondition = '1=1';
        switch ($filter) {
            case 'pending':
                $selectQuery->andWhere($selectQuery->expr()->eq('exec_time', 0));
                $flushCondition .= ' AND ' . $flushConstraints->add($expressions->eq('exec_time', 0));
                break;
            case 'finished':
                $selectQuery->andWhere($selectQuery->expr()->gt('exec_time', 0));
                $flushCondition .= ' AND ' . $flushConstraints->add($expressions->gt('exec_time', 0));
                break;
            // "all" (and any other value) adds no restriction.
        }

        // FIXME: Write unit test that ensures that the right records are deleted.
        if ($doFlush) {
            // The page ID is currently ignored when flushing,
            // so that only the pending / finished filter applies.
            // 2020.04.11 - Tomas Mikkelsen
            $this->flushQueue($doFullFlush ? '1=1' : $flushCondition);
            return [];
        }

        if ($itemsPerPage > 0) {
            $selectQuery->setMaxResults((int)$itemsPerPage);
        }

        return $selectQuery->execute()->fetchAll();
    }
989
990
    /**
     * Returns the crawler queue records of a set, or flushes queue entries instead.
     *
     * @param int $set_id Set ID for which to look up log entries.
     * @param string $filter Filter: "all" => all entries, "pending" => all that is not yet run, "finished" => all complete ones
     * @param bool $doFlush If TRUE, the entries are DELETED(!) instead of being returned
     * @param bool $doFullFlush If TRUE (and $doFlush is set), an empty condition is handed to flushQueue(),
     *                          i.e. neither the filter nor the set ID restrict the flush
     * @param int $itemsPerPage Limit the amount of entries per page default is 10
     * @return array The matching queue records, or an empty array after a flush
     */
    public function getLogEntriesForSetId(int $set_id, string $filter = '', bool $doFlush = false, bool $doFullFlush = false, int $itemsPerPage = 10)
    {
        $queryBuilder = GeneralUtility::makeInstance(ConnectionPool::class)->getQueryBuilderForTable($this->tableName);
        $queryBuilder
            ->select('*')
            ->from($this->tableName)
            ->where(
                $queryBuilder->expr()->eq('set_id', $queryBuilder->createNamedParameter($set_id, \PDO::PARAM_INT))
            )
            ->orderBy('scheduled', 'DESC');

        $expressionBuilder = GeneralUtility::makeInstance(ConnectionPool::class)
            ->getConnectionForTable($this->tableName)
            ->getExpressionBuilder();
        // Collects the constraints for a possible flush; add() is called for its side effect on this
        // composite, so that filter and set ID end up AND-combined.
        $flushConstraints = $expressionBuilder->andX();

        // FIXME: Write Unit tests for Filters
        switch ($filter) {
            case 'pending':
                $queryBuilder->andWhere($queryBuilder->expr()->eq('exec_time', 0));
                $flushConstraints->add($expressionBuilder->eq('exec_time', 0));
                break;
            case 'finished':
                $queryBuilder->andWhere($queryBuilder->expr()->gt('exec_time', 0));
                $flushConstraints->add($expressionBuilder->gt('exec_time', 0));
                break;
        }

        // FIXME: Write unit test that ensures that the right records are deleted.
        if ($doFlush) {
            $flushCondition = $flushConstraints->add($expressionBuilder->eq('set_id', $set_id));
            $this->flushQueue($doFullFlush ? '' : $flushCondition);
            return [];
        }

        if ($itemsPerPage > 0) {
            $queryBuilder->setMaxResults($itemsPerPage);
        }

        return $queryBuilder->execute()->fetchAll();
    }
1043
1044
    /**
     * Removes queue entries.
     *
     * If an observer is registered for the "queueEntryFlush" event, the event is posted once per
     * affected set ID (together with the entries of that set) before the entries are deleted.
     *
     * @param string $where SQL filter (string or string-castable expression) for the entries which should
     *                      be removed; an empty filter removes ALL entries
     * @return void
     */
    protected function flushQueue($where = ''): void
    {
        $realWhere = strlen((string)$where) > 0 ? $where : '1=1';

        if (EventDispatcher::getInstance()->hasObserver('queueEntryFlush')) {
            // Every statement gets its own query builder, so no select/from/where state or bound
            // parameter leaks from one statement into the next.
            $groups = $this->getQueryBuilder($this->tableName)
                // selectLiteral(): select() would quote "DISTINCT set_id" as one single identifier.
                ->selectLiteral('DISTINCT set_id')
                ->from($this->tableName)
                ->where($realWhere)
                ->execute()
                ->fetchAll();

            foreach ($groups as $group) {
                $subSetQueryBuilder = $this->getQueryBuilder($this->tableName);
                // NOTE(review): other methods of this class address queue records by "qid";
                // confirm that the queue table really has a "uid" column.
                $subSet = $subSetQueryBuilder
                    ->select('uid', 'set_id')
                    ->from($this->tableName)
                    ->where(
                        $realWhere,
                        $subSetQueryBuilder->expr()->eq(
                            'set_id',
                            $subSetQueryBuilder->createNamedParameter((int)$group['set_id'], \PDO::PARAM_INT)
                        )
                    )
                    ->execute()
                    ->fetchAll();
                EventDispatcher::getInstance()->post('queueEntryFlush', $group['set_id'], $subSet);
            }
        }

        $this->getQueryBuilder($this->tableName)
            ->delete($this->tableName)
            ->where($realWhere)
            ->execute();
    }
1084
1085
    /**
     * Adds a call back entry to the queue (typically called from hooks, see indexed search class "class.crawler.php")
     *
     * @param integer $setId Set ID
     * @param array $params Parameters to pass to call back function; anything but an array is replaced by an empty array
     * @param string $callBack Call back object reference, eg. 'EXT:indexed_search/class.crawler.php:&tx_indexedsearch_crawler'
     * @param integer $page_id Page ID to attach it to
     * @param integer $schedule Time at which to activate; 0 means the current time
     * @return void
     */
    public function addQueueEntry_callBack($setId, $params, $callBack, $page_id = 0, $schedule = 0): void
    {
        $callBackParameters = is_array($params) ? $params : [];
        $callBackParameters['_CALLBACKOBJ'] = $callBack;

        // Without an explicit schedule the entry is due immediately.
        $scheduledTime = (int)$schedule;
        if (!$scheduledTime) {
            $scheduledTime = $this->getCurrentTime();
        }

        $queueRecord = [
            'page_id' => (int)$page_id,
            'parameters' => serialize($callBackParameters),
            'scheduled' => $scheduledTime,
            'exec_time' => 0,
            'set_id' => (int)$setId,
            'result_data' => '',
        ];

        $queueConnection = GeneralUtility::makeInstance(ConnectionPool::class)->getConnectionForTable('tx_crawler_queue');
        $queueConnection->insert('tx_crawler_queue', $queueRecord);
    }
1115
1116
    /************************************
1117
     *
1118
     * URL setting
1119
     *
1120
     ************************************/
1121
1122
    /**
     * Setting a URL for crawling:
     *
     * Builds the queue record for the URL and either registers it internally only
     * ($this->registerQueueEntriesInternallyOnly) or inserts it into the queue table, unless an
     * equal entry is already queued. An "urlAddedToQueue" or "duplicateUrlInQueue" event is posted
     * for the database case.
     *
     * @param integer $id Page ID
     * @param string $url Complete URL
     * @param array $subCfg Sub configuration array (from TS config); the keys "userGroups", "procInstrFilter",
     *                      "procInstrParams." and "key" are read and may be missing
     * @param integer $tstamp Scheduled-time
     * @param string $configurationHash (optional) configuration hash
     * @param bool $skipInnerDuplicationCheck (optional) skip inner duplication check
     * @return bool TRUE only if a new record has been inserted into the queue table
     */
    public function addUrl(
        $id,
        $url,
        array $subCfg,
        $tstamp,
        $configurationHash = '',
        $skipInnerDuplicationCheck = false
    ) {
        $urlAdded = false;
        $rows = [];

        // Creating parameters:
        $parameters = [
            'url' => $url,
        ];

        // fe user group simulation:
        $uGs = implode(',', array_unique(GeneralUtility::intExplode(',', $subCfg['userGroups'] ?? '', true)));
        if ($uGs) {
            $parameters['feUserGroupList'] = $uGs;
        }

        // Setting processing instructions
        $parameters['procInstructions'] = GeneralUtility::trimExplode(',', $subCfg['procInstrFilter'] ?? '');
        if (is_array($subCfg['procInstrParams.'] ?? null)) {
            $parameters['procInstrParams'] = $subCfg['procInstrParams.'];
        }

        // Compile value array:
        $parameters_serialized = serialize($parameters);
        $fieldArray = [
            'page_id' => (int)$id,
            'parameters' => $parameters_serialized,
            'parameters_hash' => GeneralUtility::shortMD5($parameters_serialized),
            'configuration_hash' => $configurationHash,
            'scheduled' => $tstamp,
            'exec_time' => 0,
            'set_id' => (int)$this->setID,
            'result_data' => '',
            'configuration' => $subCfg['key'] ?? null,
        ];

        if ($this->registerQueueEntriesInternallyOnly) {
            // The entries will only be registered and not stored to the database
            $this->queueEntries[] = $fieldArray;
            return $urlAdded;
        }

        if (!$skipInnerDuplicationCheck) {
            // Check if there is already an equal entry
            $rows = $this->getDuplicateRowsIfExist($tstamp, $fieldArray);
        }

        if (empty($rows)) {
            $connectionForCrawlerQueue = GeneralUtility::makeInstance(ConnectionPool::class)->getConnectionForTable('tx_crawler_queue');
            $connectionForCrawlerQueue->insert(
                'tx_crawler_queue',
                $fieldArray
            );
            $uid = $connectionForCrawlerQueue->lastInsertId('tx_crawler_queue', 'qid');
            $rows[] = $uid;
            $urlAdded = true;
            EventDispatcher::getInstance()->post('urlAddedToQueue', strval($this->setID), ['uid' => $uid, 'fieldArray' => $fieldArray]);
        } else {
            EventDispatcher::getInstance()->post('duplicateUrlInQueue', strval($this->setID), ['rows' => $rows, 'fieldArray' => $fieldArray]);
        }

        return $urlAdded;
    }
1201
1202
    /**
     * Looks up queue entries that duplicate the given one, i.e. unprocessed entries for the
     * same page and the same parameters hash.
     *
     * For a timestamp that is not in the future every such entry scheduled up to now counts
     * (with the "enableTimeslot" extension setting also those scheduled within 100 seconds around now).
     * For a timestamp in the future only entries with exactly the same schedule time count.
     *
     * @param int $tstamp
     * @param array $fieldArray Queue record; "page_id" and "parameters_hash" are used for the comparison
     *
     * @return array List of the qid values of the duplicates, empty if there is none
     */
    protected function getDuplicateRowsIfExist($tstamp, $fieldArray)
    {
        $now = $this->getCurrentTime();

        $duplicateQuery = GeneralUtility::makeInstance(ConnectionPool::class)->getQueryBuilderForTable($this->tableName);
        $duplicateQuery
            ->select('qid')
            ->from('tx_crawler_queue');

        if ($tstamp <= $now) {
            // The entry is scheduled with "now" (or in the past)
            $scheduledUpToNow = $duplicateQuery->expr()->lte('scheduled', $now);
            if ($this->extensionSettings['enableTimeslot']) {
                $windowStart = $now - 100;
                $windowEnd = $now + 100;
                $duplicateQuery
                    ->where('scheduled BETWEEN ' . $windowStart . ' AND ' . $windowEnd)
                    ->orWhere($scheduledUpToNow);
            } else {
                $duplicateQuery->where($scheduledUpToNow);
            }
        } elseif ($tstamp > $now) {
            // An entry with a timestamp in the future needs to have the same schedule time
            $duplicateQuery->where($duplicateQuery->expr()->eq('scheduled', $tstamp));
        }

        // Only entries which are neither executed nor assigned to a process are of interest.
        $samePage = $duplicateQuery->expr()->eq(
            'page_id',
            $duplicateQuery->createNamedParameter($fieldArray['page_id'], \PDO::PARAM_INT)
        );
        $sameParameters = $duplicateQuery->expr()->eq(
            'parameters_hash',
            $duplicateQuery->createNamedParameter($fieldArray['parameters_hash'], \PDO::PARAM_STR)
        );
        $duplicateQuery->andWhere('NOT exec_time');
        $duplicateQuery->andWhere('NOT process_id');
        $duplicateQuery->andWhere($samePage);
        $duplicateQuery->andWhere($sameParameters);

        $statement = $duplicateQuery->execute();

        $duplicateIds = [];
        while ($record = $statement->fetch()) {
            $duplicateIds[] = $record['qid'];
        }

        return $duplicateIds;
    }
1262
1263
    /**
     * Provides the current system time as Unix timestamp.
     *
     * @return int
     */
    public function getCurrentTime()
    {
        $now = time();

        return $now;
    }
1272
1273
    /************************************
1274
     *
1275
     * URL reading
1276
     *
1277
     ************************************/
1278
1279
    /**
     * Read URL for single queue entry
     *
     * Locks the queue entry by setting its exec_time, executes it, evaluates the "pollSuccess"
     * instructions registered in $GLOBALS['TYPO3_CONF_VARS']['EXTCONF']['crawler'] and stores the
     * serialized result in the queue entry.
     *
     * @param integer $queueId
     * @param boolean $force If set, will process even if exec_time has been set!
     * @return integer|null Bit mask of CLI status flags (0 if none applies), or NULL if no matching queue entry was found
     */
    public function readUrl($queueId, $force = false)
    {
        $queryBuilder = GeneralUtility::makeInstance(ConnectionPool::class)->getQueryBuilderForTable($this->tableName);
        $ret = 0;
        $this->logger->debug('crawler-readurl start ' . microtime(true));

        // Get entry:
        $queryBuilder
            ->select('*')
            ->from('tx_crawler_queue')
            ->where(
                $queryBuilder->expr()->eq('qid', $queryBuilder->createNamedParameter($queueId, \PDO::PARAM_INT))
            );
        if (!$force) {
            $queryBuilder
                ->andWhere('exec_time = 0')
                ->andWhere('process_scheduled > 0');
        }
        $queueRec = $queryBuilder->execute()->fetch();

        if (!is_array($queueRec)) {
            return null;
        }

        SignalSlotUtility::emitSignal(
            __CLASS__,
            SignalSlotUtility::SIGNAL_QUEUEITEM_PREPROCESS,
            [$queueId, &$queueRec]
        );

        // Set exec_time to lock record:
        $field_array = ['exec_time' => $this->getCurrentTime()];

        if (isset($this->processID)) {
            // If multiprocessing is used we need to store the id of the process which has handled this entry
            $field_array['process_id_completed'] = $this->processID;
        }

        GeneralUtility::makeInstance(ConnectionPool::class)->getConnectionForTable('tx_crawler_queue')
            ->update(
                'tx_crawler_queue',
                $field_array,
                ['qid' => (int)$queueId]
            );

        $result = $this->queueExecutor->executeQueueItem($queueRec, $this);

        // The executor may deliver something else than an array with string content; under strict
        // types unserialize() would throw a TypeError for it. Only arrays are read from the result
        // data, therefore no classes need to be instantiated while unserializing.
        $resultData = null;
        if (is_array($result) && is_string($result['content'] ?? null)) {
            $resultData = unserialize($result['content'], ['allowed_classes' => false]);
        }

        // atm there's no need to point to specific pollable extensions
        $pollables = $GLOBALS['TYPO3_CONF_VARS']['EXTCONF']['crawler']['pollSuccess'] ?? null;
        $procInstructions = is_array($resultData) ? ($resultData['parameters']['procInstructions'] ?? null) : null;
        if (is_array($pollables) && is_array($procInstructions)) {
            foreach ($pollables as $pollable) {
                // Only check the success value if the instruction is running;
                // it is important to name the pollSuccess key same as the procInstructions key
                if (in_array($pollable, $procInstructions) && !empty($resultData['success'][$pollable])) {
                    $ret |= self::CLI_STATUS_POLLABLE_PROCESSED;
                }
            }
        }

        // Set result in log which also denotes the end of the processing of this entry.
        $field_array = ['result_data' => serialize($result)];

        SignalSlotUtility::emitSignal(
            __CLASS__,
            SignalSlotUtility::SIGNAL_QUEUEITEM_POSTPROCESS,
            [$queueId, &$field_array]
        );

        GeneralUtility::makeInstance(ConnectionPool::class)->getConnectionForTable('tx_crawler_queue')
            ->update(
                'tx_crawler_queue',
                $field_array,
                ['qid' => (int)$queueId]
            );

        $this->logger->debug('crawler-readurl stop ' . microtime(true));
        return $ret;
    }
1370
1371
    /**
     * Read URL for not-yet-inserted log-entry
     *
     * The entry is inserted into the queue table (already locked via exec_time), executed, and
     * its serialized result is written back to the inserted record.
     *
     * @param array $field_array Queue field array,
     *
     * @return mixed The unmodified result of the queue executor
     */
    public function readUrlFromArray($field_array)
    {
        // Set exec_time to lock record:
        $field_array['exec_time'] = $this->getCurrentTime();

        $queueConnection = GeneralUtility::makeInstance(ConnectionPool::class)->getConnectionForTable($this->tableName);
        $queueConnection->insert($this->tableName, $field_array);

        // The executor receives the record including its new queue id.
        $queueId = $queueConnection->lastInsertId($this->tableName, 'qid');
        $field_array['qid'] = $queueId;

        $result = $this->queueExecutor->executeQueueItem($field_array, $this);

        // Set result in log which also denotes the end of the processing of this entry.
        $resultFields = ['result_data' => serialize($result)];

        SignalSlotUtility::emitSignal(
            __CLASS__,
            SignalSlotUtility::SIGNAL_QUEUEITEM_POSTPROCESS,
            [$queueId, &$resultFields]
        );

        $queueConnection->update($this->tableName, $resultFields, ['qid' => $queueId]);

        return $result;
    }
1408
1409
    /*****************************
1410
     *
1411
     * Compiling URLs to crawl - tools
1412
     *
1413
     *****************************/
1414
1415
    /**
     * Compiles the crawl URLs for a page and, depending on $depth, for the pages below it.
     *
     * The passed settings are stored on the controller, the page tree is collected with the
     * page permissions of the current backend user and every page row is handed to
     * drawURLs_addRowsForPage(). For mount point pages the mounted tree is processed as well.
     *
     * @param int $id Root page id to start from.
     * @param int $depth Depth of tree, 0=only id-page, 1= on sublevel, 99 = infinite
     * @param int $scheduledTime Unix Time when the URL is timed to be visited when put in queue
     * @param int $reqMinute Number of requests per minute (creates the interleave between requests)
     * @param bool $submitCrawlUrls If set, submits the URLs to queue in database (real crawling)
     * @param bool $downloadCrawlUrls If set (and submitcrawlUrls is false) will fill $downloadUrls with entries)
     * @param array $incomingProcInstructions Array of processing instructions
     * @param array $configurationSelection Array of configuration keys
     * @return string Concatenated output of drawURLs_addRowsForPage() for all visited pages
     */
    public function getPageTreeAndUrls(
        $id,
        $depth,
        $scheduledTime,
        $reqMinute,
        $submitCrawlUrls,
        $downloadCrawlUrls,
        array $incomingProcInstructions,
        array $configurationSelection
    ) {
        $this->scheduledTime = $scheduledTime;
        $this->reqMinute = $reqMinute;
        $this->submitCrawlUrls = $submitCrawlUrls;
        $this->downloadCrawlUrls = $downloadCrawlUrls;
        $this->incomingProcInstructions = $incomingProcInstructions;
        $this->incomingConfigurationSelection = $configurationSelection;

        $this->duplicateTrack = [];
        $this->downloadUrls = [];

        // Drawing tree:
        /* @var PageTreeView $tree */
        $tree = GeneralUtility::makeInstance(PageTreeView::class);
        $perms_clause = $this->getBackendUser()->getPagePermsClause(Permission::PAGE_SHOW);
        $tree->init('AND ' . $perms_clause);

        $pageInfo = BackendUtility::readPageAccess($id, $perms_clause);
        if (is_array($pageInfo)) {
            // Set root row:
            $tree->tree[] = [
                'row' => $pageInfo,
                'HTML' => $this->iconFactory->getIconForRecord('pages', $pageInfo, Icon::SIZE_SMALL),
            ];
        }

        // Get branch beneath:
        if ($depth) {
            $tree->getTree($id, $depth, '');
        }

        // Traverse page tree:
        $code = '';

        foreach ($tree->tree as $data) {
            $this->MP = false;

            // recognize mount points
            if ((int)$data['row']['doktype'] === PageRepository::DOKTYPE_MOUNTPOINT) {
                // The record is read from "pages", so the query builder has to be requested for
                // that table (it was requested for the queue table before).
                $queryBuilder = GeneralUtility::makeInstance(ConnectionPool::class)->getQueryBuilderForTable('pages');
                $queryBuilder->getRestrictions()->removeAll()->add(GeneralUtility::makeInstance(DeletedRestriction::class));
                $mountRow = $queryBuilder
                    ->select('*')
                    ->from('pages')
                    ->where(
                        $queryBuilder->expr()->eq('uid', $queryBuilder->createNamedParameter($data['row']['uid'], \PDO::PARAM_INT))
                    )
                    ->setMaxResults(1)
                    ->execute()
                    ->fetch();

                // Without a row there is nothing mounted; the page is then treated like a regular page.
                if (is_array($mountRow)) {
                    // fetch mounted pages
                    $this->MP = $mountRow['mount_pid'] . '-' . $data['row']['uid'];

                    $mountTree = GeneralUtility::makeInstance(PageTreeView::class);
                    $mountTree->init('AND ' . $perms_clause);
                    $mountTree->getTree($mountRow['mount_pid'], $depth);

                    foreach ($mountTree->tree as $mountData) {
                        $code .= $this->drawURLs_addRowsForPage(
                            $mountData['row'],
                            $mountData['HTML'] . BackendUtility::getRecordTitle('pages', $mountData['row'], true)
                        );
                    }

                    // replace page when mount_pid_ol is enabled
                    if ($mountRow['mount_pid_ol']) {
                        $data['row']['uid'] = $mountRow['mount_pid'];
                    } else {
                        // if the mount_pid_ol is not set the MP must not be used for the mountpoint page
                        $this->MP = false;
                    }
                }
            }

            $code .= $this->drawURLs_addRowsForPage(
                $data['row'],
                $data['HTML'] . BackendUtility::getRecordTitle('pages', $data['row'], true)
            );
        }

        return $code;
    }
1517
1518
    /**
     * Expands exclude string
     *
     * The exclude string is a comma separated list of entries in the form "pid" (only that
     * page), "pid+depth" (page plus subpages down to the given depth) or "pid+" (page plus
     * subpages with depth 99). Results are cached per exclude string for the current request.
     *
     * @param string $excludeString Exclude string
     * @return array Unique list of page ids covered by the exclude string
     */
    public function expandExcludeString($excludeString)
    {
        // internal static caches;
        static $expandedExcludeStringCache;
        static $treeCache;

        // isset() instead of empty(): an empty result is a valid cache entry, too
        if (!isset($expandedExcludeStringCache[$excludeString])) {
            $pidList = [];

            if (!empty($excludeString)) {
                /** @var PageTreeView $tree */
                $tree = GeneralUtility::makeInstance(PageTreeView::class);
                $tree->init('AND ' . $this->getBackendUser()->getPagePermsClause(Permission::PAGE_SHOW));

                foreach (GeneralUtility::trimExplode(',', $excludeString) as $excludePart) {
                    // Entries without "+" only yield one part; pad the list so that destructuring
                    // does not access an undefined offset.
                    [$pid, $depth] = array_pad(GeneralUtility::trimExplode('+', $excludePart), 2, '');

                    // default is "page only" = "depth=0"
                    if (empty($depth)) {
                        $depth = strpos($excludePart, '+') !== false ? 99 : 0;
                    }
                    // compare and cache by number, not by the raw string
                    $depth = (int)$depth;

                    $pidList[] = $pid;

                    if ($depth > 0) {
                        if (!isset($treeCache[$pid][$depth])) {
                            $tree->reset();
                            $tree->getTree($pid, $depth);
                            $treeCache[$pid][$depth] = $tree->tree;
                        }

                        foreach ($treeCache[$pid][$depth] as $data) {
                            $pidList[] = $data['row']['uid'];
                        }
                    }
                }
            }

            $expandedExcludeStringCache[$excludeString] = array_unique($pidList);
        }

        return $expandedExcludeStringCache[$excludeString];
    }
1569
1570
    /**
     * Create the rows for display of the page tree
     * For each page a number of rows are shown displaying GET variable configuration
     *
     * @param array $pageRow Page record the rows are created for
     * @param string $pageTitle Content of the title cell; it is inserted as is, so it has to be HTML already
     * @return string Table rows (<tr> elements) for the given page
     */
    public function drawURLs_addRowsForPage(array $pageRow, string $pageTitle): string
    {
        $skipMessage = '';

        // Get list of configurations
        $configurations = $this->getUrlsForPageRow($pageRow, $skipMessage);

        if (!empty($this->incomingConfigurationSelection)) {
            // remove configuration that does not match the current selection
            foreach ($configurations as $confKey => $confArray) {
                if (!in_array($confKey, $this->incomingConfigurationSelection)) {
                    unset($configurations[$confKey]);
                }
            }
        }

        // Traverse parameter combinations:
        $c = 0;
        $content = '';
        if (!empty($configurations)) {
            foreach ($configurations as $confKey => $confArray) {
                // Array keys may be integers; the cast keeps htmlspecialchars() working with strict types
                $confKeyLabel = htmlspecialchars((string)$confKey);
                // The sub configuration keys used below are optional
                $subCfg = $confArray['subCfg'] ?? [];

                // Title column (only the first row of a page carries it, spanning all rows):
                $titleClm = $c === 0
                    ? '<td rowspan="' . count($configurations) . '">' . $pageTitle . '</td>'
                    : '';

                if (!in_array($pageRow['uid'], $this->expandExcludeString($subCfg['exclude'] ?? ''))) {

                    // URL list:
                    $urlList = $this->urlListFromUrlArray(
                        $confArray,
                        $pageRow,
                        $this->scheduledTime,
                        $this->reqMinute,
                        $this->submitCrawlUrls,
                        $this->downloadCrawlUrls,
                        $this->duplicateTrack,
                        $this->downloadUrls,
                        $this->incomingProcInstructions // if empty the urls won't be filtered by processing instructions
                    );

                    // Expanded parameters:
                    $paramExpanded = '';
                    $calcAccu = [];
                    $calcRes = 1;
                    foreach ($confArray['paramExpanded'] as $gVar => $gVal) {
                        $paramExpanded .= '
                            <tr>
                                <td>' . htmlspecialchars('&' . $gVar . '=') . '<br/>' .
                            '(' . count($gVal) . ')' .
                            '</td>
                                <td nowrap="nowrap">' . nl2br(htmlspecialchars(implode(chr(10), $gVal))) . '</td>
                            </tr>
                        ';
                        $calcRes *= count($gVal);
                        $calcAccu[] = count($gVal);
                    }
                    $paramExpanded = '<table>' . $paramExpanded . '</table>';
                    $paramExpanded .= 'Comb: ' . implode('*', $calcAccu) . '=' . $calcRes;

                    // Options (escaped like the other configuration values in this row)
                    $optionValues = '';
                    if (!empty($subCfg['userGroups'])) {
                        $optionValues .= 'User Groups: ' . htmlspecialchars((string)$subCfg['userGroups']) . '<br/>';
                    }
                    if (!empty($subCfg['procInstrFilter'])) {
                        $optionValues .= 'ProcInstr: ' . htmlspecialchars((string)$subCfg['procInstrFilter']) . '<br/>';
                    }

                    // Compile row:
                    $content .= '
                        <tr>
                            ' . $titleClm . '
                            <td>' . $confKeyLabel . '</td>
                            <td>' . nl2br(htmlspecialchars(rawurldecode(trim(str_replace('&', chr(10) . '&', GeneralUtility::implodeArrayForUrl('', $confArray['paramParsed'])))))) . '</td>
                            <td>' . $paramExpanded . '</td>
                            <td nowrap="nowrap">' . $urlList . '</td>
                            <td nowrap="nowrap">' . $optionValues . '</td>
                            <td nowrap="nowrap">' . DebugUtility::viewArray($subCfg['procInstrParams.'] ?? null) . '</td>
                        </tr>';
                } else {
                    $content .= '<tr>
                            ' . $titleClm . '
                            <td>' . $confKeyLabel . '</td>
                            <td colspan="5"><em>No entries</em> (Page is excluded in this configuration)</td>
                        </tr>';
                }

                $c++;
            }
        } else {
            $message = !empty($skipMessage) ? ' (' . $skipMessage . ')' : '';

            // Compile row:
            $content .= '
                <tr>
                    <td>' . $pageTitle . '</td>
                    <td colspan="6"><em>No entries</em>' . $message . '</td>
                </tr>';
        }

        return $content;
    }
1680
1681
    /*****************************
1682
     *
1683
     * CLI functions
1684
     *
1685
     *****************************/
1686
1687
    /**
     * Running the functionality of the CLI (crawling URLs from queue)
     *
     * Selects up to $countInARun due queue entries, reserves them for the current process and
     * executes them one after the other.
     *
     * @param int $countInARun Maximum number of queue entries handled in this run
     * @param int $sleepTime Microseconds to sleep after each entry
     * @param int $sleepAfterFinish Seconds to sleep after the last entry
     * @return int Bit mask built from the readUrl() results and the CLI_STATUS_* constants
     */
    public function CLI_run(int $countInARun, int $sleepTime, int $sleepAfterFinish): int
    {
        $status = 0;
        $processedItems = 0;

        // Hooks come first, then the queue is cleaned up
        $this->CLI_runHooks();
        $this->queueRepository->cleanupQueue();

        $connectionPool = GeneralUtility::makeInstance(ConnectionPool::class);

        // Pick the entries which are due and neither executed nor reserved by another process.
        // TODO Shouldn't this reside within the transaction?
        $selectQuery = $connectionPool->getQueryBuilderForTable($this->tableName);
        $pendingRows = $selectQuery
            ->select('qid', 'scheduled')
            ->from($this->tableName)
            ->where(
                $selectQuery->expr()->eq('exec_time', 0),
                $selectQuery->expr()->eq('process_scheduled', 0),
                $selectQuery->expr()->lte('scheduled', $this->getCurrentTime())
            )
            ->orderBy('scheduled')
            ->addOrderBy('qid')
            ->setMaxResults($countInARun)
            ->execute()
            ->fetchAll();

        if (empty($pendingRows)) {
            $this->CLI_debug("Nothing within queue which needs to be processed (" . $this->CLI_buildProcessId() . ")");

            return $status;
        }

        $queueIds = array_column($pendingRows, 'qid');
        $processId = $this->CLI_buildProcessId();

        // Reserve the selected entries for this process. This is not wrapped in a transaction.
        // TODO make sure we're not taking assigned queue-entries
        $reserveQuery = $connectionPool->getQueryBuilderForTable($this->tableName);
        $reservedCount = $reserveQuery
            ->update($this->tableName)
            ->where(
                $reserveQuery->expr()->in('qid', $queueIds)
            )
            ->set('process_scheduled', $this->getCurrentTime())
            ->set('process_id', $processId)
            ->execute();

        // Remember the number of assigned entries to determine how many have been processed later
        $connectionPool->getConnectionForTable('tx_crawler_process')
            ->update(
                'tx_crawler_process',
                ['assigned_items_count' => (int)$reservedCount],
                ['process_id' => $processId]
            );

        if ($reservedCount != count($queueIds)) {
            $this->CLI_debug("Nothing processed due to multi-process collision (" . $this->CLI_buildProcessId() . ")");

            return ($status | self::CLI_STATUS_ABORTED);
        }

        foreach ($pendingRows as $row) {
            $status |= $this->readUrl($row['qid']);
            $processedItems++;

            // Just to relax the system
            usleep($sleepTime);

            // The CLI may have been disabled since the run started: leave right away and
            // do NOT mark the process as ended.
            if ($this->getDisabled()) {
                return ($status | self::CLI_STATUS_ABORTED);
            }

            if (!$this->processRepository->isProcessActive($this->CLI_buildProcessId())) {
                $this->CLI_debug("conflict / timeout (" . $this->CLI_buildProcessId() . ")");

                // possible timeout
                // TODO might need an additional returncode
                $status |= self::CLI_STATUS_ABORTED;
                break;
            }
        }

        sleep($sleepAfterFinish);

        $this->CLI_debug('Rows: ' . $processedItems . " (" . $this->CLI_buildProcessId() . ")");

        if ($processedItems > 0) {
            $status |= self::CLI_STATUS_PROCESSED;
        }

        return $status;
    }
1793
1794
    /**
     * Activate hooks
     *
     * Instantiates every class registered in
     * $GLOBALS['TYPO3_CONF_VARS']['EXTCONF']['crawler']['cli_hooks'] and calls its
     * crawler_init() method with this controller.
     *
     * @return void
     */
    public function CLI_runHooks(): void
    {
        $hookClassNames = $GLOBALS['TYPO3_CONF_VARS']['EXTCONF']['crawler']['cli_hooks'] ?? [];

        foreach ($hookClassNames as $hookClassName) {
            $hook = GeneralUtility::makeInstance($hookClassName);
            if (!is_object($hook)) {
                continue;
            }

            $hook->crawler_init($this);
        }
    }
1808
1809
    /**
     * Try to acquire a new process with the given id
     * also performs some auto-cleanup for orphan processes
     *
     * A process record is only inserted while the number of active, non-expired processes is
     * below the "processLimit" setting. Active processes whose ttl lies in the past are
     * collected as orphans and released at the end, regardless of the outcome.
     *
     * @param string $id identification string for the process
     * @return bool TRUE if the process record was inserted, FALSE if the process id could not be
     *              determined or the process limit is reached
     * @todo preemption might not be the most elegant way to clean up
     */
    public function CLI_checkAndAcquireNewProcess($id)
    {
        // getmypid() returns FALSE on failure
        $systemProcessId = getmypid();
        if ($systemProcessId === false || $systemProcessId < 1) {
            return false;
        }

        $processLimit = (int)$this->extensionSettings['processLimit'];
        $connectionPool = GeneralUtility::makeInstance(ConnectionPool::class);

        // The query targets the process table, so the query builder is requested for that
        // table (it was requested for the queue table before). No transaction is used here.
        $queryBuilder = $connectionPool->getQueryBuilderForTable('tx_crawler_process');
        $statement = $queryBuilder
            ->select('process_id', 'ttl')
            ->from('tx_crawler_process')
            ->where(
                $queryBuilder->expr()->eq('active', 1),
                $queryBuilder->expr()->eq('deleted', 0)
            )
            ->execute();

        $currentTime = $this->getCurrentTime();

        $processCount = 0;
        $orphanProcesses = [];
        while ($row = $statement->fetch()) {
            if ($row['ttl'] < $currentTime) {
                // expired, but still flagged as active
                $orphanProcesses[] = $row['process_id'];
            } else {
                $processCount++;
            }
        }

        // if there are less than allowed active processes then add a new one
        if ($processCount < $processLimit) {
            $this->CLI_debug("add process " . $this->CLI_buildProcessId() . " (" . ($processCount + 1) . "/" . $processLimit . ")");

            $connectionPool->getConnectionForTable('tx_crawler_process')->insert(
                'tx_crawler_process',
                [
                    'process_id' => $id,
                    'active' => 1,
                    'ttl' => $currentTime + (int)$this->extensionSettings['processMaxRunTime'],
                    'system_process_id' => $systemProcessId,
                ]
            );
            $ret = true;
        } else {
            $this->CLI_debug("Processlimit reached (" . ($processCount) . "/" . $processLimit . ")");
            $ret = false;
        }

        $this->processRepository->deleteProcessesMarkedAsDeleted();
        $this->CLI_releaseProcesses($orphanProcesses);

        return $ret;
    }
1872
1873
    /**
     * Release a process and the required resources
     *
     * Every statement uses its own query builder. Sharing one instance made the SET part of
     * one UPDATE leak into the next one whenever it was not reset in between (the
     * "system_process_id = 0" assignment ended up in the "mark as non-active" statement).
     *
     * @param mixed $releaseIds string with a single process-id or array with multiple process-ids
     * @return bool FALSE if there was nothing to release, TRUE otherwise
     */
    public function CLI_releaseProcesses($releaseIds)
    {
        if (!is_array($releaseIds)) {
            $releaseIds = [$releaseIds];
        }

        if (empty($releaseIds)) {
            return false;   //nothing to release
        }

        $connectionPool = GeneralUtility::makeInstance(ConnectionPool::class);

        // some kind of 2nd chance algo - this way you need at least 2 processes to have a real cleanup
        // this ensures that a single process can't mess up the entire process table

        // detach all queue entries from processes which are not active anymore
        $queryBuilder = $connectionPool->getQueryBuilderForTable($this->tableName);
        $queryBuilder
            ->update($this->tableName, 'q')
            ->where(
                'q.process_id IN(SELECT p.process_id FROM tx_crawler_process as p WHERE p.active = 0)'
            )
            ->set('q.process_scheduled', 0)
            ->set('q.process_id', '')
            ->execute();

        // reset the system process id of inactive processes which still own unprocessed queue entries
        // FIXME: Not entirely sure that this is equivalent to the previous version
        $queryBuilder = $connectionPool->getQueryBuilderForTable('tx_crawler_process');
        $queryBuilder
            ->update('tx_crawler_process')
            ->where(
                $queryBuilder->expr()->eq('active', 0),
                'process_id IN(SELECT q.process_id FROM tx_crawler_queue as q WHERE q.exec_time = 0)'
            )
            ->set('system_process_id', 0)
            ->execute();

        // mark all requested processes as non-active (unless they still own unprocessed queue entries)
        $queryBuilder = $connectionPool->getQueryBuilderForTable('tx_crawler_process');
        $queryBuilder
            ->update('tx_crawler_process')
            ->where(
                'NOT EXISTS (
                SELECT * FROM tx_crawler_queue
                    WHERE tx_crawler_queue.process_id = tx_crawler_process.process_id
                    AND tx_crawler_queue.exec_time = 0
                )',
                $queryBuilder->expr()->in('process_id', $queryBuilder->createNamedParameter($releaseIds, Connection::PARAM_STR_ARRAY)),
                $queryBuilder->expr()->eq('deleted', 0)
            )
            ->set('active', 0)
            ->execute();

        // hand the unprocessed queue entries of the requested processes back to the queue
        $queryBuilder = $connectionPool->getQueryBuilderForTable($this->tableName);
        $queryBuilder
            ->update($this->tableName)
            ->where(
                $queryBuilder->expr()->eq('exec_time', 0),
                $queryBuilder->expr()->in('process_id', $queryBuilder->createNamedParameter($releaseIds, Connection::PARAM_STR_ARRAY))
            )
            ->set('process_scheduled', 0)
            ->set('process_id', '')
            ->execute();

        return true;
    }
1944
1945
    /**
     * Create a unique Id for the current process
     *
     * The id is derived from the current microtime on the first call and kept in
     * $this->processID, so later calls return the same value.
     *
     * @return string  the ID
     */
    public function CLI_buildProcessId()
    {
        if ($this->processID) {
            return $this->processID;
        }

        $this->processID = GeneralUtility::shortMD5(microtime(true));

        return $this->processID;
    }
1957
1958
    /**
     * Prints a message to the stdout (only if debug-mode is enabled)
     *
     * Debug mode is controlled by the "processDebug" extension setting.
     *
     * @param string $msg the message
     */
    public function CLI_debug($msg): void
    {
        $debugEnabled = (int)$this->extensionSettings['processDebug'] !== 0;
        if (!$debugEnabled) {
            return;
        }

        echo $msg . "\n";
        flush();
    }
1970
1971
    /**
     * Cleans up entries that stayed for too long in the queue. These are:
     * - processed entries (exec_time set) older than the "cleanUpProcessedAge" setting
     * - entries whose scheduled time is older than the "cleanUpScheduledAge" setting
     * Both settings are given in days.
     *
     * @return void
     */
    public function cleanUpOldQueueEntries(): void
    {
        // 24*60*60 Seconds in 24 hours
        $secondsPerDay = 86400;

        $now = time();
        $processedBefore = $now - $this->extensionSettings['cleanUpProcessedAge'] * $secondsPerDay;
        $scheduledBefore = $now - $this->extensionSettings['cleanUpScheduledAge'] * $secondsPerDay;

        $this->flushQueue(
            '(exec_time<>0 AND exec_time<' . $processedBefore . ') OR scheduled<=' . $scheduledBefore
        );
    }
1987
1988
    /**
     * Returns a md5 hash generated from a serialized configuration array.
     *
     * The entries "paramExpanded" and "URLs" are left out of the hash.
     *
     * @param array $configuration
     *
     * @return string
     */
    protected function getConfigurationHash(array $configuration)
    {
        $ignoredKeys = ['paramExpanded' => true, 'URLs' => true];
        $hashRelevantConfiguration = array_diff_key($configuration, $ignoredKeys);

        return md5(serialize($hashRelevantConfiguration));
    }
2001
2002
    /**
     * Build a URL from a Page and the Query String. If the page has a Site configuration, it can be built by using
     * the Site instance.
     *
     * With a site, the URL is generated by the site router; host, scheme, port and user info of
     * $alternativeBaseUrl (if given) replace the generated ones. Without a site, an index.php
     * URL including a cHash is assembled from $alternativeBaseUrl or TYPO3_SITE_URL.
     *
     * @param int $pageId
     * @param string $queryString
     * @param string|null $alternativeBaseUrl
     * @param int $httpsOrHttp see tx_crawler_configuration.force_ssl (-1 forces http, 1 forces https,
     *                         any other value keeps the scheme)
     * @return UriInterface
     * @throws \TYPO3\CMS\Core\Exception\SiteNotFoundException
     * @throws \TYPO3\CMS\Core\Routing\InvalidRouteArgumentsException
     */
    protected function getUrlFromPageAndQueryParameters(int $pageId, string $queryString, ?string $alternativeBaseUrl, int $httpsOrHttp): UriInterface
    {
        $site = GeneralUtility::makeInstance(SiteMatcher::class)->matchByPageId($pageId);
        if ($site instanceof Site) {
            $queryString = ltrim($queryString, '?&');
            $queryParts = [];
            parse_str($queryString, $queryParts);
            unset($queryParts['id']);
            // workaround as long as we don't have native language support in crawler configurations
            if (isset($queryParts['L'])) {
                $queryParts['_language'] = $queryParts['L'];
                unset($queryParts['L']);
                // NOTE(review): the returned language is not used. The call is kept because the
                // lookup presumably rejects language ids unknown to the site - confirm before
                // removing it.
                $site->getLanguageById((int)$queryParts['_language']);
            }
            $url = $site->getRouter()->generateUri($pageId, $queryParts);
            if (!empty($alternativeBaseUrl)) {
                $alternativeBaseUrl = new Uri($alternativeBaseUrl);
                $url = $url->withHost($alternativeBaseUrl->getHost());
                $url = $url->withScheme($alternativeBaseUrl->getScheme());
                $url = $url->withPort($alternativeBaseUrl->getPort());
                if ($userInfo = $alternativeBaseUrl->getUserInfo()) {
                    $url = $url->withUserInfo($userInfo);
                }
            }
        } else {
            // Technically this is not possible with site handling, but kept for backwards-compatibility reasons
            // Once EXT:crawler is v10-only compatible, this should be removed completely
            $baseUrl = ($alternativeBaseUrl ?: GeneralUtility::getIndpEnv('TYPO3_SITE_URL'));
            $cacheHashCalculator = GeneralUtility::makeInstance(CacheHashCalculator::class);
            $queryString .= '&cHash=' . $cacheHashCalculator->generateForParameters($queryString);
            $url = rtrim($baseUrl, '/') . '/index.php' . $queryString;
            $url = new Uri($url);
        }

        if ($httpsOrHttp === -1) {
            $url = $url->withScheme('http');
        } elseif ($httpsOrHttp === 1) {
            $url = $url->withScheme('https');
        }

        return $url;
    }
2058
2059 1
    /**
     * Makes sure that entry 1 of the given array is not larger than entry 2.
     *
     * @param array $reg Array whose entries with the keys 1 and 2 are compared; all other entries are left untouched
     * @return array The array with the entries 1 and 2 exchanged if entry 1 was larger
     */
    protected function swapIfFirstIsLargerThanSecond(array $reg): array
    {
        if ($reg[1] > $reg[2]) {
            [$reg[1], $reg[2]] = [$reg[2], $reg[1]];
        }

        return $reg;
    }
2070
}
2071