Completed
Push — typo3v9 ( aea555...37a7d2 )
by Tomas Norre
06:20
created

CrawlerController::CLI_debug()   A

Complexity

Conditions 2
Paths 2

Size

Total Lines 7

Duplication

Lines 0
Ratio 0 %

Code Coverage

Tests 0
CRAP Score 6

Importance

Changes 0
Metric Value
cc 2
nc 2
nop 1
dl 0
loc 7
rs 10
c 0
b 0
f 0
ccs 0
cts 5
cp 0
crap 6
1
<?php
2
3
declare(strict_types=1);
4
5
namespace AOE\Crawler\Controller;
6
7
/***************************************************************
8
 *  Copyright notice
9
 *
10
 *  (c) 2019 AOE GmbH <dev@aoe.com>
11
 *
12
 *  All rights reserved
13
 *
14
 *  This script is part of the TYPO3 project. The TYPO3 project is
15
 *  free software; you can redistribute it and/or modify
16
 *  it under the terms of the GNU General Public License as published by
17
 *  the Free Software Foundation; either version 3 of the License, or
18
 *  (at your option) any later version.
19
 *
20
 *  The GNU General Public License can be found at
21
 *  http://www.gnu.org/copyleft/gpl.html.
22
 *
23
 *  This script is distributed in the hope that it will be useful,
24
 *  but WITHOUT ANY WARRANTY; without even the implied warranty of
25
 *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
26
 *  GNU General Public License for more details.
27
 *
28
 *  This copyright notice MUST APPEAR in all copies of the script!
29
 ***************************************************************/
30
31
use AOE\Crawler\Configuration\ExtensionConfigurationProvider;
32
use AOE\Crawler\Domain\Repository\ConfigurationRepository;
33
use AOE\Crawler\Domain\Repository\ProcessRepository;
34
use AOE\Crawler\Domain\Repository\QueueRepository;
35
use AOE\Crawler\Event\EventDispatcher;
36
use AOE\Crawler\QueueExecutor;
37
use AOE\Crawler\Utility\SignalSlotUtility;
38
use Psr\Http\Message\UriInterface;
39
use Psr\Log\LoggerAwareInterface;
40
use Psr\Log\LoggerAwareTrait;
41
use TYPO3\CMS\Backend\Tree\View\PageTreeView;
42
use TYPO3\CMS\Backend\Utility\BackendUtility;
43
use TYPO3\CMS\Core\Authentication\BackendUserAuthentication;
44
use TYPO3\CMS\Core\Core\Environment;
45
use TYPO3\CMS\Core\Database\Connection;
46
use TYPO3\CMS\Core\Database\ConnectionPool;
47
use TYPO3\CMS\Core\Database\Query\Restriction\DeletedRestriction;
48
use TYPO3\CMS\Core\Http\Uri;
49
use TYPO3\CMS\Core\Imaging\Icon;
50
use TYPO3\CMS\Core\Imaging\IconFactory;
51
use TYPO3\CMS\Core\Routing\SiteMatcher;
52
use TYPO3\CMS\Core\Site\Entity\Site;
53
use TYPO3\CMS\Core\Type\Bitmask\Permission;
54
use TYPO3\CMS\Core\TypoScript\Parser\TypoScriptParser;
55
use TYPO3\CMS\Core\Utility\DebugUtility;
56
use TYPO3\CMS\Core\Utility\GeneralUtility;
57
use TYPO3\CMS\Core\Utility\MathUtility;
58
use TYPO3\CMS\Extbase\Object\ObjectManager;
59
use TYPO3\CMS\Frontend\Page\CacheHashCalculator;
60
use TYPO3\CMS\Frontend\Page\PageRepository;
61
62
/**
63
 * Class CrawlerController
64
 *
65
 * @package AOE\Crawler\Controller
66
 */
67
class CrawlerController implements LoggerAwareInterface
68
{
69
    use LoggerAwareTrait;
70
71
    // Status codes reported by a CLI run. The values are 0 and distinct powers of two,
    // presumably so they can be combined as bit flags — confirm against the callers.
    public const CLI_STATUS_NOTHING_PROCCESSED = 0;
    public const CLI_STATUS_REMAIN = 1; // queue not empty
    public const CLI_STATUS_PROCESSED = 2; // (some) queue items were processed
    public const CLI_STATUS_ABORTED = 4; // instance didn't finish
    public const CLI_STATUS_POLLABLE_PROCESSED = 8;
76
77
    /**
     * @var int
     */
    public $setID = 0;

    /**
     * @var string
     */
    public $processID = '';

    /**
     * @var array
     */
    public $duplicateTrack = [];

    /**
     * @var array
     */
    public $downloadUrls = [];

    /**
     * @var array
     */
    public $incomingProcInstructions = [];

    /**
     * @var array
     */
    public $incomingConfigurationSelection = [];

    /**
     * @var bool
     */
    public $registerQueueEntriesInternallyOnly = false;

    /**
     * @var array
     */
    public $queueEntries = [];

    /**
     * Plain-text "[date] url" entries appended by urlListFromUrlArray().
     *
     * @var array
     */
    public $urlList = [];

    /**
     * Extension settings; loaded in the constructor and replaceable through setExtensionSettings().
     *
     * @var array
     */
    public $extensionSettings = [];

    /**
     * Mount Point
     *
     * false when no mount point is in use. Otherwise a string: it is appended to
     * generated URLs as "&MP=<value>" and its second "-"-separated segment is used
     * as the page ID for the page TSconfig lookup.
     *
     * @var bool|string
     */
    public $MP = false;

    /**
     * Path of the marker file whose existence means "crawler disabled"
     * (see setDisabled() / getDisabled()).
     *
     * @var string
     */
    protected $processFilename;

    /**
     * Holds the internal access mode can be 'gui','cli' or 'cli_im'
     *
     * @var string
     */
    protected $accessMode;

    /**
     * Lazily filled from $GLOBALS['BE_USER'] by getBackendUser().
     *
     * @var BackendUserAuthentication|null
     */
    private $backendUser;

    /**
     * @var int
     */
    private $scheduledTime = 0;

    /**
     * @var int
     */
    private $reqMinute = 0;

    /**
     * @var bool
     */
    private $submitCrawlUrls = false;

    /**
     * @var bool
     */
    private $downloadCrawlUrls = false;

    /**
     * @var QueueRepository
     */
    protected $queueRepository;

    /**
     * @var ProcessRepository
     */
    protected $processRepository;

    /**
     * @var ConfigurationRepository
     */
    protected $configurationRepository;

    /**
     * Name of the crawler queue table.
     *
     * @var string
     */
    protected $tableName = 'tx_crawler_queue';

    /**
     * @var QueueExecutor
     */
    protected $queueExecutor;

    /**
     * Overwritten in the constructor with the "maxCompileUrls" extension setting
     * (forced into the range 1..1000000000, default 10000).
     *
     * @var int
     */
    protected $maximumUrlsToCompile = 10000;

    /**
     * @var IconFactory
     */
    protected $iconFactory;
205
206
    /**
     * Returns the access mode, which can be 'gui', 'cli' or 'cli_im'.
     *
     * @return string|null null if the property was never assigned (it has no default value)
     */
    public function getAccessMode()
    {
        return $this->accessMode;
    }
215
216
    /**
     * Stores the access mode. The value is assigned as given, without validation.
     *
     * @param string $accessMode Expected to be one of 'gui', 'cli' or 'cli_im'
     * @return void
     */
    public function setAccessMode($accessMode): void
    {
        $this->accessMode = $accessMode;
    }
223
224
    /**
     * Switches the "crawler disabled" marker file on or off.
     *
     * Disabling writes an empty file at $this->processFilename; enabling
     * removes that file if it is present.
     *
     * @param  bool $disabled true (default) to disable processing, false to re-enable it
     * @return void
     */
    public function setDisabled($disabled = true): void
    {
        if ($disabled) {
            GeneralUtility::writeFile($this->processFilename, '');
            return;
        }

        if (is_file($this->processFilename)) {
            unlink($this->processFilename);
        }
    }
240
241
    /**
     * Reports whether processing is disabled, i.e. whether the marker file
     * at $this->processFilename currently exists.
     *
     * @return bool true if disabled
     */
    public function getDisabled()
    {
        return is_file($this->processFilename);
    }
250
251
    /**
     * Overrides the path of the marker file used by setDisabled() / getDisabled().
     *
     * @param string $filenameWithPath Full path including the file name
     *
     * @return void
     */
    public function setProcessFilename($filenameWithPath): void
    {
        $this->processFilename = $filenameWithPath;
    }
260
261
    /**
     * Returns the path of the marker file used by setDisabled() / getDisabled().
     *
     * @return string
     */
    public function getProcessFilename()
    {
        return $this->processFilename;
    }
268
269
    /************************************
270
     *
271
     * Getting URLs based on Page TSconfig
272
     *
273
     ************************************/
274
275
    /**
     * Wires up repositories and helpers and loads the extension settings.
     *
     * Settings that are missing or out of range fall back to defaults:
     * - countInARun: 100 when absent or not a positive integer
     * - processLimit: forced into 1..99, default 1
     * - maxCompileUrls: forced into 1..1000000000, default 10000
     */
    public function __construct()
    {
        $objectManager = GeneralUtility::makeInstance(ObjectManager::class);
        $this->queueRepository = $objectManager->get(QueueRepository::class);
        $this->processRepository = $objectManager->get(ProcessRepository::class);
        $this->configurationRepository = $objectManager->get(ConfigurationRepository::class);
        $this->queueExecutor = $objectManager->get(QueueExecutor::class);
        $this->iconFactory = GeneralUtility::makeInstance(IconFactory::class);

        $this->processFilename = Environment::getVarPath() . '/locks/tx_crawler.proc';

        /** @var ExtensionConfigurationProvider $configurationProvider */
        $configurationProvider = GeneralUtility::makeInstance(ExtensionConfigurationProvider::class);
        $settings = $configurationProvider->getExtensionConfiguration();
        $this->extensionSettings = is_array($settings) ? $settings : [];

        // Apply defaults. The "??" guards keep an incomplete configuration from
        // raising undefined-index notices; the resulting values are unchanged.
        if (MathUtility::convertToPositiveInteger($this->extensionSettings['countInARun'] ?? 0) === 0) {
            $this->extensionSettings['countInARun'] = 100;
        }

        $this->extensionSettings['processLimit'] = MathUtility::forceIntegerInRange(
            $this->extensionSettings['processLimit'] ?? 1,
            1,
            99,
            1
        );
        $this->maximumUrlsToCompile = MathUtility::forceIntegerInRange(
            $this->extensionSettings['maxCompileUrls'] ?? 10000,
            1,
            1000000000,
            10000
        );
    }
299
300
    /**
     * Returns the backend user, taking it from $GLOBALS['BE_USER'] on first use
     * and caching it on the instance afterwards.
     *
     * @return BackendUserAuthentication
     */
    private function getBackendUser()
    {
        if ($this->backendUser !== null) {
            return $this->backendUser;
        }

        $this->backendUser = $GLOBALS['BE_USER'];

        return $this->backendUser;
    }
310
311
    /**
     * Replaces the extension settings (unserialized pendant of $TYPO3_CONF_VARS['EXT']['extConf']['crawler']).
     *
     * The array is stored as given; the defaults applied in the constructor are not re-applied here.
     *
     * @param array $extensionSettings
     * @return void
     */
    public function setExtensionSettings(array $extensionSettings): void
    {
        $this->extensionSettings = $extensionSettings;
    }
321
322
    /**
     * Check if the given page should be crawled
     *
     * The checks run in this order and the first match wins:
     * 1. hidden page while the "crawlHiddenPages" setting is off
     * 2. doktype 3 or 4, or doktype >= 199
     * 3. doktype listed in $GLOBALS['TYPO3_CONF_VARS']['EXTCONF']['crawler']['excludeDoktype']
     * 4. veto from a hook in $GLOBALS['TYPO3_CONF_VARS']['EXTCONF']['crawler']['pageVeto']
     *
     * @param array $pageRow Page record; the "hidden" and "doktype" columns are evaluated
     * @return false|string false if the page should be crawled (not excluded), otherwise the skip message
     */
    public function checkIfPageShouldBeSkipped(array $pageRow)
    {
        // empty()/"??" keep missing settings or columns from raising undefined-index notices.
        if (empty($this->extensionSettings['crawlHiddenPages']) && !empty($pageRow['hidden'])) {
            return 'Because page is hidden';
        }

        $doktype = $pageRow['doktype'] ?? null;

        if (GeneralUtility::inList('3,4', $doktype) || $doktype >= 199) {
            return 'Because doktype is not allowed';
        }

        foreach ($GLOBALS['TYPO3_CONF_VARS']['EXTCONF']['crawler']['excludeDoktype'] ?? [] as $key => $doktypeList) {
            if (GeneralUtility::inList($doktypeList, $doktype)) {
                return 'Doktype was excluded by "' . $key . '"';
            }
        }

        // veto hook
        foreach ($GLOBALS['TYPO3_CONF_VARS']['EXTCONF']['crawler']['pageVeto'] ?? [] as $key => $func) {
            $params = [
                'pageRow' => $pageRow,
            ];
            // expects "false" if page is ok and "true" or a skipMessage if this page should _not_ be crawled
            $veto = GeneralUtility::callUserFunction($func, $params, $this);
            if ($veto === false) {
                continue;
            }

            // no need to execute other hooks if a previous one returned a veto
            if (is_string($veto)) {
                return $veto;
            }

            // Hooks registered without an explicit key have integer keys; under
            // strict_types htmlspecialchars() rejects those unless cast to string.
            return 'Veto from hook "' . htmlspecialchars((string)$key) . '"';
        }

        return false;
    }
381
382
    /**
     * Wrapper around getUrlsForPageId() that first asks
     * checkIfPageShouldBeSkipped() whether the page may be crawled.
     * It returns an array of configurations and no urls!
     *
     * @param array $pageRow Page record with at least dok-type and uid columns.
     * @param string $skipMessage Receives the skip reason, or '' when the page is not skipped
     * @return array Configurations for the page, or an empty array when the page is skipped
     * @see getUrlsForPageId()
     */
    public function getUrlsForPageRow(array $pageRow, &$skipMessage = '')
    {
        $skipReason = $this->checkIfPageShouldBeSkipped($pageRow);

        if ($skipReason !== false) {
            $skipMessage = $skipReason;

            return [];
        }

        $configurations = $this->getUrlsForPageId($pageRow['uid']);
        $skipMessage = '';

        return $configurations;
    }
405
406
    /**
     * Tells whether the queue holds NO unprocessed entry (exec_time = 0) for the
     * given page and configuration hash. When there is none, callers can skip
     * the more detailed per-URL check.
     *
     * @param  int $uid Page ID
     * @param  string $configurationHash
     * @return bool true when no matching unprocessed entry was counted
     */
    protected function noUnprocessedQueueEntriesForPageWithConfigurationHashExist($uid, $configurationHash)
    {
        $queryBuilder = GeneralUtility::makeInstance(ConnectionPool::class)->getQueryBuilderForTable($this->tableName);
        $expr = $queryBuilder->expr();

        $pendingCount = $queryBuilder
            ->count('*')
            ->from($this->tableName)
            ->where(
                $expr->eq('page_id', (int)$uid),
                $expr->eq('configuration_hash', $queryBuilder->createNamedParameter($configurationHash)),
                $expr->eq('exec_time', 0)
            )
            ->execute()
            ->fetchColumn();

        return !$pendingCount;
    }
437
438
    /**
     * Creates a list of URLs from input array (and submits them to queue if asked for)
     * See Web > Info module script + "indexed_search"'s crawler hook-client using this!
     *
     * @param array $vv Information about URLs from pageRow to crawl ('URLs' list and 'subCfg' configuration).
     * @param array $pageRow Page row; only "uid" is read here.
     * @param int $scheduledTime Unix time to schedule indexing to, typically time()
     * @param int $reqMinute Number of requests per minute (creates the interleave between requests);
     *                       values <= 0 disable the interleave
     * @param bool $submitCrawlUrls If set, submits the URLs to queue
     * @param bool $downloadCrawlUrls If set (and $submitCrawlUrls is false) will fill $downloadUrls with entries
     * @param array $duplicateTrack Passed by reference; holds one key per URL to make sure duplicates are not crawled
     * @param array $downloadUrls Passed by reference; filled with URLs for download if the flag is set
     * @param array $incomingProcInstructions Array of processing instructions
     * @return string List of URLs as HTML joined by "<br>" (meant for display in backend module)
     */
    public function urlListFromUrlArray(
        array $vv,
        array $pageRow,
        $scheduledTime,
        $reqMinute,
        $submitCrawlUrls,
        $downloadCrawlUrls,
        array &$duplicateTrack,
        array &$downloadUrls,
        array $incomingProcInstructions
    ) {
        if (!is_array($vv['URLs'] ?? null)) {
            return 'ERROR - no URL generated';
        }

        // Both values are optional in a configuration; default to '' instead of reading a missing index.
        $procInstrFilter = $vv['subCfg']['procInstrFilter'] ?? '';
        $userGroups = $vv['subCfg']['userGroups'] ?? '';

        // The filter does not depend on the individual URL: a mismatch rejects every URL at once.
        if (!$this->drawURLs_PIfilter($procInstrFilter, $incomingProcInstructions)) {
            return '';
        }

        $urlLog = [];
        $pageId = (int)$pageRow['uid'];
        $configurationHash = $this->getConfigurationHash($vv);
        $skipInnerCheck = $this->noUnprocessedQueueEntriesForPageWithConfigurationHashExist($pageId, $configurationHash);

        // Seconds between two consecutive requests. Guarded so that a rate of 0 cannot divide by zero.
        $secondsPerRequest = $reqMinute > 0 ? 60 / $reqMinute : 0;

        foreach ($vv['URLs'] as $urlQuery) {
            $url = (string)$this->getUrlFromPageAndQueryParameters(
                $pageId,
                $urlQuery,
                $vv['subCfg']['baseUrl'] ?? null,
                $vv['subCfg']['force_ssl'] ?? 0
            );

            // Create key by which to determine unique-ness:
            $uKey = $url . '|' . $userGroups . '|' . $procInstrFilter;

            if (isset($duplicateTrack[$uKey])) {
                // if the url key is registered just display it and do not resubmit it
                $urlLog[] = '<em><span class="text-muted">' . htmlspecialchars($url) . '</span></em>';
            } else {
                // Scheduled time, rounded down to the full minute and cast to an integer timestamp:
                $schTime = $scheduledTime + round(count($duplicateTrack) * $secondsPerRequest);
                $schTime = (int)(floor($schTime / 60) * 60);
                $formattedDate = BackendUtility::datetime($schTime);
                $this->urlList[] = '[' . $formattedDate . '] ' . $url;
                $urlList = '[' . $formattedDate . '] ' . htmlspecialchars($url);

                // Submit for crawling!
                if ($submitCrawlUrls) {
                    // NOTE(review): the queue entry is created with $scheduledTime, not the interleaved
                    // $schTime shown in the list — kept as is; confirm whether that is intended.
                    $added = $this->addUrl(
                        $pageId,
                        $url,
                        $vv['subCfg'],
                        $scheduledTime,
                        $configurationHash,
                        $skipInnerCheck
                    );
                    if ($added === false) {
                        $urlList .= ' (URL already existed)';
                    }
                } elseif ($downloadCrawlUrls) {
                    $downloadUrls[$url] = $url;
                }
                $urlLog[] = $urlList;
            }
            $duplicateTrack[$uKey] = true;
        }

        return implode('<br>', $urlLog);
    }
521
522
    /**
     * Decides whether a configuration passes the processing-instruction filter.
     *
     * Without incoming processing instructions everything passes. Otherwise at
     * least one incoming instruction must be contained in the comma-separated
     * list $piString.
     *
     * @param string $piString Comma-separated processing instructions to test against
     * @param array $incomingProcInstructions Processing instructions
     * @return bool
     */
    public function drawURLs_PIfilter($piString, array $incomingProcInstructions)
    {
        $accepted = $incomingProcInstructions === [];

        foreach ($incomingProcInstructions as $instruction) {
            if (GeneralUtility::inList($piString, $instruction)) {
                $accepted = true;
                break;
            }
        }

        return $accepted;
    }
542
543
    /**
     * Returns the page TSconfig that the crawler configuration is read from.
     *
     * Without a mount point ($this->MP is falsy) the TSconfig of $id is used.
     * With a mount point, the second "-"-separated segment of $this->MP is used
     * as the page ID instead. Hooks registered in
     * $GLOBALS['TYPO3_CONF_VARS']['EXTCONF']['crawler']['getPageTSconfigForId']
     * receive the TSconfig by reference and may alter it.
     *
     * @param int $id Page ID
     * @return array
     */
    public function getPageTSconfigForId($id)
    {
        if (!$this->MP) {
            $pageTSconfig = BackendUtility::getPagesTSconfig($id);
        } else {
            // A value without "-" has no second segment; fall back to 0 rather than
            // reading an undefined offset.
            $mountPointParts = explode('-', (string)$this->MP);
            $mountPointId = (int)($mountPointParts[1] ?? 0);
            $pageTSconfig = BackendUtility::getPagesTSconfig($mountPointId);
        }

        // Call a hook to alter configuration
        $hooks = $GLOBALS['TYPO3_CONF_VARS']['EXTCONF']['crawler']['getPageTSconfigForId'] ?? null;
        if (is_array($hooks)) {
            $params = [
                'pageId' => $id,
                'pageTSConfig' => &$pageTSconfig,
            ];
            foreach ($hooks as $userFunc) {
                GeneralUtility::callUserFunction($userFunc, $params, $this);
            }
        }

        return $pageTSconfig;
    }
564
565
    /**
     * This methods returns an array of configurations.
     * And no urls!
     *
     * Configurations come from two sources, in this order:
     * 1. page TSconfig (tx_crawler.crawlerCfg.paramSets.*), origin "pagets"
     * 2. tx_crawler_configuration records found up the rootline, origin
     *    "tx_crawler_configuration_<uid>"; these never overwrite a set of the same name from 1.
     * Finally the "processUrls" hooks may modify the result by reference.
     *
     * @param int $pageId Page ID
     * @return array Configurations keyed by name; each has subCfg, paramParsed, paramExpanded, URLs and origin
     */
    public function getUrlsForPageId($pageId)
    {
        // Get page TSconfig for page ID
        $pageTSconfig = $this->getPageTSconfigForId($pageId);

        $res = [];

        // Fetch Crawler Configuration from pageTSconfig
        $crawlerCfg = $pageTSconfig['tx_crawler.']['crawlerCfg.']['paramSets.'] ?? [];
        foreach ($crawlerCfg as $key => $values) {
            if (!is_array($values)) {
                continue;
            }
            $key = str_replace('.', '', $key);
            // Sub configuration for a single configuration string:
            $subCfg = (array)($crawlerCfg[$key . '.'] ?? []);
            $subCfg['key'] = $key;

            // Both options are optional; normalise to string first because strcmp()
            // on a missing/null value throws a TypeError under strict_types.
            $procInstrFilter = (string)($subCfg['procInstrFilter'] ?? '');
            if ($procInstrFilter !== '') {
                $subCfg['procInstrFilter'] = implode(',', GeneralUtility::trimExplode(',', $procInstrFilter));
            }
            $pidsOnly = (string)($subCfg['pidsOnly'] ?? '');
            $pidOnlyList = implode(',', GeneralUtility::trimExplode(',', $pidsOnly, true));

            // process configuration if it is not page-specific or if the specific page is the current page:
            if ($pidsOnly !== '' && !GeneralUtility::inList($pidOnlyList, $pageId)) {
                continue;
            }

            // Explode, process etc.:
            $paramParsed = GeneralUtility::explodeUrl2Array($crawlerCfg[$key] ?? '');
            $paramExpanded = $this->expandParameters($paramParsed, $pageId);

            // recognize MP value
            $baseQuery = '?id=' . $pageId;
            if ($this->MP) {
                $baseQuery .= '&MP=' . $this->MP;
            }

            $res[$key] = [
                'subCfg' => $subCfg,
                'paramParsed' => $paramParsed,
                'paramExpanded' => $paramExpanded,
                'origin' => 'pagets',
                'URLs' => $this->compileUrls($paramExpanded, [$baseQuery]),
            ];
        }

        // Get configuration from tx_crawler_configuration records up the rootline
        $crawlerConfigurations = $this->configurationRepository->getCrawlerConfigurationRecordsFromRootLine($pageId);
        foreach ($crawlerConfigurations as $configurationRecord) {
            // check access to the configuration record
            $hasAccess = empty($configurationRecord['begroups'])
                || $this->getBackendUser()->isAdmin()
                || $this->hasGroupAccess(
                    $this->getBackendUser()->user['usergroup_cached_list'],
                    $configurationRecord['begroups']
                );
            if (!$hasAccess) {
                continue;
            }

            // process configuration if it is not page-specific or if the specific page is the current page:
            $pidsOnly = (string)($configurationRecord['pidsonly'] ?? '');
            $pidOnlyList = implode(',', GeneralUtility::trimExplode(',', $pidsOnly, true));
            if ($pidsOnly !== '' && !GeneralUtility::inList($pidOnlyList, $pageId)) {
                continue;
            }

            $key = $configurationRecord['name'];

            // don't overwrite previously defined paramSets
            if (isset($res[$key])) {
                continue;
            }

            /** @var TypoScriptParser $TSparserObject */
            $TSparserObject = GeneralUtility::makeInstance(TypoScriptParser::class);
            $TSparserObject->parse($configurationRecord['processing_instruction_parameters_ts']);

            $subCfg = [
                'procInstrFilter' => $configurationRecord['processing_instruction_filter'],
                'procInstrParams.' => $TSparserObject->setup,
                'baseUrl' => $configurationRecord['base_url'],
                'force_ssl' => (int)$configurationRecord['force_ssl'],
                'userGroups' => $configurationRecord['fegroups'],
                'exclude' => $configurationRecord['exclude'],
                'key' => $key,
            ];

            // Loose comparison is kept on purpose: $pageId is untyped and may be a numeric string.
            if (in_array($pageId, $this->expandExcludeString($subCfg['exclude']))) {
                continue;
            }

            $paramParsed = GeneralUtility::explodeUrl2Array($configurationRecord['configuration']);
            $paramExpanded = $this->expandParameters($paramParsed, $pageId);

            $res[$key] = [
                'subCfg' => $subCfg,
                'paramParsed' => $paramParsed,
                'paramExpanded' => $paramExpanded,
                'URLs' => $this->compileUrls($paramExpanded, ['?id=' . $pageId]),
                'origin' => 'tx_crawler_configuration_' . $configurationRecord['uid'],
            ];
        }

        foreach ($GLOBALS['TYPO3_CONF_VARS']['EXTCONF']['crawler']['processUrls'] ?? [] as $func) {
            $params = [
                'res' => &$res,
            ];
            GeneralUtility::callUserFunction($func, $params, $this);
        }

        return $res;
    }
664
665
    /**
     * Find all configurations of subpages of a page
     *
     * Collects the names of the page TSconfig paramSets of $rootid plus the names
     * of all tx_crawler_configuration records stored on a page of the rootline of
     * $rootid or of its page tree down to $depth levels. Names are not de-duplicated.
     *
     * @param int $rootid Page ID the branch starts at
     * @param int $depth Depth passed to PageTreeView::getTree()
     * @return array List of configuration names
     *
     * TODO: Write Functional Tests
     */
    public function getConfigurationsForBranch(int $rootid, $depth)
    {
        $configurationsForBranch = [];
        $pageTSconfig = $this->getPageTSconfigForId($rootid);
        $sets = $pageTSconfig['tx_crawler.']['crawlerCfg.']['paramSets.'] ?? [];
        foreach ($sets as $key => $value) {
            if (!is_array($value)) {
                continue;
            }
            // Array keys can be integers; substr() needs a string under strict_types.
            $setName = (string)$key;
            $configurationsForBranch[] = substr($setName, -1) === '.' ? substr($setName, 0, -1) : $setName;
        }

        $pids = [];
        $rootLine = BackendUtility::BEgetRootLine($rootid);
        foreach ($rootLine as $node) {
            $pids[] = $node['uid'];
        }

        /* @var PageTreeView $tree */
        $tree = GeneralUtility::makeInstance(PageTreeView::class);
        $perms_clause = $this->getBackendUser()->getPagePermsClause(Permission::PAGE_SHOW);
        $tree->init(empty($perms_clause) ? '' : ('AND ' . $perms_clause));
        $tree->getTree($rootid, $depth, '');
        foreach ($tree->tree as $node) {
            $pids[] = $node['row']['uid'];
        }

        // Without any page there is nothing to look up, and an empty IN () list is invalid SQL.
        if ($pids === []) {
            return $configurationsForBranch;
        }

        $queryBuilder = $this->getQueryBuilder('tx_crawler_configuration');
        $statement = $queryBuilder
            ->select('name')
            ->from('tx_crawler_configuration')
            ->where(
                $queryBuilder->expr()->in('pid', $queryBuilder->createNamedParameter($pids, Connection::PARAM_INT_ARRAY))
            )
            ->execute();

        while ($row = $statement->fetch()) {
            $configurationsForBranch[] = $row['name'];
        }

        return $configurationsForBranch;
    }
713
714
    /**
     * Creates a query builder for the given table via the ConnectionPool.
     *
     * @param string $table Table name
     * @return \TYPO3\CMS\Core\Database\Query\QueryBuilder
     */
    private function getQueryBuilder(string $table)
    {
        /** @var ConnectionPool $connectionPool */
        $connectionPool = GeneralUtility::makeInstance(ConnectionPool::class);

        return $connectionPool->getQueryBuilderForTable($table);
    }
724
725
    /**
     * Checks whether any group of a user grants access to an item
     * (e.g. get the group list of the current logged in user from $GLOBALS['TSFE']->gr_list)
     *
     * An empty access list means the item is not restricted at all.
     *
     * @see \TYPO3\CMS\Frontend\Page\PageRepository::getMultipleGroupsWhereClause()
     * @param  string $groupList    Comma-separated list of (fe_)group UIDs from a user
     * @param  string $accessList   Comma-separated list of (fe_)group UIDs of the item to access
     * @return bool                 TRUE if at least one of the users group UIDs is in the access list or the access list is empty
     */
    public function hasGroupAccess($groupList, $accessList)
    {
        $granted = empty($accessList);

        if (!$granted) {
            foreach (GeneralUtility::intExplode(',', $groupList) as $userGroupUid) {
                if (GeneralUtility::inList($accessList, $userGroupUid)) {
                    $granted = true;
                    break;
                }
            }
        }

        return $granted;
    }
746
747
    /**
748
     * Will expand the parameters configuration to individual values. This follows a certain syntax of the value of each parameter.
749
     * Syntax of values:
750
     * - Basically: If the value is wrapped in [...] it will be expanded according to the following syntax, otherwise the value is taken literally
751
     * - Configuration is splitted by "|" and the parts are processed individually and finally added together
752
     * - For each configuration part:
753
     *         - "[int]-[int]" = Integer range, will be expanded to all values in between, values included, starting from low to high (max. 1000). Example "1-34" or "-40--30"
754
     *         - "_TABLE:[TCA table name];[_PID:[optional page id, default is current page]];[_ENABLELANG:1]" = Look up of table records from PID, filtering out deleted records. Example "_TABLE:tt_content; _PID:123"
755
     *        _ENABLELANG:1 picks only original records without their language overlays
756
     *         - Default: Literal value
757
     *
758
     * @param array $paramArray Array with key (GET var name) and values (value of GET var which is configuration for expansion)
759
     * @param integer $pid Current page ID
760
     * @return array
761
     *
762
     * TODO: Write Functional Tests
763
     */
764
    public function expandParameters($paramArray, $pid)
    {
        // Optional third-party processors for custom place holders; the registration may be absent.
        $expandHooks = $GLOBALS['TYPO3_CONF_VARS']['SC_OPTIONS']['crawler/class.tx_crawler_lib.php']['expandParameters'] ?? null;

        // Traverse parameter names:
        foreach ($paramArray as $p => $v) {
            $v = trim((string)$v);

            // A value that is not encapsulated in square brackets is taken literally:
            if (substr($v, 0, 1) !== '[' || substr($v, -1) !== ']') {
                $paramArray[$p] = [$v];
                continue;
            }

            // Otherwise the content between the brackets describes value sets;
            // reset the paramArray value to an array that collects them.
            $paramArray[$p] = [];

            // Explode parts and traverse them:
            foreach (explode('|', (string)substr($v, 1, -1)) as $pV) {
                // Look for integer range: (fx. 1-34 or -40--30 // reads minus 40 to minus 30)
                if (preg_match('/^(-?[0-9]+)\s*-\s*(-?[0-9]+)$/', trim($pV), $reg)) {
                    // Work on real integers so every generated value has the same type.
                    $low = (int)$reg[1];
                    $high = (int)$reg[2];

                    // Swap if first is larger than last:
                    if ($low > $high) {
                        [$low, $high] = [$high, $low];
                    }

                    // Traverse range, add values:
                    $runAwayBrake = 1000; // Limit to size of range!
                    for ($a = $low; $a <= $high; $a++) {
                        $paramArray[$p][] = $a;
                        $runAwayBrake--;
                        if ($runAwayBrake <= 0) {
                            break;
                        }
                    }
                } elseif (substr(trim($pV), 0, 7) == '_TABLE:') {
                    // Parse parameters ("_KEY:value" pairs separated by ";").
                    // The limit of 2 keeps values that contain ":" themselves intact;
                    // a key without value is stored as null so isset() treats it as "not given".
                    $subpartParams = [];
                    foreach (GeneralUtility::trimExplode(';', $pV) as $spV) {
                        $keyAndValue = GeneralUtility::trimExplode(':', $spV, false, 2);
                        $subpartParams[$keyAndValue[0]] = $keyAndValue[1] ?? null;
                    }

                    // Table exists:
                    if (isset($GLOBALS['TCA'][$subpartParams['_TABLE']])) {
                        $lookUpPid = isset($subpartParams['_PID']) ? (int)$subpartParams['_PID'] : (int)$pid;
                        $recursiveDepth = isset($subpartParams['_RECURSIVE']) ? (int)$subpartParams['_RECURSIVE'] : 0;
                        $pidField = isset($subpartParams['_PIDFIELD']) ? trim($subpartParams['_PIDFIELD']) : 'pid';
                        $where = $subpartParams['_WHERE'] ?? '';
                        $addTable = $subpartParams['_ADDTABLE'] ?? '';

                        $fieldName = !empty($subpartParams['_FIELD']) ? $subpartParams['_FIELD'] : 'uid';
                        if ($fieldName === 'uid' || !empty($GLOBALS['TCA'][$subpartParams['_TABLE']]['columns'][$fieldName])) {
                            $queryBuilder = $this->getQueryBuilder($subpartParams['_TABLE']);

                            if ($recursiveDepth > 0) {
                                /** @var \TYPO3\CMS\Core\Database\QueryGenerator $queryGenerator */
                                $queryGenerator = GeneralUtility::makeInstance(\TYPO3\CMS\Core\Database\QueryGenerator::class);
                                $pidList = $queryGenerator->getTreeList($lookUpPid, $recursiveDepth, 0, 1);
                                $pidArray = GeneralUtility::intExplode(',', $pidList);
                            } else {
                                $pidArray = [(string)$lookUpPid];
                            }

                            // Only the "deleted" restriction is applied to the look up.
                            $queryBuilder->getRestrictions()
                                ->removeAll()
                                ->add(GeneralUtility::makeInstance(DeletedRestriction::class));

                            $queryBuilder
                                ->select($fieldName)
                                ->from($subpartParams['_TABLE'])
                                ->where(
                                    $queryBuilder->expr()->in($pidField, $queryBuilder->createNamedParameter($pidArray, Connection::PARAM_INT_ARRAY)),
                                    $where
                                );
                            if (!empty($addTable)) {
                                // TODO: Check if this works as intended!
                                $queryBuilder->add('from', $addTable);
                            }
                            $transOrigPointerField = $GLOBALS['TCA'][$subpartParams['_TABLE']]['ctrl']['transOrigPointerField'] ?? '';

                            // _ENABLELANG: keep only records that do not point to a translation original.
                            if (!empty($subpartParams['_ENABLELANG']) && $transOrigPointerField) {
                                $queryBuilder->andWhere(
                                    $queryBuilder->expr()->lte(
                                        $transOrigPointerField,
                                        0
                                    )
                                );
                            }

                            $statement = $queryBuilder->execute();

                            // Collect the field values as array keys, which de-duplicates them.
                            $rows = [];
                            while ($row = $statement->fetch()) {
                                $rows[$row[$fieldName]] = $row;
                            }

                            $paramArray[$p] = array_merge($paramArray[$p], array_keys($rows));
                        }
                    }
                } else { // Just add value:
                    $paramArray[$p][] = $pV;
                }

                // Hook for processing own expandParameters place holder
                if (is_array($expandHooks)) {
                    $_params = [
                        'pObj' => &$this,
                        'paramArray' => &$paramArray,
                        'currentKey' => $p,
                        'currentValue' => $pV,
                        'pid' => $pid,
                    ];
                    foreach ($expandHooks as $_funcRef) {
                        GeneralUtility::callUserFunction($_funcRef, $_params, $this);
                    }
                }
            }

            // Make unique set of values and sort array by key:
            $paramArray[$p] = array_unique($paramArray[$p]);
            ksort($paramArray);
        }

        return $paramArray;
    }
897
898
    /**
     * Compiling URLs from parameter array (output of expandParameters())
     * The number of URLs will be the multiplication of the number of parameter values for each key,
     * capped at $this->maximumUrlsToCompile per recursion level.
     *
     * Every value of the first parameter is appended as "&name=value" (raw URL encoded) to every
     * URL in $urls; empty values leave the URL unchanged. The remaining parameters are handled
     * by recursion.
     *
     * @param array $paramArray Output of expandParameters(): Array with keys (GET var names) and for each an array of values
     * @param array $urls URLs accumulated in this array (for recursion)
     * @return array
     */
    public function compileUrls($paramArray, array $urls)
    {
        if (empty($paramArray)) {
            return $urls;
        }

        // Take the first parameter off the stack. unset() is used instead of array_shift()
        // because array_shift() renumbers numeric keys, which would alter the names of the
        // remaining parameters.
        reset($paramArray);
        $varName = key($paramArray);
        $valueSet = (array)$paramArray[$varName];
        unset($paramArray[$varName]);

        // Array keys may be integers; rawurlencode() requires a string under strict_types.
        $encodedName = rawurlencode((string)$varName);

        // Traverse value set:
        $newUrls = [];
        foreach ($urls as $url) {
            foreach ($valueSet as $val) {
                // Leave both loops once the limit is reached, otherwise every further URL
                // of the outer loop would still add one more entry.
                if (count($newUrls) >= $this->maximumUrlsToCompile) {
                    break 2;
                }

                $value = (string)$val;
                $newUrls[] = $value !== ''
                    ? $url . '&' . $encodedName . '=' . rawurlencode($value)
                    : $url;
            }
        }

        return $this->compileUrls($paramArray, $newUrls);
    }
929
930
    /************************************
931
     *
932
     * Crawler log
933
     *
934
     ************************************/
935
936
    /**
     * Return array of records from crawler queue for input page ID
     *
     * @param int $id Page ID for which to look up log entries.
     * @param string $filter Filter: "all" => all entries, "pending" => all that is not yet run, "finished" => all complete ones
     * @param bool $doFlush If TRUE, then the selected entries are DELETED(!) instead of returned
     * @param bool $doFullFlush If TRUE (together with $doFlush), the whole queue is flushed, not only the entries of the page
     * @param int $itemsPerPage Limit the amount of entries per page default is 10
     * @return array Queue records, or an empty array when entries were flushed
     */
    public function getLogEntriesForPageId($id, $filter = '', $doFlush = false, $doFullFlush = false, $itemsPerPage = 10)
    {
        $queryBuilder = GeneralUtility::makeInstance(ConnectionPool::class)->getQueryBuilderForTable($this->tableName);
        $queryBuilder
            ->select('*')
            ->from($this->tableName)
            ->where(
                $queryBuilder->expr()->eq('page_id', $queryBuilder->createNamedParameter($id, \PDO::PARAM_INT))
            )
            ->orderBy('scheduled', 'DESC');

        // The same conditions are collected a second time in an AND-composite, which is
        // handed to flushQueue() as WHERE clause when entries have to be deleted.
        $expressionBuilder = GeneralUtility::makeInstance(ConnectionPool::class)
            ->getConnectionForTable($this->tableName)
            ->getExpressionBuilder();
        $query = $expressionBuilder->andX();

        switch ($filter) {
            case 'pending':
                $queryBuilder->andWhere($queryBuilder->expr()->eq('exec_time', 0));
                $query->add($expressionBuilder->eq('exec_time', 0));
                break;
            case 'finished':
                $queryBuilder->andWhere($queryBuilder->expr()->gt('exec_time', 0));
                $query->add($expressionBuilder->gt('exec_time', 0));
                break;
        }

        // FIXME: Write unit test that ensures that the right records are deleted.
        if ($doFlush) {
            $query->add($expressionBuilder->eq('page_id', (int)$id));
            $this->flushQueue($doFullFlush ? '1=1' : $query);
            return [];
        }

        if ($itemsPerPage > 0) {
            $queryBuilder->setMaxResults((int)$itemsPerPage);
        }

        return $queryBuilder->execute()->fetchAll();
    }
990
991
    /**
     * Return array of records from crawler queue for input set ID
     *
     * @param int $set_id Set ID for which to look up log entries.
     * @param string $filter Filter: "all" => all entries, "pending" => all that is not yet run, "finished" => all complete ones
     * @param bool $doFlush If TRUE, then the selected entries are DELETED(!) instead of returned
     * @param bool $doFullFlush If TRUE (together with $doFlush), the whole queue is flushed, not only the entries of the set
     * @param int $itemsPerPage Limit the amount of entries per page default is 10
     * @return array Queue records, or an empty array when entries were flushed
     */
    public function getLogEntriesForSetId($set_id, $filter = '', $doFlush = false, $doFullFlush = false, $itemsPerPage = 10)
    {
        $queryBuilder = GeneralUtility::makeInstance(ConnectionPool::class)->getQueryBuilderForTable($this->tableName);
        $queryBuilder
            ->select('*')
            ->from($this->tableName)
            ->where(
                $queryBuilder->expr()->eq('set_id', $queryBuilder->createNamedParameter($set_id, \PDO::PARAM_INT))
            )
            ->orderBy('scheduled', 'DESC');

        // The same conditions are collected a second time in an AND-composite, which is
        // handed to flushQueue() as WHERE clause when entries have to be deleted.
        $expressionBuilder = GeneralUtility::makeInstance(ConnectionPool::class)
            ->getConnectionForTable($this->tableName)
            ->getExpressionBuilder();
        $query = $expressionBuilder->andX();

        // FIXME: Write Unit tests for Filters
        switch ($filter) {
            case 'pending':
                $queryBuilder->andWhere($queryBuilder->expr()->eq('exec_time', 0));
                $query->add($expressionBuilder->eq('exec_time', 0));
                break;
            case 'finished':
                $queryBuilder->andWhere($queryBuilder->expr()->gt('exec_time', 0));
                $query->add($expressionBuilder->gt('exec_time', 0));
                break;
        }

        // FIXME: Write unit test that ensures that the right records are deleted.
        if ($doFlush) {
            $query->add($expressionBuilder->eq('set_id', (int)$set_id));
            // An empty WHERE makes flushQueue() fall back to "1=1", i.e. everything is removed.
            $this->flushQueue($doFullFlush ? '' : $query);
            return [];
        }

        if ($itemsPerPage > 0) {
            $queryBuilder->setMaxResults((int)$itemsPerPage);
        }

        return $queryBuilder->execute()->fetchAll();
    }
1044
1045
    /**
     * Removes queue entries
     *
     * If an observer is registered for "queueEntryFlush", the affected entries are posted
     * to the event dispatcher grouped by set_id before they are deleted.
     *
     * @param string $where SQL related filter for the entries which should be removed;
     *                      an empty value removes all entries
     * @return void
     */
    protected function flushQueue($where = ''): void
    {
        $realWhere = strlen((string) $where) > 0 ? $where : '1=1';

        if (EventDispatcher::getInstance()->hasObserver('queueEntryFlush')) {
            // Each statement gets its own query builder: from() appends to the parts of
            // previous statements when a builder instance is reused.
            // selectLiteral() is needed because select() would quote "DISTINCT set_id"
            // as one single identifier.
            $groups = $this->getQueryBuilder($this->tableName)
                ->selectLiteral('DISTINCT set_id')
                ->from($this->tableName)
                ->where($realWhere)
                ->execute()
                ->fetchAll();

            foreach ($groups as $group) {
                $subSetQueryBuilder = $this->getQueryBuilder($this->tableName);
                // NOTE(review): all other queries on this table address entries by "qid";
                // confirm that a "uid" column really exists.
                $subSet = $subSetQueryBuilder
                    ->select('uid', 'set_id')
                    ->from($this->tableName)
                    ->where(
                        $realWhere,
                        $subSetQueryBuilder->expr()->eq(
                            'set_id',
                            $subSetQueryBuilder->createNamedParameter($group['set_id'], \PDO::PARAM_INT)
                        )
                    )
                    ->execute()
                    ->fetchAll();
                EventDispatcher::getInstance()->post('queueEntryFlush', $group['set_id'], $subSet);
            }
        }

        $this->getQueryBuilder($this->tableName)
            ->delete($this->tableName)
            ->where($realWhere)
            ->execute();
    }
1085
1086
    /**
     * Stores a call back entry in the crawler queue
     * (typically called from hooks, see indexed search class "class.crawler.php")
     *
     * The call back reference is stored inside the serialized parameters under the key
     * "_CALLBACKOBJ". When no schedule time is given, the current time is used.
     *
     * @param int $setId Set ID
     * @param array $params Parameters to pass to call back function; any non-array value is replaced by an empty array
     * @param string $callBack Call back object reference, eg. 'EXT:indexed_search/class.crawler.php:&tx_indexedsearch_crawler'
     * @param int $page_id Page ID to attach it to
     * @param int $schedule Time at which to activate
     * @return void
     */
    public function addQueueEntry_callBack($setId, $params, $callBack, $page_id = 0, $schedule = 0): void
    {
        $parameters = is_array($params) ? $params : [];
        $parameters['_CALLBACKOBJ'] = $callBack;

        // A schedule time of 0 means "now".
        $scheduledTime = (int)$schedule;
        if (!$scheduledTime) {
            $scheduledTime = $this->getCurrentTime();
        }

        $queueRecord = [
            'page_id' => (int)$page_id,
            'parameters' => serialize($parameters),
            'scheduled' => $scheduledTime,
            'exec_time' => 0,
            'set_id' => (int)$setId,
            'result_data' => '',
        ];

        $connection = GeneralUtility::makeInstance(ConnectionPool::class)->getConnectionForTable('tx_crawler_queue');
        $connection->insert('tx_crawler_queue', $queueRecord);
    }
1116
1117
    /************************************
1118
     *
1119
     * URL setting
1120
     *
1121
     ************************************/
1122
1123
    /**
     * Setting a URL for crawling:
     *
     * Builds the queue record for the URL and either registers it internally only
     * ($this->registerQueueEntriesInternallyOnly) or inserts it into the queue table,
     * unless an equal, still unprocessed entry exists already.
     *
     * @param int $id Page ID
     * @param string $url Complete URL
     * @param array $subCfg Sub configuration array (from TS config); the keys "userGroups",
     *                      "procInstrFilter", "procInstrParams." and "key" are read and may be missing
     * @param int $tstamp Scheduled-time
     * @param string $configurationHash (optional) configuration hash
     * @param bool $skipInnerDuplicationCheck (optional) skip inner duplication check
     * @return bool TRUE if the URL was inserted into the queue table
     */
    public function addUrl(
        $id,
        $url,
        array $subCfg,
        $tstamp,
        $configurationHash = '',
        $skipInnerDuplicationCheck = false
    ) {
        $urlAdded = false;
        $rows = [];

        // Creating parameters:
        $parameters = [
            'url' => $url,
        ];

        // fe user group simulation:
        $uGs = implode(',', array_unique(GeneralUtility::intExplode(',', (string)($subCfg['userGroups'] ?? ''), true)));
        if ($uGs) {
            $parameters['feUserGroupList'] = $uGs;
        }

        // Setting processing instructions
        $parameters['procInstructions'] = GeneralUtility::trimExplode(',', (string)($subCfg['procInstrFilter'] ?? ''));
        if (is_array($subCfg['procInstrParams.'] ?? null)) {
            $parameters['procInstrParams'] = $subCfg['procInstrParams.'];
        }

        // Compile value array:
        $parameters_serialized = serialize($parameters);
        $fieldArray = [
            'page_id' => (int)$id,
            'parameters' => $parameters_serialized,
            'parameters_hash' => GeneralUtility::shortMD5($parameters_serialized),
            'configuration_hash' => $configurationHash,
            'scheduled' => $tstamp,
            'exec_time' => 0,
            'set_id' => (int)$this->setID,
            'result_data' => '',
            'configuration' => $subCfg['key'] ?? '',
        ];

        if ($this->registerQueueEntriesInternallyOnly) {
            //the entries will only be registered and not stored to the database
            $this->queueEntries[] = $fieldArray;

            return $urlAdded;
        }

        if (!$skipInnerDuplicationCheck) {
            // check if there is already an equal entry
            $rows = $this->getDuplicateRowsIfExist($tstamp, $fieldArray);
        }

        if (empty($rows)) {
            $connectionForCrawlerQueue = GeneralUtility::makeInstance(ConnectionPool::class)->getConnectionForTable('tx_crawler_queue');
            $connectionForCrawlerQueue->insert(
                'tx_crawler_queue',
                $fieldArray
            );
            $uid = $connectionForCrawlerQueue->lastInsertId('tx_crawler_queue', 'qid');
            $rows[] = $uid;
            $urlAdded = true;
            EventDispatcher::getInstance()->post('urlAddedToQueue', $this->setID, ['uid' => $uid, 'fieldArray' => $fieldArray]);
        } else {
            EventDispatcher::getInstance()->post('duplicateUrlInQueue', $this->setID, ['rows' => $rows, 'fieldArray' => $fieldArray]);
        }

        return $urlAdded;
    }
1202
1203
    /**
     * Looks up unprocessed queue entries that duplicate the given one, i.e. entries with the
     * same page id and parameters hash that have neither an exec_time nor a process_id.
     *
     * - Timestamp now or in the past: any entry scheduled up to now counts as duplicate
     *   (with "enableTimeslot" additionally entries scheduled within +/- 100 seconds of now).
     * - Timestamp in the future: only entries with exactly the same schedule time count.
     *
     * @param int $tstamp
     * @param array $fieldArray Queue record; "page_id" and "parameters_hash" are read
     *
     * @return array List of "qid" values of the duplicates
     */
    protected function getDuplicateRowsIfExist($tstamp, $fieldArray)
    {
        $now = $this->getCurrentTime();

        $queryBuilder = GeneralUtility::makeInstance(ConnectionPool::class)->getQueryBuilderForTable($this->tableName);
        $queryBuilder->select('qid');
        $queryBuilder->from('tx_crawler_queue');

        if ($tstamp <= $now) {
            // The entry is scheduled with "now"
            $scheduledUpToNow = $queryBuilder->expr()->lte('scheduled', $now);
            if ($this->extensionSettings['enableTimeslot']) {
                $slotStart = $now - 100;
                $slotEnd = $now + 100;
                $queryBuilder->where('scheduled BETWEEN ' . $slotStart . ' AND ' . $slotEnd . '');
                $queryBuilder->orWhere($scheduledUpToNow);
            } else {
                $queryBuilder->where($scheduledUpToNow);
            }
        } elseif ($tstamp > $now) {
            // Entries with a timestamp in the future need to have the same schedule time
            $queryBuilder->where($queryBuilder->expr()->eq('scheduled', $tstamp));
        }

        $samePage = $queryBuilder->expr()->eq(
            'page_id',
            $queryBuilder->createNamedParameter($fieldArray['page_id'], \PDO::PARAM_INT)
        );
        $sameParameters = $queryBuilder->expr()->eq(
            'parameters_hash',
            $queryBuilder->createNamedParameter($fieldArray['parameters_hash'], \PDO::PARAM_STR)
        );

        $queryBuilder->andWhere('NOT exec_time');
        $queryBuilder->andWhere('NOT process_id');
        $queryBuilder->andWhere($samePage);
        $queryBuilder->andWhere($sameParameters);

        $duplicates = [];
        foreach ($queryBuilder->execute()->fetchAll() as $queueRow) {
            $duplicates[] = $queueRow['qid'];
        }

        return $duplicates;
    }
1264
1265
    /**
     * Returns the current system time as Unix timestamp
     *
     * @return int
     */
    public function getCurrentTime()
    {
        $currentTimestamp = time();

        return $currentTimestamp;
    }
1274
1275
    /************************************
1276
     *
1277
     * URL reading
1278
     *
1279
     ************************************/
1280
1281
    /**
     * Read URL for single queue entry
     *
     * Locks the entry by setting its exec_time, executes it via the queue executor and
     * stores the serialized result in "result_data".
     *
     * @param int $queueId
     * @param bool $force If set, will process even if exec_time has been set!
     * @return int|null Status bit mask (CLI_STATUS_POLLABLE_PROCESSED is set when a registered
     *                  pollable instruction reported success); NULL if no matching queue entry was found
     */
    public function readUrl($queueId, $force = false)
    {
        $queryBuilder = GeneralUtility::makeInstance(ConnectionPool::class)->getQueryBuilderForTable($this->tableName);
        $ret = 0;
        $this->logger->debug('crawler-readurl start ' . microtime(true));
        // Get entry:
        $queryBuilder
            ->select('*')
            ->from('tx_crawler_queue')
            ->where(
                $queryBuilder->expr()->eq('qid', $queryBuilder->createNamedParameter($queueId, \PDO::PARAM_INT))
            );
        if (!$force) {
            $queryBuilder
                ->andWhere('exec_time = 0')
                ->andWhere('process_scheduled > 0');
        }
        $queueRec = $queryBuilder->execute()->fetch();

        if (!is_array($queueRec)) {
            return;
        }

        SignalSlotUtility::emitSignal(
            __CLASS__,
            SignalSlotUtility::SIGNNAL_QUEUEITEM_PREPROCESS,
            [$queueId, &$queueRec]
        );

        // Set exec_time to lock record:
        $field_array = ['exec_time' => $this->getCurrentTime()];

        if (isset($this->processID)) {
            //if multiprocessing is used we need to store the id of the process which has handled this entry
            $field_array['process_id_completed'] = $this->processID;
        }

        GeneralUtility::makeInstance(ConnectionPool::class)->getConnectionForTable('tx_crawler_queue')
            ->update(
                'tx_crawler_queue',
                $field_array,
                ['qid' => (int)$queueId]
            );

        $result = $this->queueExecutor->executeQueueItem($queueRec, $this);

        // Only unserialize when the executor really delivered string content: with strict_types
        // unserialize() throws a TypeError for anything but a string.
        // NOTE(review): the content presumably originates from the crawled request; consider
        // restricting "allowed_classes" if no objects are expected in it.
        $resultData = is_array($result) && is_string($result['content'] ?? null)
            ? unserialize($result['content'])
            : false;

        //atm there's no need to point to specific pollable extensions
        $pollables = $GLOBALS['TYPO3_CONF_VARS']['EXTCONF']['crawler']['pollSuccess'] ?? null;
        if (is_array($pollables) && is_array($resultData)) {
            $procInstructions = $resultData['parameters']['procInstructions'] ?? null;
            foreach ($pollables as $pollable) {
                // only check the success value if the instruction is running
                // it is important to name the pollSuccess key same as the procInstructions key
                if (is_array($procInstructions)
                    && in_array($pollable, $procInstructions)
                    && !empty($resultData['success'][$pollable])
                ) {
                    $ret |= self::CLI_STATUS_POLLABLE_PROCESSED;
                }
            }
        }

        // Set result in log which also denotes the end of the processing of this entry.
        $field_array = ['result_data' => serialize($result)];

        SignalSlotUtility::emitSignal(
            __CLASS__,
            SignalSlotUtility::SIGNNAL_QUEUEITEM_POSTPROCESS,
            [$queueId, &$field_array]
        );

        GeneralUtility::makeInstance(ConnectionPool::class)->getConnectionForTable('tx_crawler_queue')
            ->update(
                'tx_crawler_queue',
                $field_array,
                ['qid' => (int)$queueId]
            );

        $this->logger->debug('crawler-readurl stop ' . microtime(true));
        return $ret;
    }
1371
1372
    /**
     * Read URL for not-yet-inserted log-entry
     *
     * The entry is inserted into the queue table with exec_time set to now (which locks it),
     * executed by the queue executor, and finally updated with the serialized result.
     *
     * @param array $field_array Queue field array,
     *
     * @return string Result of the queue executor, returned unchanged
     *                (NOTE(review): documented as string; confirm against QueueExecutor)
     */
    public function readUrlFromArray($field_array)
    {
        $queueConnection = GeneralUtility::makeInstance(ConnectionPool::class)->getConnectionForTable($this->tableName);

        // Set exec_time to lock record:
        $field_array['exec_time'] = $this->getCurrentTime();
        $queueConnection->insert($this->tableName, $field_array);

        $queueId = $queueConnection->lastInsertId($this->tableName, 'qid');
        $field_array['qid'] = $queueId;

        $result = $this->queueExecutor->executeQueueItem($field_array, $this);

        // Set result in log which also denotes the end of the processing of this entry.
        // From here on $field_array only carries the fields to update.
        $field_array = ['result_data' => serialize($result)];

        $signalArguments = [$queueId, &$field_array];
        SignalSlotUtility::emitSignal(
            __CLASS__,
            SignalSlotUtility::SIGNNAL_QUEUEITEM_POSTPROCESS,
            $signalArguments
        );

        $queueConnection->update($this->tableName, $field_array, ['qid' => $queueId]);

        return $result;
    }
1409
1410
    /*****************************
1411
     *
1412
     * Compiling URLs to crawl - tools
1413
     *
1414
     *****************************/
1415
1416
    /**
     * Builds the page tree below the given root page and compiles the URL rows for every page in it.
     *
     * The crawl options are stored on the controller first, because drawURLs_addRowsForPage()
     * reads them from there. Pages of doktype "mount point" additionally contribute the rows of
     * the mounted tree.
     *
     * @param int $id Root page id to start from.
     * @param int $depth Depth of tree, 0=only id-page, 1= on sublevel, 99 = infinite
     * @param int $scheduledTime Unix Time when the URL is timed to be visited when put in queue
     * @param int $reqMinute Number of requests per minute (creates the interleave between requests)
     * @param bool $submitCrawlUrls If set, submits the URLs to queue in database (real crawling)
     * @param bool $downloadCrawlUrls If set (and submitcrawlUrls is false) will fill $downloadUrls with entries)
     * @param array $incomingProcInstructions Array of processing instructions
     * @param array $configurationSelection Array of configuration keys
     * @return string Concatenated output of drawURLs_addRowsForPage() for all pages of the tree
     */
    public function getPageTreeAndUrls(
        $id,
        $depth,
        $scheduledTime,
        $reqMinute,
        $submitCrawlUrls,
        $downloadCrawlUrls,
        array $incomingProcInstructions,
        array $configurationSelection
    ) {
        $this->scheduledTime = $scheduledTime;
        $this->reqMinute = $reqMinute;
        $this->submitCrawlUrls = $submitCrawlUrls;
        $this->downloadCrawlUrls = $downloadCrawlUrls;
        $this->incomingProcInstructions = $incomingProcInstructions;
        $this->incomingConfigurationSelection = $configurationSelection;

        $this->duplicateTrack = [];
        $this->downloadUrls = [];

        // Drawing tree:
        /* @var PageTreeView $tree */
        $tree = GeneralUtility::makeInstance(PageTreeView::class);
        $perms_clause = $this->getBackendUser()->getPagePermsClause(Permission::PAGE_SHOW);
        $tree->init('AND ' . $perms_clause);

        $pageInfo = BackendUtility::readPageAccess($id, $perms_clause);
        if (is_array($pageInfo)) {
            // Set root row:
            $tree->tree[] = [
                'row' => $pageInfo,
                'HTML' => $this->iconFactory->getIconForRecord('pages', $pageInfo, Icon::SIZE_SMALL),
            ];
        }

        // Get branch beneath:
        if ($depth) {
            $tree->getTree($id, $depth, '');
        }

        // Traverse page tree:
        $code = '';

        foreach ($tree->tree as $data) {
            $this->MP = false;

            // recognize mount points
            if ((int)$data['row']['doktype'] === PageRepository::DOKTYPE_MOUNTPOINT) {
                // Ask for the builder of the table that is actually queried ("pages").
                $queryBuilder = GeneralUtility::makeInstance(ConnectionPool::class)->getQueryBuilderForTable('pages');
                $queryBuilder->getRestrictions()->removeAll()->add(GeneralUtility::makeInstance(DeletedRestriction::class));
                $mountpage = $queryBuilder
                    ->select('*')
                    ->from('pages')
                    ->where(
                        $queryBuilder->expr()->eq('uid', $queryBuilder->createNamedParameter($data['row']['uid'], \PDO::PARAM_INT))
                    )
                    ->execute()
                    ->fetch();

                // Only handle the mount when the page row could really be loaded; otherwise the
                // page is rendered below like any other page, without a mount point parameter.
                if (is_array($mountpage)) {
                    // fetch mounted pages
                    $this->MP = $mountpage['mount_pid'] . '-' . $data['row']['uid'];

                    $mountTree = GeneralUtility::makeInstance(PageTreeView::class);
                    $mountTree->init('AND ' . $perms_clause);
                    $mountTree->getTree($mountpage['mount_pid'], $depth);

                    foreach ($mountTree->tree as $mountData) {
                        $code .= $this->drawURLs_addRowsForPage(
                            $mountData['row'],
                            $mountData['HTML'] . BackendUtility::getRecordTitle('pages', $mountData['row'], true)
                        );
                    }

                    if ($mountpage['mount_pid_ol']) {
                        // replace page when mount_pid_ol is enabled
                        $data['row']['uid'] = $mountpage['mount_pid'];
                    } else {
                        // if the mount_pid_ol is not set the MP must not be used for the mountpoint page
                        $this->MP = false;
                    }
                }
            }

            $code .= $this->drawURLs_addRowsForPage(
                $data['row'],
                $data['HTML'] . BackendUtility::getRecordTitle('pages', $data['row'], true)
            );
        }

        return $code;
    }
1518
1519
    /**
     * Expands an exclude string into the list of page ids it covers.
     *
     * The string is a comma separated list of parts of the form "<pid>", "<pid>+" or
     * "<pid>+<depth>":
     *  - "<pid>" excludes only that page (depth 0),
     *  - "<pid>+" (or an empty/zero depth after the "+") excludes the page and its subtree
     *    down to depth 99,
     *  - "<pid>+<depth>" excludes the page and its subtree down to the given depth.
     *
     * Results are kept in static variables: once per exclude string, and the page trees once
     * per pid/depth combination.
     *
     * @param string $excludeString Exclude string
     * @return array Unique list of excluded page ids
     */
    public function expandExcludeString($excludeString)
    {
        // internal static caches;
        static $expandedExcludeStringCache;
        static $treeCache;

        // isset() instead of empty(): an empty result is a valid cache entry as well.
        if (!isset($expandedExcludeStringCache[$excludeString])) {
            $pidList = [];

            if (!empty($excludeString)) {
                /** @var PageTreeView $tree */
                $tree = GeneralUtility::makeInstance(PageTreeView::class);
                $tree->init('AND ' . $this->getBackendUser()->getPagePermsClause(Permission::PAGE_SHOW));

                $excludeParts = GeneralUtility::trimExplode(',', $excludeString);

                foreach ($excludeParts as $excludePart) {
                    // A part without "+" yields a single element; pad it so that the depth is
                    // always defined instead of raising an "undefined offset" notice.
                    [$pid, $depth] = array_pad(GeneralUtility::trimExplode('+', $excludePart), 2, '');

                    // default is "page only" = "depth=0"
                    if (empty($depth)) {
                        $depth = strpos($excludePart, '+') !== false ? 99 : 0;
                    }
                    // Compare and pass the depth as integer; a non-numeric depth counts as 0.
                    $depth = (int)$depth;

                    $pidList[] = $pid;

                    if ($depth > 0) {
                        if (empty($treeCache[$pid][$depth])) {
                            $tree->reset();
                            $tree->getTree($pid, $depth);
                            $treeCache[$pid][$depth] = $tree->tree;
                        }

                        foreach ($treeCache[$pid][$depth] as $data) {
                            $pidList[] = $data['row']['uid'];
                        }
                    }
                }
            }

            $expandedExcludeStringCache[$excludeString] = array_unique($pidList);
        }

        return $expandedExcludeStringCache[$excludeString];
    }
1570
1571
    /**
     * Creates the table rows shown for one page of the page tree.
     * For each crawler configuration that applies to the page one row is rendered, displaying
     * the GET variable configuration; a single "No entries" row is rendered when there is none.
     *
     * @param array $pageRow Page row
     * @param string $pageTitleAndIcon Page icon and title for row
     * @return string HTML <tr> content (one or more)
     */
    public function drawURLs_addRowsForPage(array $pageRow, $pageTitleAndIcon)
    {
        // NOTE(review): presumably filled by reference by getUrlsForPageRow() - confirm.
        $skipMessage = '';

        // Get list of configurations
        $configurations = $this->getUrlsForPageRow($pageRow, $skipMessage);

        if (!empty($this->incomingConfigurationSelection)) {
            // drop every configuration that is not part of the current selection
            foreach ($configurations as $selectionKey => $unusedConfiguration) {
                if (in_array($selectionKey, $this->incomingConfigurationSelection)) {
                    continue;
                }
                unset($configurations[$selectionKey]);
            }
        }

        $rowIndex = 0;
        $content = '';
        if (empty($configurations)) {
            $message = empty($skipMessage) ? '' : ' (' . $skipMessage . ')';

            // Compile row:
            $content .= '
                <tr>
                    <td>' . $pageTitleAndIcon . '</td>
                    <td colspan="6"><em>No entries</em>' . $message . '</td>
                </tr>';
        } else {
            // Traverse parameter combinations:
            foreach ($configurations as $confKey => $confArray) {
                // Title column: only the first row carries it, spanning all rows of this page.
                $titleClm = $rowIndex === 0
                    ? '<td rowspan="' . count($configurations) . '">' . $pageTitleAndIcon . '</td>'
                    : '';

                $pageIsExcluded = in_array(
                    $pageRow['uid'],
                    $this->expandExcludeString($confArray['subCfg']['exclude'])
                );

                if ($pageIsExcluded) {
                    $content .= '<tr>
                            ' . $titleClm . '
                            <td>' . htmlspecialchars($confKey) . '</td>
                            <td colspan="5"><em>No entries</em> (Page is excluded in this configuration)</td>
                        </tr>';
                } else {
                    // URL list:
                    $urlList = $this->urlListFromUrlArray(
                        $confArray,
                        $pageRow,
                        $this->scheduledTime,
                        $this->reqMinute,
                        $this->submitCrawlUrls,
                        $this->downloadCrawlUrls,
                        $this->duplicateTrack,
                        $this->downloadUrls,
                        $this->incomingProcInstructions // if empty the urls won't be filtered by processing instructions
                    );

                    // Expanded parameters: one inner row per GET variable plus the number of combinations.
                    $paramRows = '';
                    $valueCounts = [];
                    $combinations = 1;
                    foreach ($confArray['paramExpanded'] as $gVar => $gVal) {
                        $valueCount = count($gVal);
                        $paramRows .= '
                            <tr>
                                <td>' . htmlspecialchars('&' . $gVar . '=') . '<br/>' .
                                '(' . $valueCount . ')' .
                                '</td>
                                <td nowrap="nowrap">' . nl2br(htmlspecialchars(implode("\n", $gVal))) . '</td>
                            </tr>
                        ';
                        $combinations *= $valueCount;
                        $valueCounts[] = $valueCount;
                    }
                    $paramExpanded = '<table>' . $paramRows . '</table>'
                        . 'Comb: ' . implode('*', $valueCounts) . '=' . $combinations;

                    // Options
                    $optionValues = '';
                    if ($confArray['subCfg']['userGroups']) {
                        $optionValues .= 'User Groups: ' . $confArray['subCfg']['userGroups'] . '<br/>';
                    }
                    if ($confArray['subCfg']['procInstrFilter']) {
                        $optionValues .= 'ProcInstr: ' . $confArray['subCfg']['procInstrFilter'] . '<br/>';
                    }

                    // Parsed parameters, one "&name=value" pair per line.
                    $parsedParameters = rawurldecode(trim(str_replace(
                        '&',
                        "\n" . '&',
                        GeneralUtility::implodeArrayForUrl('', $confArray['paramParsed'])
                    )));

                    // Compile row:
                    $content .= '
                        <tr>
                            ' . $titleClm . '
                            <td>' . htmlspecialchars($confKey) . '</td>
                            <td>' . nl2br(htmlspecialchars($parsedParameters)) . '</td>
                            <td>' . $paramExpanded . '</td>
                            <td nowrap="nowrap">' . $urlList . '</td>
                            <td nowrap="nowrap">' . $optionValues . '</td>
                            <td nowrap="nowrap">' . DebugUtility::viewArray($confArray['subCfg']['procInstrParams.']) . '</td>
                        </tr>';
                }

                $rowIndex++;
            }
        }

        return $content;
    }
1685
1686
    /*****************************
1687
     *
1688
     * CLI functions
1689
     *
1690
     *****************************/
1691
1692
    /**
     * Runs the functionality of the CLI (crawling URLs from the queue).
     *
     * Runs the CLI hooks, optionally purges old executed entries, reserves up to $countInARun
     * due queue entries for this process and reads them one after another.
     *
     * @param int $countInARun Maximum number of queue entries selected for this run
     * @param int $sleepTime Passed to usleep() after every processed entry
     * @param int $sleepAfterFinish Passed to sleep() once all selected entries have been handled
     * @return int Bitwise combination of the CLI_STATUS_* flags (0 if nothing was processed)
     */
    public function CLI_run($countInARun, $sleepTime, $sleepAfterFinish)
    {
        $result = 0;
        $counter = 0;

        // First, run hooks:
        $this->CLI_runHooks();

        // Clean up the queue
        $purgeQueueDays = (int)$this->extensionSettings['purgeQueueDays'];
        if ($purgeQueueDays > 0) {
            $purgeDate = $this->getCurrentTime() - 86400 * $purgeQueueDays;

            $purgeQuery = GeneralUtility::makeInstance(ConnectionPool::class)->getQueryBuilderForTable($this->tableName);
            $deleted = $purgeQuery
                ->delete($this->tableName)
                ->where('exec_time != 0 AND exec_time < ' . $purgeDate)
                ->execute();

            if ($deleted === false) {
                $this->logger->info('Records could not be deleted.');
            }
        }

        // Select entries that are due, not executed yet and not reserved by any process:
        //TODO Shouldn't this reside within the transaction?
        $selectQuery = GeneralUtility::makeInstance(ConnectionPool::class)->getQueryBuilderForTable($this->tableName);
        $rows = $selectQuery
            ->select('qid', 'scheduled')
            ->from($this->tableName)
            ->where(
                $selectQuery->expr()->eq('exec_time', 0),
                $selectQuery->expr()->eq('process_scheduled', 0),
                $selectQuery->expr()->lte('scheduled', $this->getCurrentTime())
            )
            ->orderBy('scheduled')
            ->addOrderBy('qid')
            ->setMaxResults($countInARun)
            ->execute()
            ->fetchAll();

        if (empty($rows)) {
            $this->CLI_debug('Nothing within queue which needs to be processed (' . $this->CLI_buildProcessId() . ')');
        } else {
            $queueIds = array_column($rows, 'qid');
            $processId = $this->CLI_buildProcessId();

            // Reserve the queue entries for this process.
            //TODO make sure we're not taking assigned queue-entries
            $reserveQuery = GeneralUtility::makeInstance(ConnectionPool::class)->getQueryBuilderForTable($this->tableName);
            $numberOfAffectedRows = $reserveQuery
                ->update($this->tableName)
                ->where(
                    $reserveQuery->expr()->in('qid', $queueIds)
                )
                ->set('process_scheduled', $this->getCurrentTime())
                ->set('process_id', $processId)
                ->execute();

            // Save the number of assigned queue entries to determine how many have been processed later.
            GeneralUtility::makeInstance(ConnectionPool::class)->getConnectionForTable('tx_crawler_process')
                ->update(
                    'tx_crawler_process',
                    ['assigned_items_count' => (int)$numberOfAffectedRows],
                    ['process_id' => $processId]
                );

            // Fewer (or more) reserved rows than selected ones is treated as a collision with another process.
            if ($numberOfAffectedRows != count($queueIds)) {
                $this->CLI_debug('Nothing processed due to multi-process collision (' . $this->CLI_buildProcessId() . ')');
                return ($result | self::CLI_STATUS_ABORTED);
            }

            foreach ($rows as $queueRow) {
                $result |= $this->readUrl($queueRow['qid']);

                $counter++;
                usleep((int)$sleepTime); // Just to relax the system

                // If the CLI has been disabled between the start and the current read url we need
                // to return from the function; the process is NOT marked as ended.
                if ($this->getDisabled()) {
                    return ($result | self::CLI_STATUS_ABORTED);
                }

                if (!$this->processRepository->isProcessActive($this->CLI_buildProcessId())) {
                    $this->CLI_debug('conflict / timeout (' . $this->CLI_buildProcessId() . ')');

                    //TODO might need an additional returncode
                    $result |= self::CLI_STATUS_ABORTED;
                    break; //possible timeout
                }
            }

            sleep((int)$sleepAfterFinish);

            $this->CLI_debug('Rows: ' . $counter . ' (' . $this->CLI_buildProcessId() . ')');
        }

        if ($counter > 0) {
            $result |= self::CLI_STATUS_PROCESSED;
        }

        return $result;
    }
1818
1819
    /**
     * Activates the CLI hooks: instantiates every class registered in
     * $GLOBALS['TYPO3_CONF_VARS']['EXTCONF']['crawler']['cli_hooks'] and calls its
     * crawler_init() method with this controller.
     *
     * @return void
     */
    public function CLI_runHooks(): void
    {
        $registeredHooks = $GLOBALS['TYPO3_CONF_VARS']['EXTCONF']['crawler']['cli_hooks'] ?? [];

        foreach ($registeredHooks as $hookClassName) {
            $hook = GeneralUtility::makeInstance($hookClassName);
            if (!is_object($hook)) {
                continue;
            }
            $hook->crawler_init($this);
        }
    }
1833
1834
    /**
     * Tries to acquire a new process with the given id.
     * Also performs some auto-cleanup: active processes whose ttl has passed are released as orphans.
     * @todo preemption might not be the most elegant way to clean up
     *
     * @param string $id identification string for the process
     * @return bool true if a process record was added, false if the system process id is
     *              unavailable or the process limit is reached
     */
    public function CLI_checkAndAcquireNewProcess($id)
    {
        $processQuery = GeneralUtility::makeInstance(ConnectionPool::class)->getQueryBuilderForTable($this->tableName);
        $acquired = true;

        $systemProcessId = getmypid();
        if ($systemProcessId < 1) {
            return false;
        }

        $activeProcesses = $processQuery
            ->select('process_id', 'ttl')
            ->from('tx_crawler_process')
            ->where('active = 1 AND deleted = 0')
            ->execute()
            ->fetchAll();

        $currentTime = $this->getCurrentTime();

        // Split the active processes into expired ones (orphans) and those still counting against the limit.
        $runningProcesses = 0;
        $orphanProcesses = [];
        foreach ($activeProcesses as $process) {
            if ($process['ttl'] < $currentTime) {
                $orphanProcesses[] = $process['process_id'];
                continue;
            }
            $runningProcesses++;
        }

        $processLimit = (int)$this->extensionSettings['processLimit'];

        if ($runningProcesses >= $processLimit) {
            $this->CLI_debug('Processlimit reached (' . $runningProcesses . '/' . $processLimit . ')');
            $acquired = false;
        } else {
            // there are less than the allowed number of active processes, so add a new one
            $this->CLI_debug('add process ' . $this->CLI_buildProcessId() . ' (' . ($runningProcesses + 1) . '/' . $processLimit . ')');

            GeneralUtility::makeInstance(ConnectionPool::class)->getConnectionForTable('tx_crawler_process')->insert(
                'tx_crawler_process',
                [
                    'process_id' => $id,
                    'active' => 1,
                    'ttl' => $currentTime + (int)$this->extensionSettings['processMaxRunTime'],
                    'system_process_id' => $systemProcessId,
                ]
            );
        }

        $this->processRepository->deleteProcessesMarkedAsDeleted();
        $this->CLI_releaseProcesses($orphanProcesses, true); // maybe this should be somehow included into the current lock

        return $acquired;
    }
1898
1899
    /**
     * Releases processes and the queue entries reserved for them.
     *
     * All four UPDATE statements are issued through ONE query builder instance, so their order
     * and the resetQueryPart('set') calls between them matter and must not be rearranged.
     *
     * @param  mixed $releaseIds string with a single process-id or array with multiple process-ids
     * @param  bool  $withinLock whether the DB-actions are included within an existing lock; currently
     *                           without effect, as no transaction statements are issued here
     * @return bool  false if there was nothing to release, true otherwise
     */
    public function CLI_releaseProcesses($releaseIds, $withinLock = false)
    {
        $updateQuery = GeneralUtility::makeInstance(ConnectionPool::class)->getQueryBuilderForTable($this->tableName);

        if (!is_array($releaseIds)) {
            $releaseIds = [$releaseIds];
        }

        if (empty($releaseIds)) {
            // nothing to release
            return false;
        }

        // some kind of 2nd chance algo - this way you need at least 2 processes to have a real cleanup
        // this ensures that a single process can't mess up the entire process table

        // 1) detach all queue entries that are still assigned to a non-active process
        $updateQuery
            ->update($this->tableName, 'q')
            ->where(
                'q.process_id IN(SELECT p.process_id FROM tx_crawler_process as p WHERE p.active = 0)'
            )
            ->set('q.process_scheduled', 0)
            ->set('q.process_id', '')
            ->execute();

        $updateQuery->resetQueryPart('set');

        // 2) reset the system process id of non-active processes that are referenced by unprocessed queue entries
        // FIXME: Not entirely sure that this is equivalent to the previous version, which marked
        // non-active, non-deleted processes WITHOUT unprocessed queue entries as deleted and reset
        // their system_process_id.
        $updateQuery
            ->update('tx_crawler_process')
            ->where(
                $updateQuery->expr()->eq('active', 0),
                'process_id IN(SELECT q.process_id FROM tx_crawler_queue as q WHERE q.exec_time = 0)'
            )
            ->set('system_process_id', 0)
            ->execute();

        // NOTE(review): the "set" part is not reset before the next statement; assuming set()
        // appends, statement 3 presumably writes system_process_id = 0 as well - confirm intent.

        // 3) mark all requested processes as non-active, unless they still have unprocessed queue entries
        $updateQuery
            ->update('tx_crawler_process')
            ->where(
                'NOT EXISTS (
                SELECT * FROM tx_crawler_queue
                    WHERE tx_crawler_queue.process_id = tx_crawler_process.process_id
                    AND tx_crawler_queue.exec_time = 0
                )',
                $updateQuery->expr()->in('process_id', $updateQuery->createNamedParameter($releaseIds, Connection::PARAM_STR_ARRAY)),
                $updateQuery->expr()->eq('deleted', 0)
            )
            ->set('active', 0)
            ->execute();

        $updateQuery->resetQueryPart('set');

        // 4) detach the unprocessed queue entries of the requested processes
        $updateQuery
            ->update($this->tableName)
            ->where(
                $updateQuery->expr()->eq('exec_time', 0),
                $updateQuery->expr()->in('process_id', $updateQuery->createNamedParameter($releaseIds, Connection::PARAM_STR_ARRAY))
            )
            ->set('process_scheduled', 0)
            ->set('process_id', '')
            ->execute();

        return true;
    }
1993
1994
    /**
     * Returns the id of the current process. It is generated on first use from a short md5
     * of the current microtime and then kept in $this->processID.
     *
     * @return string  the ID
     */
    public function CLI_buildProcessId()
    {
        if ($this->processID) {
            return $this->processID;
        }

        $this->processID = GeneralUtility::shortMD5(microtime(true));

        return $this->processID;
    }
2006
2007
    /**
     * Prints a message followed by a newline and flushes the output - but only if the
     * extension setting "processDebug" is enabled.
     *
     * @param  string $msg  the message
     */
    public function CLI_debug($msg): void
    {
        $debugEnabled = (bool)(int)$this->extensionSettings['processDebug'];
        if (!$debugEnabled) {
            return;
        }

        echo $msg . "\n";
        flush();
    }
2019
2020
    /**
     * Cleans up entries that stayed for too long in the queue. These are:
     * - processed entries older than the extension setting "cleanUpProcessedAge" (in days)
     * - entries scheduled longer ago than the extension setting "cleanUpScheduledAge" (in days)
     *
     * @return void
     */
    public function cleanUpOldQueueEntries(): void
    {
        $secondsPerDay = 86400; // 24*60*60 seconds in 24 hours
        $processedAgeInSeconds = $this->extensionSettings['cleanUpProcessedAge'] * $secondsPerDay;
        $scheduledAgeInSeconds = $this->extensionSettings['cleanUpScheduledAge'] * $secondsPerDay;

        $now = time();
        $processedBefore = $now - $processedAgeInSeconds;
        $scheduledBefore = $now - $scheduledAgeInSeconds;

        $this->flushQueue(
            '(exec_time<>0 AND exec_time<' . $processedBefore . ') OR scheduled<=' . $scheduledBefore
        );
    }
2036
2037
    /**
     * Returns the md5 hash of the serialized configuration array, ignoring its
     * 'paramExpanded' and 'URLs' entries.
     *
     * @param array $configuration
     *
     * @return string
     */
    protected function getConfigurationHash(array $configuration)
    {
        // The array was passed by value, so the caller's configuration stays untouched.
        unset($configuration['paramExpanded'], $configuration['URLs']);

        $serializedConfiguration = serialize($configuration);

        return md5($serializedConfiguration);
    }
2050
2051
    /**
     * Build a URL from a page id and a query string.
     *
     * If the page is matched to a Site, the URL is generated by that Site's router.
     * Otherwise a legacy "index.php" URL including a cHash is assembled by hand.
     *
     * @param int $pageId
     * @param string $queryString query string; leading "?" and "&" are stripped in the Site branch,
     *                            in the legacy branch it is appended verbatim after "index.php"
     * @param string|null $alternativeBaseUrl when non-empty: in the Site branch its host, scheme and port
     *                                        replace those of the generated URL; in the legacy branch it
     *                                        is used instead of TYPO3_SITE_URL
     * @param int $httpsOrHttp see tx_crawler_configuration.force_ssl; -1 forces "http", 1 forces "https",
     *                         any other value leaves the scheme untouched
     * @return UriInterface
     * @throws \TYPO3\CMS\Core\Exception\SiteNotFoundException
     * @throws \TYPO3\CMS\Core\Routing\InvalidRouteArgumentsException
     */
    protected function getUrlFromPageAndQueryParameters(int $pageId, string $queryString, ?string $alternativeBaseUrl, int $httpsOrHttp): UriInterface
    {
        $site = GeneralUtility::makeInstance(SiteMatcher::class)->matchByPageId($pageId);
        if ($site instanceof Site) {
            $queryParts = [];
            parse_str(ltrim($queryString, '?&'), $queryParts);
            // The page id is passed to the router separately, so it must not appear as a query argument.
            unset($queryParts['id']);
            // workaround as long as we don't have native language support in crawler configurations:
            // hand the language id to the router through the "_language" argument instead of "L".
            // NOTE(review): the SiteLanguage lookup that used to happen here was never used; the router
            // is presumed to resolve "_language" itself - confirm against the targeted TYPO3 version.
            if (isset($queryParts['L'])) {
                $queryParts['_language'] = $queryParts['L'];
                unset($queryParts['L']);
            }
            $url = $site->getRouter()->generateUri($pageId, $queryParts);
            if (!empty($alternativeBaseUrl)) {
                // Keep the string parameter intact and work on a dedicated Uri instance.
                $alternativeUri = new Uri($alternativeBaseUrl);
                $url = $url
                    ->withHost($alternativeUri->getHost())
                    ->withScheme($alternativeUri->getScheme())
                    ->withPort($alternativeUri->getPort());
            }
        } else {
            // Technically this is not possible with site handling, but kept for backwards-compatibility reasons
            // Once EXT:crawler is v10-only compatible, this should be removed completely
            $baseUrl = ($alternativeBaseUrl ?: GeneralUtility::getIndpEnv('TYPO3_SITE_URL'));
            $cacheHashCalculator = GeneralUtility::makeInstance(CacheHashCalculator::class);
            $queryWithCacheHash = $queryString . '&cHash=' . $cacheHashCalculator->generateForParameters($queryString);
            $url = new Uri(rtrim($baseUrl, '/') . '/index.php' . $queryWithCacheHash);
        }

        // Optionally force the scheme, independent of how the URL was built.
        if ($httpsOrHttp === -1) {
            $url = $url->withScheme('http');
        } elseif ($httpsOrHttp === 1) {
            $url = $url->withScheme('https');
        }

        return $url;
    }
2104
}
2105