Passed
Push — deprecations ( 3cb403...87297a )
by Tomas Norre
10:49 queued 06:55
created

CrawlerController::urlListFromUrlArray()   B

Complexity

Conditions 8
Paths 8

Size

Total Lines 66
Code Lines 38

Duplication

Lines 0
Ratio 0 %

Code Coverage

Tests 33
CRAP Score 8.1458

Importance

Changes 1
Bugs 0 Features 0
Metric Value
cc 8
eloc 38
c 1
b 0
f 0
nc 8
nop 9
dl 0
loc 66
ccs 33
cts 38
cp 0.8684
crap 8.1458
rs 8.0675

How to fix   Long Method    Many Parameters   

Long Method

Small methods make your code easier to understand, in particular if combined with a good name. Besides, if your method is small, finding a good name is usually much easier.

For example, if you find yourself adding comments to a method's body, this is usually a good sign to extract the commented part to a new method, and use the comment as a starting point when coming up with a good name for this new method.

Commonly applied refactorings include: Extract Method.

Many Parameters

Methods with many parameters are not only hard to understand, but their parameters also often become inconsistent when you need more, or different data.

There are several approaches to avoid long parameter lists: Replace Parameter with Method Call, Preserve Whole Object, and Introduce Parameter Object.

1
<?php
2
3
declare(strict_types=1);
4
5
namespace AOE\Crawler\Controller;
6
7
/***************************************************************
8
 *  Copyright notice
9
 *
10
 *  (c) 2020 AOE GmbH <[email protected]>
11
 *
12
 *  All rights reserved
13
 *
14
 *  This script is part of the TYPO3 project. The TYPO3 project is
15
 *  free software; you can redistribute it and/or modify
16
 *  it under the terms of the GNU General Public License as published by
17
 *  the Free Software Foundation; either version 3 of the License, or
18
 *  (at your option) any later version.
19
 *
20
 *  The GNU General Public License can be found at
21
 *  http://www.gnu.org/copyleft/gpl.html.
22
 *
23
 *  This script is distributed in the hope that it will be useful,
24
 *  but WITHOUT ANY WARRANTY; without even the implied warranty of
25
 *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
26
 *  GNU General Public License for more details.
27
 *
28
 *  This copyright notice MUST APPEAR in all copies of the script!
29
 ***************************************************************/
30
31
use AOE\Crawler\Configuration\ExtensionConfigurationProvider;
32
use AOE\Crawler\Domain\Repository\ConfigurationRepository;
33
use AOE\Crawler\Domain\Repository\ProcessRepository;
34
use AOE\Crawler\Domain\Repository\QueueRepository;
35
use AOE\Crawler\Event\EventDispatcher;
36
use AOE\Crawler\QueueExecutor;
37
use AOE\Crawler\Utility\SignalSlotUtility;
38
use Psr\Http\Message\UriInterface;
39
use Psr\Log\LoggerAwareInterface;
40
use Psr\Log\LoggerAwareTrait;
41
use TYPO3\CMS\Backend\Tree\View\PageTreeView;
42
use TYPO3\CMS\Backend\Utility\BackendUtility;
43
use TYPO3\CMS\Core\Authentication\BackendUserAuthentication;
44
use TYPO3\CMS\Core\Core\Bootstrap;
45
use TYPO3\CMS\Core\Core\Environment;
46
use TYPO3\CMS\Core\Database\Connection;
47
use TYPO3\CMS\Core\Database\ConnectionPool;
48
use TYPO3\CMS\Core\Database\Query\Restriction\DeletedRestriction;
49
use TYPO3\CMS\Core\Http\Uri;
50
use TYPO3\CMS\Core\Imaging\Icon;
51
use TYPO3\CMS\Core\Imaging\IconFactory;
52
use TYPO3\CMS\Core\Routing\SiteMatcher;
53
use TYPO3\CMS\Core\Site\Entity\Site;
54
use TYPO3\CMS\Core\Type\Bitmask\Permission;
55
use TYPO3\CMS\Core\TypoScript\Parser\TypoScriptParser;
56
use TYPO3\CMS\Core\Utility\DebugUtility;
57
use TYPO3\CMS\Core\Utility\GeneralUtility;
58
use TYPO3\CMS\Core\Utility\MathUtility;
59
use TYPO3\CMS\Extbase\Object\ObjectManager;
60
use TYPO3\CMS\Frontend\Page\CacheHashCalculator;
61
use TYPO3\CMS\Frontend\Page\PageRepository;
62
63
/**
64
 * Class CrawlerController
65
 *
66
 * @package AOE\Crawler\Controller
67
 */
68
class CrawlerController implements LoggerAwareInterface
69
{
70
    use LoggerAwareTrait;
71
72
    public const CLI_STATUS_NOTHING_PROCCESSED = 0;
73
74
    public const CLI_STATUS_REMAIN = 1; //queue not empty
75
76
    public const CLI_STATUS_PROCESSED = 2; // (some) queue items were processed
77
78
    public const CLI_STATUS_ABORTED = 4; //instance didn't finish
79
80
    public const CLI_STATUS_POLLABLE_PROCESSED = 8;
81
82
    /**
83
     * @var integer
84
     */
85
    public $setID = 0;
86
87
    /**
88
     * @var string
89
     */
90
    public $processID = '';
91
92
    /**
93
     * @var array
94
     */
95
    public $duplicateTrack = [];
96
97
    /**
98
     * @var array
99
     */
100
    public $downloadUrls = [];
101
102
    /**
103
     * @var array
104
     */
105
    public $incomingProcInstructions = [];
106
107
    /**
108
     * @var array
109
     */
110
    public $incomingConfigurationSelection = [];
111
112
    /**
113
     * @var bool
114
     */
115
    public $registerQueueEntriesInternallyOnly = false;
116
117
    /**
118
     * @var array
119
     */
120
    public $queueEntries = [];
121
122
    /**
123
     * @var array
124
     */
125
    public $urlList = [];
126
127
    /**
128
     * @var array
129
     */
130
    public $extensionSettings = [];
131
132
    /**
133
     * Mount Point
134
     *
135
     * @var bool
136
     * Todo: Check what this is used for and adjust the type hint or code, as bool doesn't match the current code.
137
     */
138
    public $MP = false;
139
140
    /**
141
     * @var string
142
     */
143
    protected $processFilename;
144
145
    /**
146
     * Holds the internal access mode; can be 'gui', 'cli' or 'cli_im'.
147
     *
148
     * @var string
149
     */
150
    protected $accessMode;
151
152
    /**
153
     * @var QueueRepository
154
     */
155
    protected $queueRepository;
156
157
    /**
158
     * @var ProcessRepository
159
     */
160
    protected $processRepository;
161
162
    /**
163
     * @var ConfigurationRepository
164
     */
165
    protected $configurationRepository;
166
167
    /**
168
     * @var string
169
     */
170
    protected $tableName = 'tx_crawler_queue';
171
172
    /**
173
     * @var QueueExecutor
174
     */
175
    protected $queueExecutor;
176
177
    /**
178
     * @var int
179
     */
180
    protected $maximumUrlsToCompile = 10000;
181
182
    /**
183
     * @var IconFactory
184
     */
185
    protected $iconFactory;
186
187
    /**
188
     * @var BackendUserAuthentication|null
189
     */
190
    private $backendUser;
191
192
    /**
193
     * @var integer
194
     */
195
    private $scheduledTime = 0;
196
197
    /**
198
     * @var integer
199
     */
200
    private $reqMinute = 0;
201
202
    /**
203
     * @var bool
204
     */
205
    private $submitCrawlUrls = false;
206
207
    /**
208
     * @var bool
209
     */
210
    private $downloadCrawlUrls = false;
211
212
    /************************************
213
     *
214
     * Getting URLs based on Page TSconfig
215
     *
216
     ************************************/
217
218 41
    /**
     * Resolves the repositories, queue executor and icon factory, sets the
     * default lock file path and loads the extension configuration.
     *
     * Numeric settings are normalised after loading: "countInARun" falls back
     * to 100 when it converts to 0, "processLimit" is forced into 1..99 and
     * "maxCompileUrls" into 1..1000000000.
     */
    public function __construct()
    {
        $objectManager = GeneralUtility::makeInstance(ObjectManager::class);
        $this->queueRepository = $objectManager->get(QueueRepository::class);
        $this->processRepository = $objectManager->get(ProcessRepository::class);
        $this->configurationRepository = $objectManager->get(ConfigurationRepository::class);
        $this->queueExecutor = $objectManager->get(QueueExecutor::class);
        $this->iconFactory = GeneralUtility::makeInstance(IconFactory::class);

        $this->processFilename = Environment::getVarPath() . '/lock/tx_crawler.proc';

        /** @var ExtensionConfigurationProvider $configurationProvider */
        $configurationProvider = GeneralUtility::makeInstance(ExtensionConfigurationProvider::class);
        $settings = $configurationProvider->getExtensionConfiguration();
        $this->extensionSettings = is_array($settings) ? $settings : [];

        // Every setting is read with a "?? 0" fallback so that a configuration
        // lacking one of the keys does not trigger an undefined-index notice.
        if (MathUtility::convertToPositiveInteger($this->extensionSettings['countInARun'] ?? 0) === 0) {
            $this->extensionSettings['countInARun'] = 100;
        }

        $this->extensionSettings['processLimit'] = MathUtility::forceIntegerInRange(
            $this->extensionSettings['processLimit'] ?? 0,
            1,
            99,
            1
        );
        $this->maximumUrlsToCompile = MathUtility::forceIntegerInRange(
            $this->extensionSettings['maxCompileUrls'] ?? 0,
            1,
            1000000000,
            10000
        );
    }
242
243
    /**
     * Returns the current access mode ('gui', 'cli' or 'cli_im').
     *
     * @return string
     */
    public function getAccessMode()
    {
        return $this->accessMode;
    }
252
253
    /**
     * Stores the access mode this controller runs in.
     *
     * @param string $accessMode One of 'gui', 'cli' or 'cli_im'
     */
    public function setAccessMode($accessMode): void
    {
        $this->accessMode = $accessMode;
    }
260
261
    /**
     * Switches the "disabled" marker on or off.
     *
     * Disabling writes an empty file at the process filename; enabling removes
     * that file when it exists. getDisabled() reports the state by checking
     * for the same file.
     *
     * @param bool $disabled (optional, defaults to true)
     */
    public function setDisabled($disabled = true): void
    {
        if ($disabled) {
            GeneralUtility::writeFile($this->processFilename, '');
            return;
        }

        if (is_file($this->processFilename)) {
            unlink($this->processFilename);
        }
    }
276
277
    /**
     * Tells whether processing is currently disabled, i.e. whether the marker
     * file at the process filename exists.
     *
     * @return bool true if disabled
     */
    public function getDisabled()
    {
        return is_file($this->processFilename);
    }
286
287
    /**
     * Overrides the path of the marker file used by setDisabled()/getDisabled().
     *
     * @param string $filenameWithPath Full path including the file name
     */
    public function setProcessFilename($filenameWithPath): void
    {
        $this->processFilename = $filenameWithPath;
    }
294
295
    /**
     * Returns the path of the marker file used by setDisabled()/getDisabled().
     *
     * @return string
     */
    public function getProcessFilename()
    {
        return $this->processFilename;
    }
302
303
    /**
     * Replaces the extension settings (unserialized pendant of
     * $TYPO3_CONF_VARS['EXT']['extConf']['crawler']) with the given array.
     *
     * The array is stored as passed; the defaults applied in the constructor
     * are not re-applied here.
     */
    public function setExtensionSettings(array $extensionSettings): void
    {
        $this->extensionSettings = $extensionSettings;
    }
310
311
    /**
     * Check if the given page should be crawled.
     *
     * The checks run in this order and the first match wins:
     *  1. the page is hidden and "crawlHiddenPages" is not enabled,
     *  2. the doktype is 3, 4 or >= 199,
     *  3. the doktype is listed in an "excludeDoktype" registration,
     *  4. a "pageVeto" hook returns anything other than false.
     *
     * @param array $pageRow Page record; 'hidden' and 'doktype' are evaluated
     * @return false|string false if the page should be crawled (not excluded), otherwise the message explaining why it is skipped
     */
    public function checkIfPageShouldBeSkipped(array $pageRow)
    {
        // empty() keeps the original truthiness test but tolerates missing keys.
        if (empty($this->extensionSettings['crawlHiddenPages']) && ! empty($pageRow['hidden'])) {
            return 'Because page is hidden';
        }

        $doktype = $pageRow['doktype'] ?? null;
        if (GeneralUtility::inList('3,4', $doktype) || $doktype >= 199) {
            return 'Because doktype is not allowed';
        }

        foreach ($GLOBALS['TYPO3_CONF_VARS']['EXTCONF']['crawler']['excludeDoktype'] ?? [] as $key => $doktypeList) {
            if (GeneralUtility::inList($doktypeList, $doktype)) {
                return 'Doktype was excluded by "' . $key . '"';
            }
        }

        // veto hook
        foreach ($GLOBALS['TYPO3_CONF_VARS']['EXTCONF']['crawler']['pageVeto'] ?? [] as $key => $func) {
            $params = [
                'pageRow' => $pageRow,
            ];
            // expects "false" if page is ok and "true" or a skipMessage if this page should _not_ be crawled
            $veto = GeneralUtility::callUserFunction($func, $params, $this);
            if ($veto === false) {
                continue;
            }

            // No need to execute other hooks once one of them returned a veto.
            if (is_string($veto)) {
                return $veto;
            }

            // Hooks registered with "[] = ..." have integer keys; with strict_types
            // enabled htmlspecialchars() only accepts strings, hence the cast.
            return 'Veto from hook "' . htmlspecialchars((string) $key) . '"';
        }

        return false;
    }
369
370
    /**
     * Wrapper method for getUrlsForPageId()
     * It returns an array of configurations and no urls!
     *
     * The page is first passed to checkIfPageShouldBeSkipped(); when it is to
     * be skipped, an empty array is returned and the reason is written to
     * $skipMessage. Otherwise $skipMessage is reset to an empty string.
     *
     * @param array $pageRow Page record with at least dok-type and uid columns.
     * @param string $skipMessage Receives the skip reason (by reference)
     * @return array
     * @see getUrlsForPageId()
     */
    public function getUrlsForPageRow(array $pageRow, &$skipMessage = '')
    {
        $message = $this->checkIfPageShouldBeSkipped($pageRow);

        if ($message !== false) {
            $skipMessage = $message;
            return [];
        }

        // getUrlsForPageId() declares an int parameter and this file runs with
        // strict_types, so a uid delivered as a string must be cast explicitly.
        $res = $this->getUrlsForPageId((int) $pageRow['uid']);
        $skipMessage = '';

        return $res;
    }
393
394
    /**
     * Creates a list of URLs from input array (and submits them to queue if asked for)
     * See Web > Info module script + "indexed_search"'s crawler hook-client using this!
     *
     * Every URL is identified by "url|userGroups|procInstrFilter". Keys already
     * present in $duplicateTrack are only listed (muted) and not submitted again.
     *
     * @param array $vv Information about URLs from pageRow to crawl.
     * @param array $pageRow Page row
     * @param int $scheduledTime Unix time to schedule indexing to, typically time()
     * @param int $reqMinute Number of requests per minute (creates the interleave between requests)
     * @param bool $submitCrawlUrls If set, submits the URLs to queue
     * @param bool $downloadCrawlUrls If set (and $submitCrawlUrls is false) will fill $downloadUrls with entries
     * @param array $duplicateTrack Array which is passed by reference and contains an id per url to ensure duplicates are not crawled
     * @param array $downloadUrls Array which will be filled with URLs for download if flag is set.
     * @param array $incomingProcInstructions Array of processing instructions
     * @return string List of URLs (meant for display in backend module)
     */
    public function urlListFromUrlArray(
        array $vv,
        array $pageRow,
        $scheduledTime,
        $reqMinute,
        $submitCrawlUrls,
        $downloadCrawlUrls,
        array &$duplicateTrack,
        array &$downloadUrls,
        array $incomingProcInstructions
    ) {
        if (! is_array($vv['URLs'] ?? null)) {
            return 'ERROR - no URL generated';
        }

        $urlLog = [];
        $pageId = (int) $pageRow['uid'];
        $configurationHash = $this->getConfigurationHash($vv);
        $skipInnerCheck = $this->queueRepository->noUnprocessedQueueEntriesForPageWithConfigurationHashExist($pageId, $configurationHash);

        $procInstrFilter = $vv['subCfg']['procInstrFilter'] ?? '';
        $userGroups = $vv['subCfg']['userGroups'] ?? '';

        // The filter does not depend on the individual URL, so it is evaluated once.
        if (! $this->drawURLs_PIfilter($procInstrFilter, $incomingProcInstructions)) {
            return implode('<br>', $urlLog);
        }

        // Seconds between two requests; a non-positive rate would divide by zero.
        $secondsPerRequest = $reqMinute > 0 ? 60 / $reqMinute : 0;

        foreach ($vv['URLs'] as $urlQuery) {
            $url = (string) $this->getUrlFromPageAndQueryParameters(
                $pageId,
                $urlQuery,
                $vv['subCfg']['baseUrl'] ?? null,
                $vv['subCfg']['force_ssl'] ?? 0
            );

            // Create key by which to determine unique-ness:
            $uKey = $url . '|' . $userGroups . '|' . $procInstrFilter;

            if (isset($duplicateTrack[$uKey])) {
                // The URL key is already registered: just display it, do not resubmit it.
                $urlLog[] = '<em><span class="text-muted">' . htmlspecialchars($url) . '</span></em>';
            } else {
                // Scheduled time, spread by the request rate and floored to a full minute:
                $schTime = $scheduledTime + round(count($duplicateTrack) * $secondsPerRequest);
                $schTime = intval($schTime / 60) * 60;
                $formattedDate = BackendUtility::datetime($schTime);
                $this->urlList[] = '[' . $formattedDate . '] ' . $url;
                $urlList = '[' . $formattedDate . '] ' . htmlspecialchars($url);

                // Submit for crawling!
                if ($submitCrawlUrls) {
                    // NOTE(review): the queue entry receives $scheduledTime, not the
                    // interleaved $schTime shown in the list — confirm this is intended.
                    $added = $this->addUrl(
                        $pageId,
                        $url,
                        $vv['subCfg'],
                        $scheduledTime,
                        $configurationHash,
                        $skipInnerCheck
                    );
                    if ($added === false) {
                        $urlList .= ' (URL already existed)';
                    }
                } elseif ($downloadCrawlUrls) {
                    $downloadUrls[$url] = $url;
                }
                $urlLog[] = $urlList;
            }
            $duplicateTrack[$uKey] = true;
        }

        return implode('<br>', $urlLog);
    }
476
477
    /**
     * Tells whether a configuration's processing instructions match the
     * requested ones.
     *
     * With no incoming instructions everything matches. Otherwise at least one
     * incoming instruction has to be an item of the comma-separated $piString.
     *
     * @param string $piString PI to test
     * @param array $incomingProcInstructions Processing instructions
     * @return bool
     */
    public function drawURLs_PIfilter($piString, array $incomingProcInstructions)
    {
        if ($incomingProcInstructions === []) {
            return true;
        }

        foreach ($incomingProcInstructions as $instruction) {
            if (! GeneralUtility::inList($piString, $instruction)) {
                continue;
            }
            return true;
        }

        return false;
    }
497
498 3
    /**
     * Returns the page TSconfig for a page, or for the mount point when one is set.
     *
     * When $this->MP is set, its value is split at "-" and the second segment
     * is used as the page ID to read the TSconfig from.
     * NOTE(review): the property is documented as bool but is treated like an
     * "<id>-<id>" string here — confirm the intended format.
     *
     * Registered "getPageTSconfigForId" hooks receive the page ID and the
     * configuration (by reference) and may modify it.
     *
     * @param int $id Page ID
     * @return array
     */
    public function getPageTSconfigForId($id): array
    {
        if (! $this->MP) {
            $pageTSconfig = BackendUtility::getPagesTSconfig($id);
        } else {
            // Cast before splitting: with strict_types explode() rejects non-strings.
            // A value without "-" falls back to page 0 instead of an undefined offset.
            $mountPointParts = explode('-', (string) $this->MP);
            $mountPointId = (int) ($mountPointParts[1] ?? 0);
            $pageTSconfig = BackendUtility::getPagesTSconfig($mountPointId);
        }

        // Call a hook to alter configuration
        $hooks = $GLOBALS['TYPO3_CONF_VARS']['EXTCONF']['crawler']['getPageTSconfigForId'] ?? null;
        if (is_array($hooks)) {
            $params = [
                'pageId' => $id,
                'pageTSConfig' => &$pageTSconfig,
            ];
            foreach ($hooks as $userFunc) {
                GeneralUtility::callUserFunction($userFunc, $params, $this);
            }
        }

        return $pageTSconfig;
    }
520
521
    /**
     * Collects the crawler configurations that apply to a page.
     * Adds no urls!
     *
     * Configurations are gathered from two sources, in this order:
     *  1. page TSconfig ("tx_crawler.crawlerCfg.paramSets."),
     *  2. tx_crawler_configuration records delivered by the configuration
     *     repository for the page's root line. A record never overwrites a
     *     TSconfig set with the same name and is skipped when the backend user
     *     has no access to it or the page is in its exclude list.
     * Finally the "processUrls" hooks may alter the result by reference.
     *
     * @param int $pageId Page to resolve the configurations for
     * @return array Configurations keyed by name; each entry holds 'subCfg', 'paramParsed', 'paramExpanded', 'URLs' and 'origin'
     */
    public function getUrlsForPageId(int $pageId): array
    {
        // Get page TSconfig for page ID
        $pageTSconfig = $this->getPageTSconfigForId($pageId);

        $res = [];

        // Fetch Crawler Configuration from pageTSconfig
        $crawlerCfg = $pageTSconfig['tx_crawler.']['crawlerCfg.']['paramSets.'] ?? [];
        foreach ($crawlerCfg as $key => $values) {
            if (! is_array($values)) {
                continue;
            }
            $key = str_replace('.', '', (string) $key);

            // Sub configuration for a single configuration string:
            $subCfg = (array) ($crawlerCfg[$key . '.'] ?? $values);
            $subCfg['key'] = $key;

            // Normalise the filter list. The value is cast first because string
            // functions reject null when strict_types is enabled.
            $procInstrFilter = (string) ($subCfg['procInstrFilter'] ?? '');
            $subCfg['procInstrFilter'] = $procInstrFilter === ''
                ? ''
                : implode(',', GeneralUtility::trimExplode(',', $procInstrFilter));

            $pidsOnly = (string) ($subCfg['pidsOnly'] ?? '');
            $pidOnlyList = implode(',', GeneralUtility::trimExplode(',', $pidsOnly, true));

            // process configuration if it is not page-specific or if the specific page is the current page:
            // TODO: Check if $pidOnlyList can be kept as Array instead of imploded
            if ($pidsOnly !== '' && ! GeneralUtility::inList($pidOnlyList, (string) $pageId)) {
                continue;
            }

            // Explode, process etc.:
            $paramParsed = GeneralUtility::explodeUrl2Array((string) ($crawlerCfg[$key] ?? ''));
            $paramExpanded = $this->expandParameters($paramParsed, $pageId);

            // recognize MP value
            $baseQuery = '?id=' . $pageId;
            if ($this->MP) {
                $baseQuery .= '&MP=' . $this->MP;
            }

            $res[$key] = [
                'subCfg' => $subCfg,
                'paramParsed' => $paramParsed,
                'paramExpanded' => $paramExpanded,
                'origin' => 'pagets',
                'URLs' => $this->compileUrls($paramExpanded, [$baseQuery]),
            ];
        }

        // Get configuration from tx_crawler_configuration records up the rootline
        $crawlerConfigurations = $this->configurationRepository->getCrawlerConfigurationRecordsFromRootLine($pageId);
        foreach ($crawlerConfigurations as $configurationRecord) {
            // check access to the configuration record
            $hasAccess = empty($configurationRecord['begroups'])
                || $this->getBackendUser()->isAdmin()
                || $this->hasGroupAccess($this->getBackendUser()->user['usergroup_cached_list'], $configurationRecord['begroups']);
            if (! $hasAccess) {
                continue;
            }

            $pidsOnly = (string) ($configurationRecord['pidsonly'] ?? '');
            $pidOnlyList = implode(',', GeneralUtility::trimExplode(',', $pidsOnly, true));

            // process configuration if it is not page-specific or if the specific page is the current page:
            // TODO: Check if $pidOnlyList can be kept as Array instead of imploded
            if ($pidsOnly !== '' && ! GeneralUtility::inList($pidOnlyList, (string) $pageId)) {
                continue;
            }

            $key = $configurationRecord['name'];

            // don't overwrite previously defined paramSets
            if (isset($res[$key])) {
                continue;
            }

            /* @var $TSparserObject \TYPO3\CMS\Core\TypoScript\Parser\TypoScriptParser */
            $TSparserObject = GeneralUtility::makeInstance(TypoScriptParser::class);
            $TSparserObject->parse($configurationRecord['processing_instruction_parameters_ts']);

            $subCfg = [
                'procInstrFilter' => $configurationRecord['processing_instruction_filter'],
                'procInstrParams.' => $TSparserObject->setup,
                'baseUrl' => $configurationRecord['base_url'],
                'force_ssl' => (int) $configurationRecord['force_ssl'],
                'userGroups' => $configurationRecord['fegroups'],
                'exclude' => $configurationRecord['exclude'],
                'key' => $key,
            ];

            if (in_array($pageId, $this->expandExcludeString($subCfg['exclude']), true)) {
                continue;
            }

            $paramParsed = GeneralUtility::explodeUrl2Array($configurationRecord['configuration']);
            $paramExpanded = $this->expandParameters($paramParsed, $pageId);
            $res[$key] = [
                'subCfg' => $subCfg,
                'paramParsed' => $paramParsed,
                'paramExpanded' => $paramExpanded,
                'URLs' => $this->compileUrls($paramExpanded, ['?id=' . $pageId]),
                'origin' => 'tx_crawler_configuration_' . $configurationRecord['uid'],
            ];
        }

        foreach ($GLOBALS['TYPO3_CONF_VARS']['EXTCONF']['crawler']['processUrls'] ?? [] as $func) {
            $params = [
                'res' => &$res,
            ];
            GeneralUtility::callUserFunction($func, $params, $this);
        }

        return $res;
    }
619
620
    /**
     * Find all configurations of subpages of a page.
     *
     * The result combines the names of the page TSconfig paramSets of the root
     * page with the names of all tx_crawler_configuration records stored on
     * the root line of $rootid or on pages of the tree below it (down to
     * $depth levels) that the backend user may see.
     *
     * TODO: Write Functional Tests
     *
     * @param int $rootid Page the branch starts at
     * @param int $depth Number of tree levels to descend
     * @return array List of configuration names
     */
    public function getConfigurationsForBranch(int $rootid, int $depth): array
    {
        $names = [];

        $tsConfig = $this->getPageTSconfigForId($rootid);
        $paramSets = $tsConfig['tx_crawler.']['crawlerCfg.']['paramSets.'] ?? [];
        foreach ($paramSets as $setName => $setValue) {
            if (is_array($setValue)) {
                // Only a single trailing dot is stripped from the set name.
                $hasTrailingDot = substr($setName, -1) === '.';
                $names[] = $hasTrailingDot ? substr($setName, 0, -1) : $setName;
            }
        }

        // Pages above the root page ...
        $pageIds = [];
        foreach (BackendUtility::BEgetRootLine($rootid) as $rootLinePage) {
            $pageIds[] = $rootLinePage['uid'];
        }

        // ... and pages below it, limited to what the backend user may see.
        /* @var PageTreeView $pageTree */
        $pageTree = GeneralUtility::makeInstance(PageTreeView::class);
        $permsClause = $this->getBackendUser()->getPagePermsClause(Permission::PAGE_SHOW);
        if (empty($permsClause)) {
            $pageTree->init('');
        } else {
            $pageTree->init('AND ' . $permsClause);
        }
        $pageTree->getTree($rootid, $depth, '');
        foreach ($pageTree->tree as $treeEntry) {
            $pageIds[] = $treeEntry['row']['uid'];
        }

        $queryBuilder = $this->getQueryBuilder('tx_crawler_configuration');
        $pidConstraint = $queryBuilder->expr()->in(
            'pid',
            $queryBuilder->createNamedParameter($pageIds, Connection::PARAM_INT_ARRAY)
        );
        $statement = $queryBuilder
            ->select('name')
            ->from('tx_crawler_configuration')
            ->where($pidConstraint)
            ->execute();

        while ($record = $statement->fetch()) {
            $names[] = $record['name'];
        }

        return $names;
    }
663
664
    /**
     * Check if a user has access to an item
     * (e.g. get the group list of the current logged in user from $GLOBALS['TSFE']->gr_list)
     *
     * @param string $groupList Comma-separated list of (fe_)group UIDs from a user
     * @param string $accessList Comma-separated list of (fe_)group UIDs of the item to access
     * @return bool TRUE if at least one of the users group UIDs is in the access list or the access list is empty
     * @see \TYPO3\CMS\Frontend\Page\PageRepository::getMultipleGroupsWhereClause()
     */
    public function hasGroupAccess($groupList, $accessList)
    {
        // An item without access restrictions is open to everybody.
        if (empty($accessList)) {
            return true;
        }

        $userGroupUids = GeneralUtility::intExplode(',', $groupList);
        foreach ($userGroupUids as $userGroupUid) {
            if (! GeneralUtility::inList($accessList, $userGroupUid)) {
                continue;
            }
            return true;
        }

        return false;
    }
685
686
    /**
687
     * Will expand the parameters configuration to individual values. This follows a certain syntax of the value of each parameter.
688
     * Syntax of values:
689
     * - Basically: If the value is wrapped in [...] it will be expanded according to the following syntax, otherwise the value is taken literally
690
     * - Configuration is split by "|" and the parts are processed individually and finally added together
691
     * - For each configuration part:
692
     *         - "[int]-[int]" = Integer range, will be expanded to all values in between, values included, starting from low to high (max. 1000). Example "1-34" or "-40--30"
693
     *         - "_TABLE:[TCA table name];[_PID:[optional page id, default is current page]];[_ENABLELANG:1]" = Look up of table records from PID, filtering out deleted records. Example "_TABLE:tt_content; _PID:123"
694
     *        _ENABLELANG:1 picks only original records without their language overlays
695
     *         - Default: Literal value
696
     *
697
     * @param array $paramArray Array with key (GET var name) and values (value of GET var which is configuration for expansion)
698
     * @param integer $pid Current page ID
699
     * @return array
700
     *
701
     * TODO: Write Functional Tests
702
     */
703 9
    public function expandParameters($paramArray, $pid)
704
    {
705
        // Traverse parameter names:
706 9
        foreach ($paramArray as $p => $v) {
707 9
            $v = trim($v);
708
709
            // If value is encapsulated in square brackets it means there are some ranges of values to find, otherwise the value is literal
710 9
            if (strpos($v, '[') === 0 && substr($v, -1) === ']') {
711
                // So, find the value inside brackets and reset the paramArray value as an array.
712 9
                $v = substr($v, 1, -1);
713 9
                $paramArray[$p] = [];
714
715
                // Explode parts and traverse them:
716 9
                $parts = explode('|', $v);
717 9
                foreach ($parts as $pV) {
718
719
                    // Look for integer range: (fx. 1-34 or -40--30 // reads minus 40 to minus 30)
720 9
                    if (preg_match('/^(-?[0-9]+)\s*-\s*(-?[0-9]+)$/', trim($pV), $reg)) {
721 1
                        $reg = $this->swapIfFirstIsLargerThanSecond($reg);
722
723
                        // Traverse range, add values:
724 1
                        $runAwayBrake = 1000; // Limit to size of range!
725 1
                        for ($a = $reg[1]; $a <= $reg[2]; $a++) {
726 1
                            $paramArray[$p][] = $a;
727 1
                            $runAwayBrake--;
728 1
                            if ($runAwayBrake <= 0) {
729
                                break;
730
                            }
731
                        }
732 8
                    } elseif (strpos(trim($pV), '_TABLE:') === 0) {
733
734
                        // Parse parameters:
735 6
                        $subparts = GeneralUtility::trimExplode(';', $pV);
736 6
                        $subpartParams = [];
737 6
                        foreach ($subparts as $spV) {
738 6
                            [$pKey, $pVal] = GeneralUtility::trimExplode(':', $spV);
739 6
                            $subpartParams[$pKey] = $pVal;
740
                        }
741
742
                        // Table exists:
743 6
                        if (isset($GLOBALS['TCA'][$subpartParams['_TABLE']])) {
744 6
                            $lookUpPid = isset($subpartParams['_PID']) ? intval($subpartParams['_PID']) : intval($pid);
745 6
                            $recursiveDepth = isset($subpartParams['_RECURSIVE']) ? intval($subpartParams['_RECURSIVE']) : 0;
746 6
                            $pidField = isset($subpartParams['_PIDFIELD']) ? trim($subpartParams['_PIDFIELD']) : 'pid';
747 6
                            $where = $subpartParams['_WHERE'] ?? '';
748 6
                            $addTable = $subpartParams['_ADDTABLE'] ?? '';
749
750 6
                            $fieldName = $subpartParams['_FIELD'] ? $subpartParams['_FIELD'] : 'uid';
751 6
                            if ($fieldName === 'uid' || $GLOBALS['TCA'][$subpartParams['_TABLE']]['columns'][$fieldName]) {
752 6
                                $queryBuilder = $this->getQueryBuilder($subpartParams['_TABLE']);
753
754 6
                                if ($recursiveDepth > 0) {
755
                                    /** @var \TYPO3\CMS\Core\Database\QueryGenerator $queryGenerator */
756 2
                                    $queryGenerator = GeneralUtility::makeInstance(\TYPO3\CMS\Core\Database\QueryGenerator::class);
757 2
                                    $pidList = $queryGenerator->getTreeList($lookUpPid, $recursiveDepth, 0, 1);
758 2
                                    $pidArray = GeneralUtility::intExplode(',', $pidList);
759
                                } else {
760 4
                                    $pidArray = [(string) $lookUpPid];
761
                                }
762
763 6
                                $queryBuilder->getRestrictions()
764 6
                                    ->removeAll()
765 6
                                    ->add(GeneralUtility::makeInstance(DeletedRestriction::class));
766
767
                                $queryBuilder
768 6
                                    ->select($fieldName)
769 6
                                    ->from($subpartParams['_TABLE'])
770 6
                                    ->where(
771 6
                                        $queryBuilder->expr()->in($pidField, $queryBuilder->createNamedParameter($pidArray, Connection::PARAM_INT_ARRAY)),
772 6
                                        $where
773
                                    );
774
775 6
                                if (! empty($addTable)) {
776
                                    // TODO: Check if this works as intended!
777
                                    $queryBuilder->add('from', $addTable);
778
                                }
779 6
                                $transOrigPointerField = $GLOBALS['TCA'][$subpartParams['_TABLE']]['ctrl']['transOrigPointerField'];
780
781 6
                                if ($subpartParams['_ENABLELANG'] && $transOrigPointerField) {
782
                                    $queryBuilder->andWhere(
783
                                        $queryBuilder->expr()->lte(
784
                                            $transOrigPointerField,
785
                                            0
786
                                        )
787
                                    );
788
                                }
789
790 6
                                $statement = $queryBuilder->execute();
791
792 6
                                $rows = [];
793 6
                                while ($row = $statement->fetch()) {
794 6
                                    $rows[$row[$fieldName]] = $row;
795
                                }
796
797 6
                                if (is_array($rows)) {
798 6
                                    $paramArray[$p] = array_merge($paramArray[$p], array_keys($rows));
799
                                }
800
                            }
801
                        }
802
                    } else { // Just add value:
803 2
                        $paramArray[$p][] = $pV;
804
                    }
805
                    // Hook for processing own expandParameters place holder
806 9
                    if (is_array($GLOBALS['TYPO3_CONF_VARS']['SC_OPTIONS']['crawler/class.tx_crawler_lib.php']['expandParameters'])) {
807
                        $_params = [
808
                            'pObj' => &$this,
809
                            'paramArray' => &$paramArray,
810
                            'currentKey' => $p,
811
                            'currentValue' => $pV,
812
                            'pid' => $pid,
813
                        ];
814
                        foreach ($GLOBALS['TYPO3_CONF_VARS']['SC_OPTIONS']['crawler/class.tx_crawler_lib.php']['expandParameters'] as $_funcRef) {
815
                            GeneralUtility::callUserFunction($_funcRef, $_params, $this);
816
                        }
817
                    }
818
                }
819
820
                // Make unique set of values and sort array by key:
821 9
                $paramArray[$p] = array_unique($paramArray[$p]);
822 9
                ksort($paramArray);
823
            } else {
824
                // Set the literal value as only value in array:
825 2
                $paramArray[$p] = [$v];
826
            }
827
        }
828
829 9
        return $paramArray;
830
    }
831
832
    /**
     * Compiling URLs from parameter array (output of expandParameters())
     * The number of URLs will be the multiplication of the number of parameter values for each key,
     * capped per recursion step at $this->maximumUrlsToCompile entries.
     *
     * Each call consumes the first parameter of $paramArray, appends "&name=value" for every
     * value of that parameter to every URL collected so far (empty values append nothing) and
     * recurses with the remaining parameters.
     *
     * @param array $paramArray Output of expandParameters(): Array with keys (GET var names) and for each an array of values
     * @param array $urls URLs accumulated in this array (for recursion)
     * @return array
     */
    public function compileUrls($paramArray, array $urls)
    {
        if (empty($paramArray)) {
            return $urls;
        }

        // shift first off stack:
        reset($paramArray);
        // Cast: numeric GET var names arrive as int keys, which rawurlencode() rejects under strict_types
        $varName = (string) key($paramArray);
        $valueSet = array_shift($paramArray);

        // Traverse value set:
        $newUrls = [];
        foreach ($urls as $url) {
            foreach ($valueSet as $val) {
                // Leave BOTH loops once the cap is reached; breaking only the inner loop
                // would still add one more URL for every remaining base URL.
                if (count($newUrls) >= $this->maximumUrlsToCompile) {
                    break 2;
                }

                $value = (string) $val;
                $newUrls[] = $url . ($value !== '' ? '&' . rawurlencode($varName) . '=' . rawurlencode($value) : '');
            }
        }

        return $this->compileUrls($paramArray, $newUrls);
    }
863
864
    /************************************
865
     *
866
     * Crawler log
867
     *
868
     ************************************/
869
870
    /**
     * Returns the crawler queue records of a page, or flushes queue entries instead.
     *
     * Note that a flush does not restrict the deletion to the given page ID: only the
     * "pending"/"finished" filter (or nothing at all for a full flush) limits what is removed.
     *
     * @param integer $id Page ID for which to look up log entries.
     * @param string $filter Filter: "all" => all entries, "pending" => all that is not yet run, "finished" => all complete ones
     * @param boolean $doFlush If TRUE, the matching entries are DELETED(!) instead of selected and an empty array is returned
     * @param boolean $doFullFlush If TRUE (together with $doFlush), the flush ignores the filter
     * @param integer $itemsPerPage Limit the amount of entries per page, default is 10; values <= 0 disable the limit
     * @return array
     */
    public function getLogEntriesForPageId($id, $filter = '', $doFlush = false, $doFullFlush = false, $itemsPerPage = 10)
    {
        $selectQuery = GeneralUtility::makeInstance(ConnectionPool::class)->getQueryBuilderForTable($this->tableName);
        $selectQuery->select('*');
        $selectQuery->from($this->tableName);
        $selectQuery->where(
            $selectQuery->expr()->eq('page_id', $selectQuery->createNamedParameter($id, \PDO::PARAM_INT))
        );
        $selectQuery->orderBy('scheduled', 'DESC');

        $expressions = GeneralUtility::makeInstance(ConnectionPool::class)
            ->getConnectionForTable($this->tableName)
            ->getExpressionBuilder();
        $flushConstraints = $expressions->andX();

        // The flush condition always starts with "1=1" so that a filter can simply be appended with AND.
        // Loose comparison on purpose: it mirrors the semantics of a switch statement on $filter.
        $flushWhere = '1=1';
        if ($filter == 'pending') {
            $selectQuery->andWhere($selectQuery->expr()->eq('exec_time', 0));
            $flushWhere = '1=1 AND ' . $flushConstraints->add($expressions->eq('exec_time', 0));
        } elseif ($filter == 'finished') {
            $selectQuery->andWhere($selectQuery->expr()->gt('exec_time', 0));
            $flushWhere = '1=1 AND ' . $flushConstraints->add($expressions->gt('exec_time', 0));
        }

        // FIXME: Write unit test that ensures that the right records are deleted.
        if ($doFlush) {
            // We do currently ignore PageId by flush.
            // To have pending and finished parameters accepted
            // 2020.04.11 - Tomas Mikkelsen
            $this->queueRepository->flushQueue($doFullFlush ? '1=1' : $flushWhere);
            return [];
        }

        if ($itemsPerPage > 0) {
            $selectQuery->setMaxResults((int) $itemsPerPage);
        }

        return $selectQuery->execute()->fetchAll();
    }
929
930
    /**
     * Returns the crawler queue records of a set, or flushes queue entries instead.
     *
     * @param int $set_id Set ID for which to look up log entries.
     * @param string $filter Filter: "all" => all entries, "pending" => all that is not yet run, "finished" => all complete ones
     * @param bool $doFlush If TRUE, the matching entries are DELETED(!) instead of selected and an empty array is returned
     * @param bool $doFullFlush If TRUE (together with $doFlush), an empty condition is handed to the flush instead of the set/filter condition
     * @param int $itemsPerPage Limit the amount of entries per page, default is 10; values <= 0 disable the limit
     * @return array
     */
    public function getLogEntriesForSetId(int $set_id, string $filter = '', bool $doFlush = false, bool $doFullFlush = false, int $itemsPerPage = 10)
    {
        $queryBuilder = GeneralUtility::makeInstance(ConnectionPool::class)->getQueryBuilderForTable($this->tableName);
        $queryBuilder
            ->select('*')
            ->from($this->tableName)
            ->where(
                $queryBuilder->expr()->eq('set_id', $queryBuilder->createNamedParameter($set_id, \PDO::PARAM_INT))
            )
            ->orderBy('scheduled', 'DESC');

        $expressionBuilder = GeneralUtility::makeInstance(ConnectionPool::class)
            ->getConnectionForTable($this->tableName)
            ->getExpressionBuilder();

        // Collects the AND-combined constraints that are only used for flushing
        $flushConstraints = $expressionBuilder->andX();

        // FIXME: Write Unit tests for Filters
        switch ($filter) {
            case 'pending':
                $queryBuilder->andWhere($queryBuilder->expr()->eq('exec_time', 0));
                $flushConstraints = $flushConstraints->add($expressionBuilder->eq('exec_time', 0));
                break;
            case 'finished':
                $queryBuilder->andWhere($queryBuilder->expr()->gt('exec_time', 0));
                $flushConstraints = $flushConstraints->add($expressionBuilder->gt('exec_time', 0));
                break;
        }

        // FIXME: Write unit test that ensures that the right records are deleted.
        if ($doFlush) {
            $flushConstraints = $flushConstraints->add($expressionBuilder->eq('set_id', $set_id));
            // Hand over a plain SQL string, as getLogEntriesForPageId() does, not the expression object
            $this->queueRepository->flushQueue($doFullFlush ? '' : (string) $flushConstraints);
            return [];
        }

        if ($itemsPerPage > 0) {
            $queryBuilder->setMaxResults($itemsPerPage);
        }

        return $queryBuilder->execute()->fetchAll();
    }
982
983
    /**
     * Inserts a call back entry into the crawler queue (typically called from hooks,
     * see indexed search class "class.crawler.php").
     *
     * The call back reference is stored under the "_CALLBACKOBJ" key of the serialized parameters.
     *
     * @param integer $setId Set ID
     * @param array $params Parameters to pass to call back function; anything that is not an array is replaced by an empty array
     * @param string $callBack Call back object reference, eg. 'EXT:indexed_search/class.crawler.php:&tx_indexedsearch_crawler'
     * @param integer $page_id Page ID to attach it to
     * @param integer $schedule Time at which to activate; 0 means "now" (current time)
     */
    public function addQueueEntry_callBack($setId, $params, $callBack, $page_id = 0, $schedule = 0): void
    {
        $callBackParameters = is_array($params) ? $params : [];
        $callBackParameters['_CALLBACKOBJ'] = $callBack;

        $connection = GeneralUtility::makeInstance(ConnectionPool::class)->getConnectionForTable('tx_crawler_queue');

        // An empty schedule falls back to the current time
        $scheduled = (int) $schedule;
        if (! $scheduled) {
            $scheduled = $this->getCurrentTime();
        }

        $queueRecord = [
            'page_id' => (int) $page_id,
            'parameters' => serialize($callBackParameters),
            'scheduled' => $scheduled,
            'exec_time' => 0,
            'set_id' => (int) $setId,
            'result_data' => '',
        ];

        $connection->insert('tx_crawler_queue', $queueRecord);
    }
1012
1013
    /************************************
1014
     *
1015
     * URL setting
1016
     *
1017
     ************************************/
1018
1019
    /**
     * Setting a URL for crawling:
     *
     * Builds the queue record for the URL and either registers it only in $this->queueEntries
     * (when $this->registerQueueEntriesInternallyOnly is set) or inserts it into tx_crawler_queue,
     * unless the duplication check finds an equal entry. An event is posted for both outcomes
     * of the database path ("urlAddedToQueue" / "duplicateUrlInQueue").
     *
     * @param integer $id Page ID
     * @param string $url Complete URL
     * @param array $subCfg Sub configuration array (from TS config); the keys "userGroups", "procInstrFilter", "procInstrParams." and "key" are read and may be absent
     * @param integer $tstamp Scheduled-time
     * @param string $configurationHash (optional) configuration hash
     * @param bool $skipInnerDuplicationCheck (optional) skip inner duplication check
     * @return bool TRUE only if a new row was inserted into the queue table
     */
    public function addUrl(
        $id,
        $url,
        array $subCfg,
        $tstamp,
        $configurationHash = '',
        $skipInnerDuplicationCheck = false
    ) {
        // Creating parameters:
        $parameters = [
            'url' => $url,
        ];

        // fe user group simulation (the configuration key is optional):
        $uGs = implode(',', array_unique(GeneralUtility::intExplode(',', (string) ($subCfg['userGroups'] ?? ''), true)));
        if ($uGs) {
            $parameters['feUserGroupList'] = $uGs;
        }

        // Setting processing instructions
        $parameters['procInstructions'] = GeneralUtility::trimExplode(',', (string) ($subCfg['procInstrFilter'] ?? ''));
        $procInstrParams = $subCfg['procInstrParams.'] ?? null;
        if (is_array($procInstrParams)) {
            $parameters['procInstrParams'] = $procInstrParams;
        }

        // Compile value array:
        $parameters_serialized = serialize($parameters);
        $fieldArray = [
            'page_id' => (int) $id,
            'parameters' => $parameters_serialized,
            'parameters_hash' => GeneralUtility::shortMD5($parameters_serialized),
            'configuration_hash' => $configurationHash,
            'scheduled' => $tstamp,
            'exec_time' => 0,
            'set_id' => (int) $this->setID,
            'result_data' => '',
            'configuration' => $subCfg['key'] ?? null,
        ];

        if ($this->registerQueueEntriesInternallyOnly) {
            // the entries will only be registered and not stored to the database
            $this->queueEntries[] = $fieldArray;
            return false;
        }

        $rows = [];
        if (! $skipInnerDuplicationCheck) {
            // check if there is already an equal entry
            $rows = $this->getDuplicateRowsIfExist($tstamp, $fieldArray);
        }

        if (! empty($rows)) {
            EventDispatcher::getInstance()->post('duplicateUrlInQueue', strval($this->setID), ['rows' => $rows, 'fieldArray' => $fieldArray]);
            return false;
        }

        $connectionForCrawlerQueue = GeneralUtility::makeInstance(ConnectionPool::class)->getConnectionForTable('tx_crawler_queue');
        $connectionForCrawlerQueue->insert(
            'tx_crawler_queue',
            $fieldArray
        );
        $uid = $connectionForCrawlerQueue->lastInsertId('tx_crawler_queue', 'qid');
        EventDispatcher::getInstance()->post('urlAddedToQueue', strval($this->setID), ['uid' => $uid, 'fieldArray' => $fieldArray]);

        return true;
    }
1098
1099
    /**
     * Provides the current system time as a Unix timestamp.
     *
     * @return int
     */
    public function getCurrentTime()
    {
        $now = time();

        return $now;
    }
1108
1109
    /************************************
1110
     *
1111
     * URL reading
1112
     *
1113
     ************************************/
1114
1115
    /**
     * Read URL for single queue entry
     *
     * Loads the queue record, marks it as being executed (exec_time), runs it through the
     * queue executor and stores the serialized result in the record. Signals are emitted
     * before processing and before the result is written.
     *
     * @param integer $queueId
     * @param boolean $force If set, will process even if exec_time has been set!
     * @return int|null Bit mask (CLI_STATUS_POLLABLE_PROCESSED is set when a registered pollable instruction reported success); NULL if no matching queue record was found
     */
    public function readUrl($queueId, $force = false)
    {
        $queryBuilder = GeneralUtility::makeInstance(ConnectionPool::class)->getQueryBuilderForTable($this->tableName);
        $ret = 0;
        $this->logger->debug('crawler-readurl start ' . microtime(true));
        // Get entry:
        $queryBuilder
            ->select('*')
            ->from('tx_crawler_queue')
            ->where(
                $queryBuilder->expr()->eq('qid', $queryBuilder->createNamedParameter($queueId, \PDO::PARAM_INT))
            );
        if (! $force) {
            // Without force only entries that were not executed yet but are assigned to a process are picked
            $queryBuilder
                ->andWhere('exec_time = 0')
                ->andWhere('process_scheduled > 0');
        }
        $queueRec = $queryBuilder->execute()->fetch();

        if (! is_array($queueRec)) {
            return;
        }

        SignalSlotUtility::emitSignal(
            self::class,
            SignalSlotUtility::SIGNAL_QUEUEITEM_PREPROCESS,
            [$queueId, &$queueRec]
        );

        // Set exec_time to lock record:
        $field_array = ['exec_time' => $this->getCurrentTime()];

        if (isset($this->processID)) {
            // if multiprocessing is used we need to store the id of the process which has handled this entry
            $field_array['process_id_completed'] = $this->processID;
        }

        GeneralUtility::makeInstance(ConnectionPool::class)->getConnectionForTable('tx_crawler_queue')
            ->update(
                'tx_crawler_queue',
                $field_array,
                ['qid' => (int) $queueId]
            );

        $result = $this->queueExecutor->executeQueueItem($queueRec, $this);

        // NOTE(review): the content is unserialized as delivered by the queue executor;
        // confirm that it can never carry data from an untrusted source.
        $content = $result['content'] ?? null;
        $resultData = $content === null ? 'An errors happened' : unserialize($content);

        // The result data is only inspected when it really is an array: on errors it is a
        // plain string (or FALSE after a failed unserialize), which must not be indexed like an array.
        $procInstructions = is_array($resultData) ? ($resultData['parameters']['procInstructions'] ?? null) : null;

        // atm there's no need to point to specific pollable extensions
        $pollables = $GLOBALS['TYPO3_CONF_VARS']['EXTCONF']['crawler']['pollSuccess'] ?? null;
        if (is_array($pollables) && is_array($procInstructions)) {
            foreach ($pollables as $pollable) {
                // only check the success value if the instruction is running
                // it is important to name the pollSuccess key same as the procInstructions key
                if (in_array($pollable, $procInstructions, true) && ! empty($resultData['success'][$pollable])) {
                    $ret |= self::CLI_STATUS_POLLABLE_PROCESSED;
                }
            }
        }

        // Set result in log which also denotes the end of the processing of this entry.
        $field_array = ['result_data' => serialize($result)];

        SignalSlotUtility::emitSignal(
            self::class,
            SignalSlotUtility::SIGNAL_QUEUEITEM_POSTPROCESS,
            [$queueId, &$field_array]
        );

        GeneralUtility::makeInstance(ConnectionPool::class)->getConnectionForTable('tx_crawler_queue')
            ->update(
                'tx_crawler_queue',
                $field_array,
                ['qid' => (int) $queueId]
            );

        $this->logger->debug('crawler-readurl stop ' . microtime(true));
        return $ret;
    }
1210
1211
    /**
     * Reads the URL of a log entry that has not been inserted yet.
     *
     * The entry is inserted with exec_time set to the current time, executed through the
     * queue executor and finally updated with the serialized result.
     *
     * @param array $field_array Queue field array
     *
     * @return array|bool|string The result as delivered by the queue executor
     */
    public function readUrlFromArray($field_array)
    {
        $connection = GeneralUtility::makeInstance(ConnectionPool::class)->getConnectionForTable($this->tableName);

        // Setting exec_time locks the record right from the start
        $field_array['exec_time'] = $this->getCurrentTime();
        $connection->insert($this->tableName, $field_array);

        $queueId = $connection->lastInsertId($this->tableName, 'qid');
        $field_array['qid'] = $queueId;

        $result = $this->queueExecutor->executeQueueItem($field_array, $this);

        // Writing the result into the log also denotes the end of the processing of this entry.
        $field_array = ['result_data' => serialize($result)];

        SignalSlotUtility::emitSignal(
            self::class,
            SignalSlotUtility::SIGNAL_QUEUEITEM_POSTPROCESS,
            [$queueId, &$field_array]
        );

        $connection->update($this->tableName, $field_array, ['qid' => $queueId]);

        return $result;
    }
1248
1249
    /*****************************
1250
     *
1251
     * Compiling URLs to crawl - tools
1252
     *
1253
     *****************************/
1254
1255
    /**
     * Traverses the page tree below (and including) the given page and compiles the URL rows for every page.
     *
     * The arguments are stored on the controller first, because drawURLs_addRowsForPage() reads
     * them from there. For mount point pages the mounted tree is traversed as well.
     *
     * @param integer $id Root page id to start from.
     * @param integer $depth Depth of tree, 0=only id-page, 1= on sublevel, 99 = infinite
     * @param integer $scheduledTime Unix Time when the URL is timed to be visited when put in queue
     * @param integer $reqMinute Number of requests per minute (creates the interleave between requests)
     * @param boolean $submitCrawlUrls If set, submits the URLs to queue in database (real crawling)
     * @param boolean $downloadCrawlUrls If set (and submitcrawlUrls is false) will fill $downloadUrls with entries)
     * @param array $incomingProcInstructions Array of processing instructions
     * @param array $configurationSelection Array of configuration keys
     * @return string Concatenated output of drawURLs_addRowsForPage() for all traversed pages
     */
    public function getPageTreeAndUrls(
        $id,
        $depth,
        $scheduledTime,
        $reqMinute,
        $submitCrawlUrls,
        $downloadCrawlUrls,
        array $incomingProcInstructions,
        array $configurationSelection
    ) {
        $this->scheduledTime = $scheduledTime;
        $this->reqMinute = $reqMinute;
        $this->submitCrawlUrls = $submitCrawlUrls;
        $this->downloadCrawlUrls = $downloadCrawlUrls;
        $this->incomingProcInstructions = $incomingProcInstructions;
        $this->incomingConfigurationSelection = $configurationSelection;

        $this->duplicateTrack = [];
        $this->downloadUrls = [];

        // Drawing tree:
        /** @var PageTreeView $tree */
        $tree = GeneralUtility::makeInstance(PageTreeView::class);
        $perms_clause = $this->getBackendUser()->getPagePermsClause(Permission::PAGE_SHOW);
        $tree->init('AND ' . $perms_clause);

        $pageInfo = BackendUtility::readPageAccess($id, $perms_clause);
        if (is_array($pageInfo)) {
            // Set root row:
            $tree->tree[] = [
                'row' => $pageInfo,
                'HTML' => $this->iconFactory->getIconForRecord('pages', $pageInfo, Icon::SIZE_SMALL),
            ];
        }

        // Get branch beneath:
        if ($depth) {
            $tree->getTree($id, $depth, '');
        }

        // Traverse page tree:
        $code = '';

        foreach ($tree->tree as $data) {
            $this->MP = false;

            // recognize mount points; the cast is required because database rows may deliver
            // doktype as a numeric string, which never strictly equals the integer constant
            if ((int) $data['row']['doktype'] === PageRepository::DOKTYPE_MOUNTPOINT) {
                $queryBuilder = GeneralUtility::makeInstance(ConnectionPool::class)->getQueryBuilderForTable('pages');
                $queryBuilder->getRestrictions()->removeAll()->add(GeneralUtility::makeInstance(DeletedRestriction::class));
                $mountpage = $queryBuilder
                    ->select('*')
                    ->from('pages')
                    ->where(
                        $queryBuilder->expr()->eq('uid', $queryBuilder->createNamedParameter($data['row']['uid'], \PDO::PARAM_INT))
                    )
                    ->execute()
                    ->fetchAll();
                $queryBuilder->resetRestrictions();

                // fetch mounted pages; MP has the form "<mount_pid>-<uid of the mount point page>"
                $this->MP = $mountpage[0]['mount_pid'] . '-' . $data['row']['uid'];

                $mountTree = GeneralUtility::makeInstance(PageTreeView::class);
                $mountTree->init('AND ' . $perms_clause);
                $mountTree->getTree($mountpage[0]['mount_pid'], $depth);

                foreach ($mountTree->tree as $mountData) {
                    $code .= $this->drawURLs_addRowsForPage(
                        $mountData['row'],
                        $mountData['HTML'] . BackendUtility::getRecordTitle('pages', $mountData['row'], true)
                    );
                }

                // replace page when mount_pid_ol is enabled
                if ($mountpage[0]['mount_pid_ol']) {
                    $data['row']['uid'] = $mountpage[0]['mount_pid'];
                } else {
                    // if the mount_pid_ol is not set the MP must not be used for the mountpoint page
                    $this->MP = false;
                }
            }

            $code .= $this->drawURLs_addRowsForPage(
                $data['row'],
                $data['HTML'] . BackendUtility::getRecordTitle('pages', $data['row'], true)
            );
        }

        return $code;
    }
1357
1358
    /**
     * Expands exclude string
     *
     * The string is a comma separated list of parts in the form "<pid>" (page only),
     * "<pid>+<depth>" (page plus sub pages down to the given depth) or "<pid>+"
     * (page plus sub pages, depth 99). Results are cached per exclude string, and the
     * fetched trees per pid/depth, in static variables for the lifetime of the request.
     *
     * @param string $excludeString Exclude string
     * @return array List of page ids covered by the exclude string
     */
    public function expandExcludeString($excludeString)
    {
        // internal static caches;
        static $expandedExcludeStringCache;
        static $treeCache;

        if (empty($expandedExcludeStringCache[$excludeString])) {
            $pidList = [];

            if (! empty($excludeString)) {
                /** @var PageTreeView $tree */
                $tree = GeneralUtility::makeInstance(PageTreeView::class);
                $tree->init('AND ' . $this->getBackendUser()->getPagePermsClause(Permission::PAGE_SHOW));

                $excludeParts = GeneralUtility::trimExplode(',', $excludeString);

                foreach ($excludeParts as $excludePart) {
                    // Pad so that a part without "+" does not trigger an undefined offset
                    [$pid, $depth] = array_pad(GeneralUtility::trimExplode('+', $excludePart), 2, '');

                    // default is "page only" = "depth=0"; a trailing "+" without depth means "all levels"
                    if (empty($depth)) {
                        $depth = strpos($excludePart, '+') !== false ? 99 : 0;
                    }
                    // Work with an integer so the comparison below does not depend on
                    // how PHP compares non-numeric strings with numbers
                    $depth = (int) $depth;

                    $pidList[] = $pid;

                    if ($depth > 0) {
                        if (empty($treeCache[$pid][$depth])) {
                            $tree->reset();
                            $tree->getTree($pid, $depth);
                            $treeCache[$pid][$depth] = $tree->tree;
                        }

                        foreach ($treeCache[$pid][$depth] as $data) {
                            $pidList[] = $data['row']['uid'];
                        }
                    }
                }
            }

            $expandedExcludeStringCache[$excludeString] = array_unique($pidList);
        }

        return $expandedExcludeStringCache[$excludeString];
    }
1409
1410
    /**
     * Create the rows for display of the page tree
     * For each page a number of rows are shown displaying GET variable configuration
     *
     * One table row is produced per crawler configuration that applies to the page. When no
     * configuration applies, a single "No entries" row is produced instead. Pages listed in a
     * configuration's "exclude" setting get a short "excluded" row and no URLs are generated
     * for them.
     *
     * @param array $pageRow page record; its "uid" is read here
     * @param string $pageTitle markup for the title cell; inserted as-is, so it must already be safe HTML
     * @return string the generated <tr> rows
     */
    public function drawURLs_addRowsForPage(array $pageRow, string $pageTitle): string
    {
        $skipMessage = '';

        // Get list of configurations
        // NOTE(review): $skipMessage is presumably filled by reference with the reason a page is skipped - confirm
        $configurations = $this->getUrlsForPageRow($pageRow, $skipMessage);

        if (! empty($this->incomingConfigurationSelection)) {
            // remove configuration that does not match the current selection
            foreach ($configurations as $confKey => $confArray) {
                // PHP turns numeric array keys into integers; compare as string so strict matching still works
                if (! in_array((string) $confKey, $this->incomingConfigurationSelection, true)) {
                    unset($configurations[$confKey]);
                }
            }
        }

        if (empty($configurations)) {
            $message = ! empty($skipMessage) ? ' (' . $skipMessage . ')' : '';

            // Compile row:
            return '
                <tr>
                    <td>' . $pageTitle . '</td>
                    <td colspan="6"><em>No entries</em>' . $message . '</td>
                </tr>';
        }

        $pageUid = (int) $pageRow['uid'];
        $configurationCount = count($configurations);
        $isFirstRow = true;
        $content = '';

        // Traverse parameter combinations:
        foreach ($configurations as $confKey => $confArray) {
            $confKey = (string) $confKey;
            $subCfg = $confArray['subCfg'] ?? [];

            // Title column: only the first row carries the title cell, spanning all rows of the page
            $titleClm = $isFirstRow
                ? '<td rowspan="' . $configurationCount . '">' . $pageTitle . '</td>'
                : '';
            $isFirstRow = false;

            // Normalise both sides to int so the strict comparison cannot fail on "12" vs 12
            $excludedPageUids = array_map('intval', $this->expandExcludeString($subCfg['exclude'] ?? ''));
            if (in_array($pageUid, $excludedPageUids, true)) {
                $content .= '<tr>
                            ' . $titleClm . '
                            <td>' . htmlspecialchars($confKey) . '</td>
                            <td colspan="5"><em>No entries</em> (Page is excluded in this configuration)</td>
                        </tr>';
                continue;
            }

            // URL list:
            $urlList = $this->urlListFromUrlArray(
                $confArray,
                $pageRow,
                $this->scheduledTime,
                $this->reqMinute,
                $this->submitCrawlUrls,
                $this->downloadCrawlUrls,
                $this->duplicateTrack,
                $this->downloadUrls,
                $this->incomingProcInstructions // if empty the urls won't be filtered by processing instructions
            );

            // Expanded parameters:
            $paramExpanded = '';
            $calcAccu = [];
            $calcRes = 1;
            foreach ($confArray['paramExpanded'] ?? [] as $gVar => $gVal) {
                $valueCount = count($gVal);
                $paramExpanded .= '
                            <tr>
                                <td>' . htmlspecialchars('&' . $gVar . '=') . '<br/>' .
                            '(' . $valueCount . ')' .
                            '</td>
                                <td nowrap="nowrap">' . nl2br(htmlspecialchars(implode(chr(10), $gVal))) . '</td>
                            </tr>
                        ';
                $calcRes *= $valueCount;
                $calcAccu[] = $valueCount;
            }
            $paramExpanded = '<table>' . $paramExpanded . '</table>';
            $paramExpanded .= 'Comb: ' . implode('*', $calcAccu) . '=' . $calcRes;

            // Options (values come from configuration records, so escape them for HTML output)
            $optionValues = '';
            if (! empty($subCfg['userGroups'])) {
                $optionValues .= 'User Groups: ' . htmlspecialchars((string) $subCfg['userGroups']) . '<br/>';
            }
            if (! empty($subCfg['procInstrFilter'])) {
                $optionValues .= 'ProcInstr: ' . htmlspecialchars((string) $subCfg['procInstrFilter']) . '<br/>';
            }

            // Compile row:
            $content .= '
                        <tr>
                            ' . $titleClm . '
                            <td>' . htmlspecialchars($confKey) . '</td>
                            <td>' . nl2br(htmlspecialchars(rawurldecode(trim(str_replace('&', chr(10) . '&', GeneralUtility::implodeArrayForUrl('', $confArray['paramParsed'] ?? [])))))) . '</td>
                            <td>' . $paramExpanded . '</td>
                            <td nowrap="nowrap">' . $urlList . '</td>
                            <td nowrap="nowrap">' . $optionValues . '</td>
                            <td nowrap="nowrap">' . DebugUtility::viewArray($subCfg['procInstrParams.'] ?? null) . '</td>
                        </tr>';
        }

        return $content;
    }
1520
1521
    /*****************************
1522
     *
1523
     * CLI functions
1524
     *
1525
     *****************************/
1526
1527
    /**
     * Running the functionality of the CLI (crawling URLs from queue)
     *
     * Runs the CLI hooks, cleans up the queue, fetches up to $countInARun queue entries, assigns
     * them to the current process id and reads each URL in turn.
     *
     * @param int $countInARun maximum number of queue entries to fetch for this run
     * @param int $sleepTime value passed to usleep() after each processed entry
     * @param int $sleepAfterFinish value passed to sleep() once the loop has ended
     * @return int bitmask built from the readUrl() results and the self::CLI_STATUS_* flags
     */
    public function CLI_run(int $countInARun, int $sleepTime, int $sleepAfterFinish): int
    {
        $result = 0;
        $processedCount = 0;

        // Hooks first, then queue clean up
        $this->CLI_runHooks();
        $this->queueRepository->cleanupQueue();

        $queueRows = $this->queueRepository->fetchRecordsToBeCrawled($countInARun);

        if (empty($queueRows)) {
            $this->CLI_debug('Nothing within queue which needs to be processed (' . $this->CLI_buildProcessId() . ')');

            return $result;
        }

        $queueIds = [];
        foreach ($queueRows as $queueRow) {
            $queueIds[] = $queueRow['qid'];
        }

        $processId = $this->CLI_buildProcessId();

        // Remember how many entries were assigned, to determine later how many have been processed
        $assignedCount = $this->queueRepository->updateProcessIdAndSchedulerForQueueIds($queueIds, $processId);
        $this->processRepository->updateProcessAssignItemsCount($assignedCount, $processId);

        // Fewer assigned rows than requested means another process grabbed some of them
        if ($assignedCount !== count($queueIds)) {
            $this->CLI_debug('Nothing processed due to multi-process collision (' . $this->CLI_buildProcessId() . ')');

            return $result | self::CLI_STATUS_ABORTED;
        }

        foreach ($queueRows as $queueRow) {
            $result |= $this->readUrl($queueRow['qid']);
            $processedCount++;

            // Just to relax the system
            usleep($sleepTime);

            // The CLI was disabled while this run was in progress: leave right away,
            // the process is NOT marked as ended.
            if ($this->getDisabled()) {
                return $result | self::CLI_STATUS_ABORTED;
            }

            if ($this->processRepository->isProcessActive($this->CLI_buildProcessId())) {
                continue;
            }

            // The process is no longer active: possible timeout
            $this->CLI_debug('conflict / timeout (' . $this->CLI_buildProcessId() . ')');

            //TODO might need an additional returncode
            $result |= self::CLI_STATUS_ABORTED;
            break;
        }

        sleep($sleepAfterFinish);

        $this->CLI_debug('Rows: ' . $processedCount . ' (' . $this->CLI_buildProcessId() . ')');

        if ($processedCount > 0) {
            $result |= self::CLI_STATUS_PROCESSED;
        }

        return $result;
    }
1597
1598
    /**
     * Activate hooks
     *
     * Instantiates every class registered under
     * $GLOBALS['TYPO3_CONF_VARS']['EXTCONF']['crawler']['cli_hooks'] and calls its
     * crawler_init() method with this controller.
     */
    public function CLI_runHooks(): void
    {
        $registeredHooks = $GLOBALS['TYPO3_CONF_VARS']['EXTCONF']['crawler']['cli_hooks'] ?? [];

        foreach ($registeredHooks as $hookClassName) {
            $hook = GeneralUtility::makeInstance($hookClassName);
            if (! is_object($hook)) {
                continue;
            }

            $hook->crawler_init($this);
        }
    }
1610
1611
    /**
     * Try to acquire a new process with the given id
     * also performs some auto-cleanup for orphan processes
     *
     * Active, non-deleted process records whose ttl lies before the current time are treated as
     * orphans and handed to CLI_releaseProcesses(); the remaining ones count against the
     * "processLimit" extension setting.
     *
     * @param string $id identification string for the process
     * @return boolean true when a process record was inserted, false when no system process id
     *                 is available or the process limit is reached
     * @todo preemption might not be the most elegant way to clean up
     */
    public function CLI_checkAndAcquireNewProcess($id)
    {
        $queryBuilder = GeneralUtility::makeInstance(ConnectionPool::class)->getQueryBuilderForTable($this->tableName);

        $systemProcessId = getmypid();
        if ($systemProcessId < 1) {
            return false;
        }

        $activeProcessRows = $queryBuilder
            ->select('process_id', 'ttl')
            ->from('tx_crawler_process')
            ->where('active = 1 AND deleted = 0')
            ->execute();

        $currentTime = $this->getCurrentTime();

        // Split the active processes into expired (orphan) ones and those still running
        $runningCount = 0;
        $orphanProcessIds = [];
        while ($processRow = $activeProcessRows->fetch()) {
            if ($processRow['ttl'] < $currentTime) {
                $orphanProcessIds[] = $processRow['process_id'];
                continue;
            }
            $runningCount++;
        }

        $processLimit = (int) $this->extensionSettings['processLimit'];

        // if there are less than allowed active processes then add a new one
        $acquired = $runningCount < $processLimit;
        if ($acquired) {
            $this->CLI_debug('add process ' . $this->CLI_buildProcessId() . ' (' . ($runningCount + 1) . '/' . $processLimit . ')');

            $processConnection = GeneralUtility::makeInstance(ConnectionPool::class)->getConnectionForTable('tx_crawler_process');
            $processConnection->insert(
                'tx_crawler_process',
                [
                    'process_id' => $id,
                    'active' => 1,
                    'ttl' => $currentTime + (int) $this->extensionSettings['processMaxRunTime'],
                    'system_process_id' => $systemProcessId,
                ]
            );
        } else {
            $this->CLI_debug('Processlimit reached (' . ($runningCount) . '/' . $processLimit . ')');
        }

        $this->processRepository->deleteProcessesMarkedAsDeleted();
        $this->CLI_releaseProcesses($orphanProcessIds);

        return $acquired;
    }
1672
1673
    /**
     * Release a process and the required resources
     *
     * Besides releasing the given ids, this first resets queue entries that still point to an
     * inactive process and clears the system process id of inactive processes that still have
     * unexecuted queue entries.
     *
     * @param mixed $releaseIds string with a single process-id or array with multiple process-ids
     * @return boolean false when there was nothing to release, true otherwise
     */
    public function CLI_releaseProcesses($releaseIds)
    {
        if (! is_array($releaseIds)) {
            $releaseIds = [$releaseIds];
        }

        if (empty($releaseIds)) {
            return false;   //nothing to release
        }

        // some kind of 2nd chance algo - this way you need at least 2 processes to have a real cleanup
        // this ensures that a single process can't mess up the entire process table

        // Detach all queue entries that are still assigned to a process which is no longer active
        $queueQueryBuilder = $this->getQueryBuilder($this->tableName);
        $queueQueryBuilder
            ->update($this->tableName, 'q')
            ->where(
                'q.process_id IN(SELECT p.process_id FROM tx_crawler_process as p WHERE p.active = 0)'
            )
            ->set('q.process_scheduled', 0)
            ->set('q.process_id', '')
            ->execute();

        // Clear the system process id of inactive processes that still own unexecuted queue entries.
        // A separate query builder is used on purpose: reusing the one above would carry over its
        // bound parameters and query parts into this unrelated statement.
        $processQueryBuilder = $this->getQueryBuilder('tx_crawler_process');
        $processQueryBuilder
            ->update('tx_crawler_process')
            ->where(
                $processQueryBuilder->expr()->eq('active', 0),
                'process_id IN(SELECT q.process_id FROM tx_crawler_queue as q WHERE q.exec_time = 0)'
            )
            ->set('system_process_id', 0)
            ->execute();

        $this->processRepository->markRequestedProcessesAsNotActive($releaseIds);
        $this->queueRepository->unsetProcessScheduledAndProcessIdForQueueEntries($releaseIds);

        return true;
    }
1722
1723
    /**
     * Create a unique Id for the current process
     *
     * The id is derived once from the current microtime and then kept in $this->processID,
     * so repeated calls return the same value.
     *
     * @return string  the ID
     */
    public function CLI_buildProcessId()
    {
        if ($this->processID) {
            return $this->processID;
        }

        $this->processID = GeneralUtility::shortMD5(microtime(true));

        return $this->processID;
    }
1735
1736
    /**
     * Prints a message to the stdout (only if debug-mode is enabled)
     *
     * Debug mode is controlled by the "processDebug" extension setting.
     *
     * @param string $msg the message
     */
    public function CLI_debug($msg): void
    {
        $debugEnabled = (bool) (int) $this->extensionSettings['processDebug'];
        if (! $debugEnabled) {
            return;
        }

        echo $msg . "\n";
        flush();
    }
1748
1749
    /**
     * Cleans up entries that stayed for too long in the queue. These are:
     * - processed entries older than the "cleanUpProcessedAge" extension setting (in days)
     * - entries scheduled longer ago than the "cleanUpScheduledAge" extension setting (in days)
     *
     * Both settings may be fractional (e.g. 1.5). They are converted to whole seconds before
     * they are written into the SQL condition, so the condition only ever contains integers.
     */
    public function cleanUpOldQueueEntries(): void
    {
        $secondsPerDay = 86400; // 24*60*60 Seconds in 24 hours

        // Settings may arrive as strings; cast so arithmetic is well-defined and the SQL stays numeric
        $processedAgeInSeconds = (int) round((float) ($this->extensionSettings['cleanUpProcessedAge'] ?? 0) * $secondsPerDay);
        $scheduledAgeInSeconds = (int) round((float) ($this->extensionSettings['cleanUpScheduledAge'] ?? 0) * $secondsPerDay);

        $now = time();
        $condition = '(exec_time<>0 AND exec_time<' . ($now - $processedAgeInSeconds) . ') OR scheduled<=' . ($now - $scheduledAgeInSeconds);
        $this->queueRepository->flushQueue($condition);
    }
1763
1764
    /**
     * This method determines duplicates for a queue entry with the same parameters and this timestamp.
     * If the timestamp is in the past, it will check if there is any unprocessed queue entry in the past.
     * If the timestamp is in the future it will check, if the queued entry has exactly the same timestamp
     *
     * Only entries that have not been executed (exec_time = 0) and are not assigned to a process
     * (empty process_id) are considered. With the "enableTimeslot" extension setting, entries
     * scheduled within 100 seconds around the current time are matched as well.
     *
     * @param int $tstamp
     * @param array $fieldArray must contain the keys "page_id" and "parameters_hash"
     *
     * @return array list of qid values of the duplicate queue entries
     */
    protected function getDuplicateRowsIfExist($tstamp, $fieldArray)
    {
        $tstamp = (int) $tstamp;
        $currentTime = (int) $this->getCurrentTime();

        $queryBuilder = $this->getQueryBuilder($this->tableName);
        $queryBuilder
            ->select('qid')
            ->from('tx_crawler_queue');

        if ($tstamp <= $currentTime) {
            // this entry is scheduled with "now": any entry scheduled up to now is a candidate
            $scheduledUntilNow = $queryBuilder->expr()->lte(
                'scheduled',
                $queryBuilder->createNamedParameter($currentTime, \PDO::PARAM_INT)
            );

            if ($this->extensionSettings['enableTimeslot']) {
                $timeBegin = $queryBuilder->createNamedParameter($currentTime - 100, \PDO::PARAM_INT);
                $timeEnd = $queryBuilder->createNamedParameter($currentTime + 100, \PDO::PARAM_INT);
                $queryBuilder
                    ->where('scheduled BETWEEN ' . $timeBegin . ' AND ' . $timeEnd)
                    ->orWhere($scheduledUntilNow);
            } else {
                $queryBuilder->where($scheduledUntilNow);
            }
        } else {
            //entry with a timestamp in the future need to have the same schedule time
            $queryBuilder->where(
                $queryBuilder->expr()->eq('scheduled', $queryBuilder->createNamedParameter($tstamp, \PDO::PARAM_INT))
            );
        }

        // Explicit comparisons instead of "NOT <column>": negating the string column process_id
        // depends on how the database coerces the id to a number.
        $queryBuilder
            ->andWhere($queryBuilder->expr()->eq('exec_time', $queryBuilder->createNamedParameter(0, \PDO::PARAM_INT)))
            ->andWhere($queryBuilder->expr()->eq('process_id', $queryBuilder->createNamedParameter('', \PDO::PARAM_STR)))
            ->andWhere($queryBuilder->expr()->eq('page_id', $queryBuilder->createNamedParameter($fieldArray['page_id'], \PDO::PARAM_INT)))
            ->andWhere($queryBuilder->expr()->eq('parameters_hash', $queryBuilder->createNamedParameter($fieldArray['parameters_hash'], \PDO::PARAM_STR)));

        $statement = $queryBuilder->execute();

        $rows = [];
        while ($row = $statement->fetch()) {
            $rows[] = $row['qid'];
        }

        return $rows;
    }
1824
1825
    /**
     * Returns a md5 hash generated from a serialized configuration array.
     *
     * The "paramExpanded" and "URLs" entries are left out of the hash.
     *
     * @return string
     */
    protected function getConfigurationHash(array $configuration)
    {
        $keysLeftOut = ['paramExpanded', 'URLs'];
        foreach ($keysLeftOut as $key) {
            unset($configuration[$key]);
        }

        $serialized = serialize($configuration);

        return md5($serialized);
    }
1836
1837
    /**
     * Build a URL from a Page and the Query String. If the page has a Site configuration, it can be built by using
     * the Site instance.
     *
     * With a Site, the "id" query parameter is dropped, "L" is handed to the router as "_language",
     * and host, scheme, port and user info are taken from $alternativeBaseUrl when one is given.
     * Without a Site, an index.php URL including a cHash is assembled.
     *
     * @param int $pageId uid of the page the URL is built for
     * @param string $queryString query string of the URL
     * @param string|null $alternativeBaseUrl base URL that replaces the one of the site, if not empty
     * @param int $httpsOrHttp see tx_crawler_configuration.force_ssl; -1 forces http, 1 forces https,
     *                         any other value keeps the scheme
     * @throws \TYPO3\CMS\Core\Exception\SiteNotFoundException
     * @throws \TYPO3\CMS\Core\Routing\InvalidRouteArgumentsException
     */
    protected function getUrlFromPageAndQueryParameters(int $pageId, string $queryString, ?string $alternativeBaseUrl, int $httpsOrHttp): UriInterface
    {
        $site = GeneralUtility::makeInstance(SiteMatcher::class)->matchByPageId($pageId);
        if ($site instanceof Site) {
            $queryString = ltrim($queryString, '?&');
            $queryParts = [];
            parse_str($queryString, $queryParts);
            unset($queryParts['id']);
            // workaround as long as we don't have native language support in crawler configurations
            // NOTE(review): the SiteLanguage lookup that used to follow here was never used and was removed;
            // the router is expected to validate "_language" itself - confirm
            if (isset($queryParts['L'])) {
                $queryParts['_language'] = $queryParts['L'];
                unset($queryParts['L']);
            }
            $url = $site->getRouter()->generateUri($pageId, $queryParts);
            if (! empty($alternativeBaseUrl)) {
                $alternativeBaseUrl = new Uri($alternativeBaseUrl);
                $url = $url->withHost($alternativeBaseUrl->getHost());
                $url = $url->withScheme($alternativeBaseUrl->getScheme());
                $url = $url->withPort($alternativeBaseUrl->getPort());
                if ($userInfo = $alternativeBaseUrl->getUserInfo()) {
                    $url = $url->withUserInfo($userInfo);
                }
            }
        } else {
            // Technically this is not possible with site handling, but kept for backwards-compatibility reasons
            // Once EXT:crawler is v10-only compatible, this should be removed completely
            $baseUrl = ($alternativeBaseUrl ?: GeneralUtility::getIndpEnv('TYPO3_SITE_URL'));
            $cacheHashCalculator = GeneralUtility::makeInstance(CacheHashCalculator::class);
            $queryString .= '&cHash=' . $cacheHashCalculator->generateForParameters($queryString);
            $url = rtrim($baseUrl, '/') . '/index.php' . $queryString;
            $url = new Uri($url);
        }

        if ($httpsOrHttp === -1) {
            $url = $url->withScheme('http');
        } elseif ($httpsOrHttp === 1) {
            $url = $url->withScheme('https');
        }

        return $url;
    }
1890 1
    /**
     * Makes sure the value at index 1 is not larger than the value at index 2,
     * exchanging the two values when necessary. All other entries are left untouched.
     */
    protected function swapIfFirstIsLargerThanSecond(array $reg): array
    {
        if (! ($reg[1] > $reg[2])) {
            return $reg;
        }

        [$reg[1], $reg[2]] = [$reg[2], $reg[1]];

        return $reg;
    }
1901
1902
    /**
     * Returns the backend user, taking it from $GLOBALS['BE_USER'] on first use and
     * keeping it in $this->backendUser afterwards.
     *
     * @return BackendUserAuthentication
     */
    private function getBackendUser()
    {
        // Make sure the _cli_ user is loaded
        Bootstrap::initializeBackendAuthentication();

        if ($this->backendUser !== null) {
            return $this->backendUser;
        }

        $this->backendUser = $GLOBALS['BE_USER'];

        return $this->backendUser;
    }
1914
1915
    /**
     * Get querybuilder for given table
     *
     * @param string $table name of the table the query builder is requested for
     * @return \TYPO3\CMS\Core\Database\Query\QueryBuilder
     */
    private function getQueryBuilder(string $table)
    {
        $connectionPool = GeneralUtility::makeInstance(ConnectionPool::class);

        return $connectionPool->getQueryBuilderForTable($table);
    }
1924
}
1925