Passed
Push — cleanup/crawlercontroller ( c98170...0af167 )
by Tomas Norre
06:32
created

CrawlerController::urlListFromUrlArray()   B

Complexity

Conditions 8
Paths 8

Size

Total Lines 66
Code Lines 38

Duplication

Lines 0
Ratio 0 %

Code Coverage

Tests 33
CRAP Score 8.1458

Importance

Changes 1
Bugs 0 Features 0
Metric Value
cc 8
eloc 38
c 1
b 0
f 0
nc 8
nop 9
dl 0
loc 66
ccs 33
cts 38
cp 0.8684
crap 8.1458
rs 8.0675

How to fix   Long Method    Many Parameters   

Long Method

Small methods make your code easier to understand, in particular if combined with a good name. Besides, if your method is small, finding a good name is usually much easier.

For example, if you find yourself adding comments to a method's body, this is usually a good sign to extract the commented part to a new method, and use the comment as a starting point when coming up with a good name for this new method.

Commonly applied refactorings include Extract Method: move a cohesive fragment of the method body into its own, well-named method.

Many Parameters

Methods with many parameters are not only hard to understand, but their parameters also often become inconsistent when you need more, or different data.

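There are several approaches to avoid long parameter lists, for example Replace Parameter with Method Call, Preserve Whole Object, and Introduce Parameter Object.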

1
<?php
2
3
declare(strict_types=1);
4
5
namespace AOE\Crawler\Controller;
6
7
/***************************************************************
8
 *  Copyright notice
9
 *
10
 *  (c) 2020 AOE GmbH <[email protected]>
11
 *
12
 *  All rights reserved
13
 *
14
 *  This script is part of the TYPO3 project. The TYPO3 project is
15
 *  free software; you can redistribute it and/or modify
16
 *  it under the terms of the GNU General Public License as published by
17
 *  the Free Software Foundation; either version 3 of the License, or
18
 *  (at your option) any later version.
19
 *
20
 *  The GNU General Public License can be found at
21
 *  http://www.gnu.org/copyleft/gpl.html.
22
 *
23
 *  This script is distributed in the hope that it will be useful,
24
 *  but WITHOUT ANY WARRANTY; without even the implied warranty of
25
 *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
26
 *  GNU General Public License for more details.
27
 *
28
 *  This copyright notice MUST APPEAR in all copies of the script!
29
 ***************************************************************/
30
31
use AOE\Crawler\Configuration\ExtensionConfigurationProvider;
32
use AOE\Crawler\Domain\Repository\ConfigurationRepository;
33
use AOE\Crawler\Domain\Repository\ProcessRepository;
34
use AOE\Crawler\Domain\Repository\QueueRepository;
35
use AOE\Crawler\Event\EventDispatcher;
36
use AOE\Crawler\QueueExecutor;
37
use AOE\Crawler\Utility\SignalSlotUtility;
38
use Psr\Http\Message\UriInterface;
39
use Psr\Log\LoggerAwareInterface;
40
use Psr\Log\LoggerAwareTrait;
41
use TYPO3\CMS\Backend\Tree\View\PageTreeView;
42
use TYPO3\CMS\Backend\Utility\BackendUtility;
43
use TYPO3\CMS\Core\Authentication\BackendUserAuthentication;
44
use TYPO3\CMS\Core\Core\Bootstrap;
45
use TYPO3\CMS\Core\Core\Environment;
46
use TYPO3\CMS\Core\Database\Connection;
47
use TYPO3\CMS\Core\Database\ConnectionPool;
48
use TYPO3\CMS\Core\Database\Query\Restriction\DeletedRestriction;
49
use TYPO3\CMS\Core\Http\Uri;
50
use TYPO3\CMS\Core\Imaging\Icon;
51
use TYPO3\CMS\Core\Imaging\IconFactory;
52
use TYPO3\CMS\Core\Routing\SiteMatcher;
53
use TYPO3\CMS\Core\Site\Entity\Site;
54
use TYPO3\CMS\Core\Type\Bitmask\Permission;
55
use TYPO3\CMS\Core\TypoScript\Parser\TypoScriptParser;
56
use TYPO3\CMS\Core\Utility\DebugUtility;
57
use TYPO3\CMS\Core\Utility\GeneralUtility;
58
use TYPO3\CMS\Core\Utility\MathUtility;
59
use TYPO3\CMS\Extbase\Object\ObjectManager;
60
use TYPO3\CMS\Frontend\Page\CacheHashCalculator;
61
use TYPO3\CMS\Frontend\Page\PageRepository;
62
63
/**
64
 * Class CrawlerController
65
 *
66
 * @package AOE\Crawler\Controller
67
 */
68
class CrawlerController implements LoggerAwareInterface
69
{
70
    use LoggerAwareTrait;
71
72
    public const CLI_STATUS_NOTHING_PROCCESSED = 0;
73
    public const CLI_STATUS_REMAIN = 1; //queue not empty
74
    public const CLI_STATUS_PROCESSED = 2; //(some) queue items where processed
75
    public const CLI_STATUS_ABORTED = 4; //instance didn't finish
76
    public const CLI_STATUS_POLLABLE_PROCESSED = 8;
77
78
    /**
79
     * @var integer
80
     */
81
    public $setID = 0;
82
83
    /**
84
     * @var string
85
     */
86
    public $processID = '';
87
88
    /**
89
     * @var array
90
     */
91
    public $duplicateTrack = [];
92
93
    /**
94
     * @var array
95
     */
96
    public $downloadUrls = [];
97
98
    /**
99
     * @var array
100
     */
101
    public $incomingProcInstructions = [];
102
103
    /**
104
     * @var array
105
     */
106
    public $incomingConfigurationSelection = [];
107
108
    /**
109
     * @var bool
110
     */
111
    public $registerQueueEntriesInternallyOnly = false;
112
113
    /**
114
     * @var array
115
     */
116
    public $queueEntries = [];
117
118
    /**
119
     * @var array
120
     */
121
    public $urlList = [];
122
123
    /**
124
     * @var array
125
     */
126
    public $extensionSettings = [];
127
128
    /**
129
     * Mount Point
130
     *
131
     * @var bool
132
     * Todo: Check what this is used for and adjust the type hint or code, as bool doesn't match the current code.
133
     */
134
    public $MP = false;
135
136
    /**
137
     * @var string
138
     */
139
    protected $processFilename;
140
141
    /**
142
     * Holds the internal access mode can be 'gui','cli' or 'cli_im'
143
     *
144
     * @var string
145
     */
146
    protected $accessMode;
147
148
    /**
149
     * @var BackendUserAuthentication|null
150
     */
151
    private $backendUser;
152
153
    /**
154
     * @var integer
155
     */
156
    private $scheduledTime = 0;
157
158
    /**
159
     * @var integer
160
     */
161
    private $reqMinute = 0;
162
163
    /**
164
     * @var bool
165
     */
166
    private $submitCrawlUrls = false;
167
168
    /**
169
     * @var bool
170
     */
171
    private $downloadCrawlUrls = false;
172
173
    /**
174
     * @var QueueRepository
175
     */
176
    protected $queueRepository;
177
178
    /**
179
     * @var ProcessRepository
180
     */
181
    protected $processRepository;
182
183
    /**
184
     * @var ConfigurationRepository
185
     */
186
    protected $configurationRepository;
187
188
    /**
189
     * @var string
190
     */
191
    protected $tableName = 'tx_crawler_queue';
192
193
    /**
194
     * @var QueueExecutor
195
     */
196
    protected $queueExecutor;
197
198
    /**
199
     * @var int
200
     */
201
    protected $maximumUrlsToCompile = 10000;
202
203
    /**
204
     * @var IconFactory
205
     */
206
    protected $iconFactory;
207
208
    /**
     * Returns the internal access mode, which can be 'gui', 'cli' or 'cli_im'.
     *
     * @return string
     */
    public function getAccessMode()
    {
        return $this->accessMode;
    }
217
218
    /**
     * Stores the internal access mode ('gui', 'cli' or 'cli_im').
     *
     * @param string $accessMode
     */
    public function setAccessMode($accessMode): void
    {
        $this->accessMode = $accessMode;
    }
225
226
    /**
     * Toggles the marker file that prevents processes from being processed.
     *
     * Disabling writes an empty file to the configured process filename;
     * enabling removes that file again when it exists.
     *
     * @param bool $disabled (optional, defaults to true)
     * @return void
     */
    public function setDisabled($disabled = true): void
    {
        if ($disabled) {
            GeneralUtility::writeFile($this->processFilename, '');
            return;
        }

        // Re-enable: only remove the marker when it is actually there.
        if (is_file($this->processFilename)) {
            unlink($this->processFilename);
        }
    }
242
243
    /**
     * Tells whether processing is disabled, i.e. whether the marker file
     * at the configured process filename exists.
     *
     * @return bool true if disabled
     */
    public function getDisabled()
    {
        return is_file($this->processFilename);
    }
252
253
    /**
     * Sets the path of the marker file used by setDisabled() / getDisabled().
     *
     * @param string $filenameWithPath
     *
     * @return void
     */
    public function setProcessFilename($filenameWithPath): void
    {
        $this->processFilename = $filenameWithPath;
    }
262
263
    /**
     * Returns the path of the marker file used by setDisabled() / getDisabled().
     *
     * @return string
     */
    public function getProcessFilename()
    {
        return $this->processFilename;
    }
270
271
    /************************************
272
     *
273
     * Getting URLs based on Page TSconfig
274
     *
275
     ************************************/
276
277 45
    /**
     * Resolves the repositories, queue executor and icon factory, sets the default
     * process marker file and loads the extension settings with their defaults.
     */
    public function __construct()
    {
        $objectManager = GeneralUtility::makeInstance(ObjectManager::class);
        $this->queueRepository = $objectManager->get(QueueRepository::class);
        $this->processRepository = $objectManager->get(ProcessRepository::class);
        $this->configurationRepository = $objectManager->get(ConfigurationRepository::class);
        $this->queueExecutor = $objectManager->get(QueueExecutor::class);
        $this->iconFactory = GeneralUtility::makeInstance(IconFactory::class);

        $this->processFilename = Environment::getVarPath() . '/lock/tx_crawler.proc';

        /** @var ExtensionConfigurationProvider $configurationProvider */
        $configurationProvider = GeneralUtility::makeInstance(ExtensionConfigurationProvider::class);
        $settings = $configurationProvider->getExtensionConfiguration();
        // NOTE(review): static analysis reports this check as always true; kept as a defensive guard.
        $this->extensionSettings = is_array($settings) ? $settings : [];

        // Set defaults. Missing keys are read as 0 so that the fallback values below
        // apply without raising undefined-index notices.
        if (MathUtility::convertToPositiveInteger($this->extensionSettings['countInARun'] ?? 0) === 0) {
            $this->extensionSettings['countInARun'] = 100;
        }

        $this->extensionSettings['processLimit'] = MathUtility::forceIntegerInRange(
            $this->extensionSettings['processLimit'] ?? 0,
            1,
            99,
            1
        );
        $this->maximumUrlsToCompile = MathUtility::forceIntegerInRange(
            $this->extensionSettings['maxCompileUrls'] ?? 0,
            1,
            1000000000,
            10000
        );
    }
301
302
    /**
     * Returns the backend user, taking it from $GLOBALS['BE_USER'] on first use
     * and keeping it on the instance afterwards.
     *
     * @return BackendUserAuthentication
     */
    private function getBackendUser()
    {
        // Make sure the _cli_ user is loaded
        Bootstrap::initializeBackendAuthentication();

        $this->backendUser = $this->backendUser ?? $GLOBALS['BE_USER'];

        return $this->backendUser;
    }
314
315
    /**
     * Replaces the extension settings (unserialized pendant of
     * $TYPO3_CONF_VARS['EXT']['extConf']['crawler']) with the given array.
     *
     * @param array $extensionSettings
     * @return void
     */
    public function setExtensionSettings(array $extensionSettings): void
    {
        $this->extensionSettings = $extensionSettings;
    }
325
326
    /**
     * Check if the given page should be crawled.
     *
     * The checks run in this order and the first one that matches wins:
     * hidden page (unless "crawlHiddenPages" is enabled), built-in disallowed doktypes,
     * doktypes excluded via $GLOBALS['TYPO3_CONF_VARS']['EXTCONF']['crawler']['excludeDoktype'],
     * and finally the "pageVeto" hooks.
     *
     * @param array $pageRow Page record; the "hidden" and "doktype" columns are evaluated
     * @return false|string false if the page should be crawled (not excluded), otherwise the skip message
     */
    public function checkIfPageShouldBeSkipped(array $pageRow)
    {
        // Hidden pages are only crawled when explicitly enabled in the extension settings.
        // empty() keeps the original truthiness semantics but tolerates missing keys.
        if (empty($this->extensionSettings['crawlHiddenPages']) && !empty($pageRow['hidden'])) {
            return 'Because page is hidden';
        }

        $doktype = $pageRow['doktype'] ?? '';

        if (GeneralUtility::inList('3,4', (string) $doktype) || (int) $doktype >= 199) {
            return 'Because doktype is not allowed';
        }

        foreach ($GLOBALS['TYPO3_CONF_VARS']['EXTCONF']['crawler']['excludeDoktype'] ?? [] as $key => $doktypeList) {
            if (GeneralUtility::inList($doktypeList, (string) $doktype)) {
                return 'Doktype was excluded by "' . $key . '"';
            }
        }

        // veto hook
        foreach ($GLOBALS['TYPO3_CONF_VARS']['EXTCONF']['crawler']['pageVeto'] ?? [] as $key => $func) {
            $params = [
                'pageRow' => $pageRow,
            ];
            // expects "false" if page is ok and "true" or a skipMessage if this page should _not_ be crawled
            $veto = GeneralUtility::callUserFunction($func, $params, $this);
            if ($veto === false) {
                continue;
            }

            // No need to execute other hooks once one of them returned a veto.
            // The key is cast because hooks registered with numeric keys would otherwise
            // make htmlspecialchars() fail under strict_types.
            return is_string($veto)
                ? $veto
                : 'Veto from hook "' . htmlspecialchars((string) $key) . '"';
        }

        return false;
    }
385
386
    /**
     * Wrapper method for getUrlsForPageId().
     * It returns an array of configurations and no urls!
     *
     * @param array $pageRow Page record with at least dok-type and uid columns.
     * @param string $skipMessage Set by reference: the reason the page was skipped, or '' when it was not
     * @return array Configurations for the page, or an empty array when the page is skipped
     * @see getUrlsForPageId()
     */
    public function getUrlsForPageRow(array $pageRow, &$skipMessage = '')
    {
        $message = $this->checkIfPageShouldBeSkipped($pageRow);

        if ($message !== false) {
            $skipMessage = $message;
            return [];
        }

        // getUrlsForPageId() is declared with an int parameter and this file uses strict_types,
        // so a uid that arrives as a string (e.g. from a database row) has to be cast explicitly.
        $res = $this->getUrlsForPageId((int) $pageRow['uid']);
        $skipMessage = '';

        return $res;
    }
409
410
    /**
     * Creates a list of URLs from input array (and submits them to queue if asked for)
     * See Web > Info module script + "indexed_search"'s crawler hook-client using this!
     *
     * @param array $vv Information about URLs from pageRow to crawl.
     * @param array $pageRow Page row
     * @param int $scheduledTime Unix time to schedule indexing to, typically time()
     * @param int $reqMinute Number of requests per minute (creates the interleave between requests)
     * @param bool $submitCrawlUrls If set, submits the URLs to queue
     * @param bool $downloadCrawlUrls If set (and $submitCrawlUrls is false), fills $downloadUrls with entries
     * @param array $duplicateTrack Passed by reference; holds one key per URL so duplicates are not crawled twice
     * @param array $downloadUrls Passed by reference; filled with URLs for download if the flag is set
     * @param array $incomingProcInstructions Array of processing instructions
     * @return string List of URLs (meant for display in backend module)
     */
    public function urlListFromUrlArray(
        array $vv,
        array $pageRow,
        $scheduledTime,
        $reqMinute,
        $submitCrawlUrls,
        $downloadCrawlUrls,
        array &$duplicateTrack,
        array &$downloadUrls,
        array $incomingProcInstructions
    ) {
        if (!isset($vv['URLs']) || !is_array($vv['URLs'])) {
            return 'ERROR - no URL generated';
        }

        $urlLog = [];
        $pageId = (int) $pageRow['uid'];
        $subCfg = $vv['subCfg'] ?? [];
        $procInstrFilter = $subCfg['procInstrFilter'] ?? '';
        $configurationHash = $this->getConfigurationHash($vv);
        $skipInnerCheck = $this->queueRepository->noUnprocessedQueueEntriesForPageWithConfigurationHashExist($pageId, $configurationHash);

        // The filter does not depend on the individual URL, so it is evaluated once:
        // when it rejects the configuration, no URL of this set is listed.
        if (!$this->drawURLs_PIfilter($procInstrFilter, $incomingProcInstructions)) {
            return '';
        }

        // Seconds between two requests. A non-positive rate would mean a division by zero,
        // so in that case no interleave is applied.
        $secondsPerRequest = $reqMinute > 0 ? 60 / $reqMinute : 0;

        foreach ($vv['URLs'] as $urlQuery) {
            $url = (string) $this->getUrlFromPageAndQueryParameters(
                $pageId,
                $urlQuery,
                $subCfg['baseUrl'] ?? null,
                $subCfg['force_ssl'] ?? 0
            );

            // Create key by which to determine unique-ness:
            $uKey = $url . '|' . ($subCfg['userGroups'] ?? '') . '|' . $procInstrFilter;

            if (isset($duplicateTrack[$uKey])) {
                // If the url key is already registered just display it and do not resubmit it
                $urlLog[] = '<em><span class="text-muted">' . htmlspecialchars($url) . '</span></em>';
            } else {
                // Scheduled time, spread by the number of URLs seen so far and rounded down to the full minute:
                $schTime = $scheduledTime + round(count($duplicateTrack) * $secondsPerRequest);
                $schTime = intval($schTime / 60) * 60;
                $formattedDate = BackendUtility::datetime($schTime);
                $this->urlList[] = '[' . $formattedDate . '] ' . $url;
                $urlList = '[' . $formattedDate . '] ' . htmlspecialchars($url);

                // Submit for crawling!
                if ($submitCrawlUrls) {
                    // NOTE(review): the queue entry is added with $scheduledTime, not with the
                    // interleaved $schTime that is displayed - confirm whether this is intended.
                    $added = $this->addUrl(
                        $pageId,
                        $url,
                        $subCfg,
                        $scheduledTime,
                        $configurationHash,
                        $skipInnerCheck
                    );
                    if ($added === false) {
                        $urlList .= ' (URL already existed)';
                    }
                } elseif ($downloadCrawlUrls) {
                    $downloadUrls[$url] = $url;
                }
                $urlLog[] = $urlList;
            }
            $duplicateTrack[$uKey] = true;
        }

        return implode('<br>', $urlLog);
    }
492
493
    /**
     * Tells whether at least one of the incoming processing instructions is part of
     * the given comma-separated list. An empty set of incoming instructions always passes.
     *
     * @param string $piString PI to test
     * @param array $incomingProcInstructions Processing instructions
     * @return boolean
     */
    public function drawURLs_PIfilter($piString, array $incomingProcInstructions)
    {
        // Nothing requested means nothing to filter on.
        if ($incomingProcInstructions === []) {
            return true;
        }

        $isRegistered = false;
        foreach ($incomingProcInstructions as $instruction) {
            if (GeneralUtility::inList($piString, $instruction)) {
                $isRegistered = true;
                break;
            }
        }

        return $isRegistered;
    }
513
514 3
    /**
     * Loads the page TSconfig for a page and lets the "getPageTSconfigForId" hooks alter it.
     *
     * When a mount point ($this->MP) is set, the TSconfig is read for the id taken from
     * the part after the "-" in that value instead of for $id.
     *
     * NOTE(review): static analysis flags BackendUtility::getPagesTSconfig() as deprecated.
     *
     * @param int $id Page id
     * @return array Page TSconfig, possibly modified by hooks
     */
    public function getPageTSconfigForId($id): array
    {
        if (!$this->MP) {
            $pageTSconfig = BackendUtility::getPagesTSconfig($id);
        } else {
            // presumably MP has the form "<id>-<mountPointId>" - TODO confirm against callers.
            // The value is cast because the property is declared as bool, and a missing
            // second part falls back to 0 instead of raising an undefined-offset notice.
            $mountPointParts = explode('-', (string) $this->MP);
            $mountPointId = (int) ($mountPointParts[1] ?? 0);
            $pageTSconfig = BackendUtility::getPagesTSconfig($mountPointId);
        }

        // Call a hook to alter configuration
        $hooks = $GLOBALS['TYPO3_CONF_VARS']['EXTCONF']['crawler']['getPageTSconfigForId'] ?? null;
        if (is_array($hooks)) {
            $params = [
                'pageId' => $id,
                'pageTSConfig' => &$pageTSconfig,
            ];
            foreach ($hooks as $userFunc) {
                GeneralUtility::callUserFunction($userFunc, $params, $this);
            }
        }

        return $pageTSconfig;
    }
536
537
    /**
     * Collects all crawler configurations that apply to a page.
     *
     * Configurations are read from the "tx_crawler.crawlerCfg.paramSets." section of the
     * page TSconfig first, then from the tx_crawler_configuration records returned by the
     * configuration repository for the page. A record does not overwrite a set that was
     * already defined under the same key. Finally the "processUrls" hooks may alter the result.
     *
     * This method returns an array of configurations. Adds no urls!
     *
     * @param int $pageId Page to collect the configurations for
     * @return array Configurations indexed by their key / name
     */
    public function getUrlsForPageId(int $pageId): array
    {
        // Get page TSconfig for page ID
        $pageTSconfig = $this->getPageTSconfigForId($pageId);

        $res = [];

        // Fetch Crawler Configuration from pageTSconfig
        $crawlerCfg = $pageTSconfig['tx_crawler.']['crawlerCfg.']['paramSets.'] ?? [];
        foreach ($crawlerCfg as $key => $values) {
            if (!is_array($values)) {
                continue;
            }
            $key = str_replace('.', '', (string) $key);

            // Sub configuration for a single configuration string:
            $subCfg = (array) ($crawlerCfg[$key . '.'] ?? []);
            $subCfg['key'] = $key;

            // Normalize the filter list. The values are cast because this file uses
            // strict_types and the keys are optional in TSconfig.
            $procInstrFilter = (string) ($subCfg['procInstrFilter'] ?? '');
            if ($procInstrFilter !== '') {
                $subCfg['procInstrFilter'] = implode(',', GeneralUtility::trimExplode(',', $procInstrFilter));
            }
            $pidsOnly = (string) ($subCfg['pidsOnly'] ?? '');
            $pidOnlyList = implode(',', GeneralUtility::trimExplode(',', $pidsOnly, true));

            // process configuration if it is not page-specific or if the specific page is the current page:
            // TODO: Check if $pidOnlyList can be kept as Array instead of imploded
            if ($pidsOnly !== '' && !GeneralUtility::inList($pidOnlyList, (string) $pageId)) {
                continue;
            }

            // Explode, process etc.:
            $res[$key] = [];
            $res[$key]['subCfg'] = $subCfg;
            $res[$key]['paramParsed'] = GeneralUtility::explodeUrl2Array((string) ($crawlerCfg[$key] ?? ''));
            $res[$key]['paramExpanded'] = $this->expandParameters($res[$key]['paramParsed'], $pageId);
            $res[$key]['origin'] = 'pagets';

            // recognize MP value
            $baseQuery = '?id=' . $pageId;
            if ($this->MP) {
                $baseQuery .= '&MP=' . $this->MP;
            }
            $res[$key]['URLs'] = $this->compileUrls($res[$key]['paramExpanded'], [$baseQuery]);
        }

        // Get configuration from tx_crawler_configuration records up the rootline
        $crawlerConfigurations = $this->configurationRepository->getCrawlerConfigurationRecordsFromRootLine($pageId);
        foreach ($crawlerConfigurations as $configurationRecord) {
            // check access to the configuration record
            $hasAccess = empty($configurationRecord['begroups'])
                || $this->getBackendUser()->isAdmin()
                || $this->hasGroupAccess($this->getBackendUser()->user['usergroup_cached_list'], $configurationRecord['begroups']);
            if (!$hasAccess) {
                continue;
            }

            $pidsOnly = (string) ($configurationRecord['pidsonly'] ?? '');
            $pidOnlyList = implode(',', GeneralUtility::trimExplode(',', $pidsOnly, true));

            // process configuration if it is not page-specific or if the specific page is the current page:
            // TODO: Check if $pidOnlyList can be kept as Array instead of imploded
            if ($pidsOnly !== '' && !GeneralUtility::inList($pidOnlyList, (string) $pageId)) {
                continue;
            }

            $key = $configurationRecord['name'];

            // don't overwrite previously defined paramSets
            if (isset($res[$key])) {
                continue;
            }

            /* @var $TSparserObject \TYPO3\CMS\Core\TypoScript\Parser\TypoScriptParser */
            $TSparserObject = GeneralUtility::makeInstance(TypoScriptParser::class);
            $TSparserObject->parse($configurationRecord['processing_instruction_parameters_ts']);

            $subCfg = [
                'procInstrFilter' => $configurationRecord['processing_instruction_filter'],
                'procInstrParams.' => $TSparserObject->setup,
                'baseUrl' => $configurationRecord['base_url'],
                'force_ssl' => (int) $configurationRecord['force_ssl'],
                'userGroups' => $configurationRecord['fegroups'],
                'exclude' => $configurationRecord['exclude'],
                'key' => $key,
            ];

            // Loose comparison is kept on purpose: the element type returned by
            // expandExcludeString() is not visible here.
            if (in_array($pageId, $this->expandExcludeString($subCfg['exclude']))) {
                continue;
            }

            $res[$key] = [];
            $res[$key]['subCfg'] = $subCfg;
            $res[$key]['paramParsed'] = GeneralUtility::explodeUrl2Array($configurationRecord['configuration']);
            $res[$key]['paramExpanded'] = $this->expandParameters($res[$key]['paramParsed'], $pageId);
            $res[$key]['URLs'] = $this->compileUrls($res[$key]['paramExpanded'], ['?id=' . $pageId]);
            $res[$key]['origin'] = 'tx_crawler_configuration_' . $configurationRecord['uid'];
        }

        // Let registered hooks alter the collected configurations (passed by reference).
        foreach ($GLOBALS['TYPO3_CONF_VARS']['EXTCONF']['crawler']['processUrls'] ?? [] as $func) {
            $params = [
                'res' => &$res,
            ];
            GeneralUtility::callUserFunction($func, $params, $this);
        }

        return $res;
    }
635
636
    /**
     * Find all configurations of subpages of a page.
     *
     * The result contains the paramSet keys from the page TSconfig of $rootid followed by
     * the names of all tx_crawler_configuration records stored on the rootline of $rootid
     * or on the pages of its tree down to $depth.
     *
     * TODO: Write Functional Tests
     *
     * @param int $rootid Page the branch starts at
     * @param int $depth Number of tree levels to include
     * @return array List of configuration names
     */
    public function getConfigurationsForBranch(int $rootid, int $depth): array
    {
        $names = [];

        // 1) paramSets defined in page TSconfig (only the array-valued, i.e. "key." entries)
        $tsConfig = $this->getPageTSconfigForId($rootid);
        $paramSets = $tsConfig['tx_crawler.']['crawlerCfg.']['paramSets.'] ?? [];
        foreach ($paramSets as $setKey => $setValue) {
            if (is_array($setValue)) {
                if (substr($setKey, -1) == '.') {
                    $names[] = substr($setKey, 0, -1);
                } else {
                    $names[] = $setKey;
                }
            }
        }

        // 2) collect the page ids of the rootline ...
        $pageIds = [];
        foreach (BackendUtility::BEgetRootLine($rootid) as $rootLinePage) {
            $pageIds[] = $rootLinePage['uid'];
        }

        // ... and of the page tree below the root page
        /* @var PageTreeView $pageTree */
        $pageTree = GeneralUtility::makeInstance(PageTreeView::class);
        $permsClause = $this->getBackendUser()->getPagePermsClause(Permission::PAGE_SHOW);
        $pageTree->init(empty($permsClause) ? '' : ('AND ' . $permsClause));
        $pageTree->getTree($rootid, $depth, '');
        foreach ($pageTree->tree as $treeNode) {
            $pageIds[] = $treeNode['row']['uid'];
        }

        // 3) names of the configuration records stored on any of those pages
        $queryBuilder = $this->getQueryBuilder('tx_crawler_configuration');
        $pidConstraint = $queryBuilder->expr()->in(
            'pid',
            $queryBuilder->createNamedParameter($pageIds, Connection::PARAM_INT_ARRAY)
        );
        $result = $queryBuilder
            ->select('name')
            ->from('tx_crawler_configuration')
            ->where($pidConstraint)
            ->execute();

        while ($record = $result->fetch()) {
            $names[] = $record['name'];
        }

        return $names;
    }
679
680
    /**
     * Returns a query builder for the given table, obtained from the connection pool.
     *
     * @param string $table
     * @return \TYPO3\CMS\Core\Database\Query\QueryBuilder
     */
    private function getQueryBuilder(string $table)
    {
        $connectionPool = GeneralUtility::makeInstance(ConnectionPool::class);

        return $connectionPool->getQueryBuilderForTable($table);
    }
690
691
    /**
     * Check if a user has access to an item
     * (e.g. get the group list of the current logged in user from $GLOBALS['TSFE']->gr_list)
     *
     * @param string $groupList Comma-separated list of (fe_)group UIDs from a user
     * @param string $accessList Comma-separated list of (fe_)group UIDs of the item to access
     * @return bool TRUE if at least one of the users group UIDs is in the access list or the access list is empty
     * @see \TYPO3\CMS\Frontend\Page\PageRepository::getMultipleGroupsWhereClause()
     */
    public function hasGroupAccess($groupList, $accessList)
    {
        // An item without access restrictions is available to everybody.
        if (empty($accessList)) {
            return true;
        }

        $userGroupUids = GeneralUtility::intExplode(',', $groupList);
        foreach ($userGroupUids as $userGroupUid) {
            if (GeneralUtility::inList($accessList, $userGroupUid)) {
                return true;
            }
        }

        return false;
    }
712
713
    /**
     * Expands the parameter configuration into the individual values of each parameter.
     *
     * Syntax of values:
     * - If the value is wrapped in [...] it is expanded according to the rules below, otherwise it is taken literally
     * - The configuration is split by "|"; the parts are processed individually and their results are added together
     * - For each configuration part:
     *         - "[int]-[int]" = Integer range, expanded to all values in between, boundaries included, from low to high
     *           (max. 1000 values). Example "1-34" or "-40--30"
     *         - "_TABLE:[TCA table name];[_PID:[page id, default is current page]];[_ENABLELANG:1]" = Look up of table
     *           records, filtering out deleted records. Example "_TABLE:tt_content; _PID:123"
     *           Further options read by the lookup: _RECURSIVE, _PIDFIELD, _WHERE, _ADDTABLE and _FIELD (default "uid").
     *           _ENABLELANG:1 picks only original records without their language overlays
     *         - Default: Literal value
     * - After every part the registered "expandParameters" hooks are called
     *
     * @param array $paramArray Array with key (GET var name) and values (value of GET var which is configuration for expansion)
     * @param integer $pid Current page ID
     * @return array The input array where every value has been replaced by the array of its expanded values
     *
     * TODO: Write Functional Tests
     */
    public function expandParameters($paramArray, $pid)
    {
        // Traverse parameter names:
        foreach ($paramArray as $p => $v) {
            // Cast first: with strict_types a non-string scalar (e.g. an integer from configuration) would make trim() throw.
            $v = trim((string)$v);

            // Only values encapsulated in square brackets are expanded, everything else is the literal (only) value.
            if (!(strpos($v, '[') === 0 && substr($v, -1) === ']')) {
                $paramArray[$p] = [$v];
                continue;
            }

            // Take the value inside the brackets and reset the paramArray value as an array.
            $paramArray[$p] = [];

            // Explode parts and traverse them:
            $parts = explode('|', substr($v, 1, -1));
            foreach ($parts as $pV) {
                $trimmedPart = trim($pV);

                // Look for integer range: (fx. 1-34 or -40--30 // reads minus 40 to minus 30)
                if (preg_match('/^(-?[0-9]+)\s*-\s*(-?[0-9]+)$/', $trimmedPart, $reg)) {
                    $reg = $this->swapIfFirstIsLargerThanSecond($reg);

                    // Traverse range, add values:
                    $runAwayBrake = 1000; // Limit to size of range!
                    for ($a = $reg[1]; $a <= $reg[2]; $a++) {
                        $paramArray[$p][] = $a;
                        $runAwayBrake--;
                        if ($runAwayBrake <= 0) {
                            break;
                        }
                    }
                } elseif (strpos($trimmedPart, '_TABLE:') === 0) {
                    $paramArray[$p] = array_merge(
                        $paramArray[$p],
                        $this->lookUpTableValuesForExpansion($pV, $pid)
                    );
                } else {
                    // Just add value:
                    $paramArray[$p][] = $pV;
                }

                // Hook for processing own expandParameters place holder.
                // The registry entry is optional, so read it without triggering an undefined-index warning.
                $hooks = $GLOBALS['TYPO3_CONF_VARS']['SC_OPTIONS']['crawler/class.tx_crawler_lib.php']['expandParameters'] ?? null;
                if (is_array($hooks)) {
                    $_params = [
                        'pObj' => $this,
                        'paramArray' => &$paramArray,
                        'currentKey' => $p,
                        'currentValue' => $pV,
                        'pid' => $pid,
                    ];
                    foreach ($hooks as $_funcRef) {
                        GeneralUtility::callUserFunction($_funcRef, $_params, $this);
                    }
                }
            }

            // Make unique set of values and sort the parameter array by key:
            $paramArray[$p] = array_unique($paramArray[$p]);
            ksort($paramArray);
        }

        return $paramArray;
    }

    /**
     * Resolves one "_TABLE:..." configuration part to the field values of the matching records.
     *
     * @param string $configurationPart Semicolon-separated "key:value" options, e.g. "_TABLE:tt_content; _PID:123"
     * @param integer $pid Current page ID, used when the part has no _PID option
     * @return array Distinct field values of the records found; empty if the table or the field is not configured in TCA
     */
    private function lookUpTableValuesForExpansion(string $configurationPart, $pid): array
    {
        // Parse parameters:
        $subpartParams = [];
        foreach (GeneralUtility::trimExplode(';', $configurationPart) as $spV) {
            // Split on the first ":" only, so values that contain a colon (e.g. in _WHERE) stay intact.
            // A missing value is padded with null, which keeps the isset() checks below false for it.
            [$pKey, $pVal] = array_pad(GeneralUtility::trimExplode(':', $spV, false, 2), 2, null);
            $subpartParams[$pKey] = $pVal;
        }

        // Table exists:
        $table = $subpartParams['_TABLE'] ?? '';
        if (!isset($GLOBALS['TCA'][$table])) {
            return [];
        }

        $lookUpPid = isset($subpartParams['_PID']) ? (int)$subpartParams['_PID'] : (int)$pid;
        $recursiveDepth = isset($subpartParams['_RECURSIVE']) ? (int)$subpartParams['_RECURSIVE'] : 0;
        $pidField = isset($subpartParams['_PIDFIELD']) ? trim($subpartParams['_PIDFIELD']) : 'pid';
        $where = $subpartParams['_WHERE'] ?? '';
        $addTable = $subpartParams['_ADDTABLE'] ?? '';

        // Only "uid" or a column configured in TCA may be selected.
        $fieldName = !empty($subpartParams['_FIELD']) ? $subpartParams['_FIELD'] : 'uid';
        if ($fieldName !== 'uid' && empty($GLOBALS['TCA'][$table]['columns'][$fieldName])) {
            return [];
        }

        if ($recursiveDepth > 0) {
            /** @var \TYPO3\CMS\Core\Database\QueryGenerator $queryGenerator */
            $queryGenerator = GeneralUtility::makeInstance(\TYPO3\CMS\Core\Database\QueryGenerator::class);
            $pidList = $queryGenerator->getTreeList($lookUpPid, $recursiveDepth, 0, 1);
            $pidArray = GeneralUtility::intExplode(',', $pidList);
        } else {
            $pidArray = [(string)$lookUpPid];
        }

        $queryBuilder = $this->getQueryBuilder($table);
        $queryBuilder->getRestrictions()
            ->removeAll()
            ->add(GeneralUtility::makeInstance(DeletedRestriction::class));

        $queryBuilder
            ->select($fieldName)
            ->from($table)
            ->where(
                $queryBuilder->expr()->in($pidField, $queryBuilder->createNamedParameter($pidArray, Connection::PARAM_INT_ARRAY)),
                $where
            );

        if (!empty($addTable)) {
            // TODO: Check if this works as intended!
            $queryBuilder->add('from', $addTable);
        }

        $transOrigPointerField = $GLOBALS['TCA'][$table]['ctrl']['transOrigPointerField'] ?? '';
        if (!empty($subpartParams['_ENABLELANG']) && $transOrigPointerField) {
            // Restrict to records that do not point to a translation original.
            $queryBuilder->andWhere(
                $queryBuilder->expr()->lte(
                    $transOrigPointerField,
                    0
                )
            );
        }

        $statement = $queryBuilder->execute();

        // Collect the values as array keys so duplicates collapse.
        $values = [];
        while ($row = $statement->fetch()) {
            $values[$row[$fieldName]] = true;
        }

        return array_keys($values);
    }
858
859
    /**
     * Compiling URLs from parameter array (output of expandParameters())
     * The number of URLs will be the multiplication of the number of parameter values for each key,
     * but compiling a level stops as soon as the list has grown beyond $this->maximumUrlsToCompile.
     *
     * @param array $paramArray Output of expandParameters(): Array with keys (GET var names) and for each an array of values
     * @param array $urls URLs accumulated in this array (for recursion)
     * @return array
     */
    public function compileUrls($paramArray, array $urls)
    {
        if (empty($paramArray)) {
            return $urls;
        }

        // shift first off stack:
        reset($paramArray);
        // Cast: integer array keys would make rawurlencode() throw under strict_types.
        $varName = (string)key($paramArray);
        $valueSet = array_shift($paramArray);

        // Traverse value set:
        $newUrls = [];
        foreach ($urls as $url) {
            foreach ($valueSet as $val) {
                $val = (string)$val;
                // Empty values do not add a GET parameter, the URL is taken over unchanged.
                $newUrls[] = $val === ''
                    ? $url
                    : $url . '&' . rawurlencode($varName) . '=' . rawurlencode($val);

                if (count($newUrls) > $this->maximumUrlsToCompile) {
                    // Leave both loops: breaking only the inner loop would keep adding
                    // one more URL for every remaining base URL.
                    break 2;
                }
            }
        }

        return $this->compileUrls($paramArray, $newUrls);
    }
890
891
    /************************************
892
     *
893
     * Crawler log
894
     *
895
     ************************************/
896
897
    /**
     * Returns the crawler queue records of a page, or flushes queue entries instead.
     *
     * @param integer $id Page ID for which to look up log entries.
     * @param string $filter Filter: "all" => all entries, "pending" => all that is not yet run, "finished" => all complete ones
     * @param boolean $doFlush If TRUE, entries are DELETED(!) instead of selected. The page ID is ignored when flushing,
     *                         only the "pending"/"finished" filter is applied.
     * @param boolean $doFullFlush If TRUE (together with $doFlush), the filter is ignored as well and "1=1" is flushed
     * @param integer $itemsPerPage Limit the amount of entries per page, default is 10; values <= 0 disable the limit
     * @return array The selected records, or an empty array after flushing
     */
    public function getLogEntriesForPageId($id, $filter = '', $doFlush = false, $doFullFlush = false, $itemsPerPage = 10)
    {
        $selectQuery = GeneralUtility::makeInstance(ConnectionPool::class)->getQueryBuilderForTable($this->tableName);
        $pageConstraint = $selectQuery->expr()->eq(
            'page_id',
            $selectQuery->createNamedParameter($id, \PDO::PARAM_INT)
        );
        $selectQuery
            ->select('*')
            ->from($this->tableName)
            ->where($pageConstraint)
            ->orderBy('scheduled', 'DESC');

        $expressionBuilder = GeneralUtility::makeInstance(ConnectionPool::class)
            ->getConnectionForTable($this->tableName)
            ->getExpressionBuilder();
        $flushConstraint = $expressionBuilder->andX();

        // The filter narrows both the select query and the WHERE string used for flushing.
        // Any other filter value (including "all") leaves both untouched.
        $flushWhere = '1=1';
        switch ($filter) {
            case 'pending':
                $selectQuery->andWhere($selectQuery->expr()->eq('exec_time', 0));
                $flushWhere .= ' AND ' . $flushConstraint->add($expressionBuilder->eq('exec_time', 0));
                break;
            case 'finished':
                $selectQuery->andWhere($selectQuery->expr()->gt('exec_time', 0));
                $flushWhere .= ' AND ' . $flushConstraint->add($expressionBuilder->gt('exec_time', 0));
                break;
        }

        // FIXME: Write unit test that ensures that the right records are deleted.
        if ($doFlush) {
            // We do currently ignore PageId by flush.
            // To have pending and finished parameters accepted
            // 2020.04.11 - Tomas Mikkelsen
            $this->flushQueue($doFullFlush ? '1=1' : $flushWhere);
            return [];
        }

        if ($itemsPerPage > 0) {
            $selectQuery->setMaxResults((int)$itemsPerPage);
        }

        return $selectQuery->execute()->fetchAll();
    }
957
958
    /**
     * Returns the crawler queue records of a set, or flushes queue entries instead.
     *
     * @param int $set_id Set ID for which to look up log entries.
     * @param string $filter Filter: "all" => all entries, "pending" => all that is not yet run, "finished" => all complete ones
     * @param bool $doFlush If TRUE, the entries of the set (narrowed by the filter) are DELETED(!) instead of selected
     * @param bool $doFullFlush If TRUE (together with $doFlush), an empty condition is passed to flushQueue()
     * @param int $itemsPerPage Limit the amount of entries per page, default is 10; values <= 0 disable the limit
     * @return array The selected records, or an empty array after flushing
     */
    public function getLogEntriesForSetId(int $set_id, string $filter = '', bool $doFlush = false, bool $doFullFlush = false, int $itemsPerPage = 10)
    {
        $selectQuery = GeneralUtility::makeInstance(ConnectionPool::class)->getQueryBuilderForTable($this->tableName);
        $setConstraint = $selectQuery->expr()->eq(
            'set_id',
            $selectQuery->createNamedParameter($set_id, \PDO::PARAM_INT)
        );
        $selectQuery
            ->select('*')
            ->from($this->tableName)
            ->where($setConstraint)
            ->orderBy('scheduled', 'DESC');

        $expressionBuilder = GeneralUtility::makeInstance(ConnectionPool::class)
            ->getConnectionForTable($this->tableName)
            ->getExpressionBuilder();
        // Collects the constraints for flushing; add() appends to this expression.
        $flushConstraint = $expressionBuilder->andX();

        // FIXME: Write Unit tests for Filters
        switch ($filter) {
            case 'pending':
                $selectQuery->andWhere($selectQuery->expr()->eq('exec_time', 0));
                $flushConstraint->add($expressionBuilder->eq('exec_time', 0));
                break;
            case 'finished':
                $selectQuery->andWhere($selectQuery->expr()->gt('exec_time', 0));
                $flushConstraint->add($expressionBuilder->gt('exec_time', 0));
                break;
        }

        // FIXME: Write unit test that ensures that the right records are deleted.
        if ($doFlush) {
            $flushWhere = $flushConstraint->add($expressionBuilder->eq('set_id', (int)$set_id));
            $this->flushQueue($doFullFlush ? '' : $flushWhere);
            return [];
        }

        if ($itemsPerPage > 0) {
            $selectQuery->setMaxResults((int)$itemsPerPage);
        }

        return $selectQuery->execute()->fetchAll();
    }
1011
1012
    /**
     * Removes queue entries.
     *
     * If an observer is registered for the "queueEntryFlush" event, the event is posted once per
     * affected set_id (together with the entries of that set) before the entries are deleted.
     *
     * @param string $where SQL related filter for the entries which should be removed; an empty value removes all entries
     * @return void
     */
    protected function flushQueue($where = ''): void
    {
        $realWhere = strlen((string)$where) > 0 ? $where : '1=1';

        if (EventDispatcher::getInstance()->hasObserver('queueEntryFlush')) {
            // Every statement gets its own QueryBuilder, so no query parts leak from one statement into the next.
            // GROUP BY yields the distinct set ids; select('DISTINCT set_id') would be quoted as one identifier.
            $groups = $this->getQueryBuilder($this->tableName)
                ->select('set_id')
                ->from($this->tableName)
                ->where($realWhere)
                ->groupBy('set_id')
                ->execute()
                ->fetchAll();

            foreach ($groups as $group) {
                $subSetQueryBuilder = $this->getQueryBuilder($this->tableName);
                // NOTE(review): selects "qid" because every other query on the queue table in this class
                // uses qid as the record key; the former "uid" is presumably not a column of that table - confirm.
                $subSet = $subSetQueryBuilder
                    ->select('qid', 'set_id')
                    ->from($this->tableName)
                    ->where(
                        $realWhere,
                        $subSetQueryBuilder->expr()->eq(
                            'set_id',
                            $subSetQueryBuilder->createNamedParameter((int)$group['set_id'], \PDO::PARAM_INT)
                        )
                    )
                    ->execute()
                    ->fetchAll();
                EventDispatcher::getInstance()->post('queueEntryFlush', $group['set_id'], $subSet);
            }
        }

        $this->getQueryBuilder($this->tableName)
            ->delete($this->tableName)
            ->where($realWhere)
            ->execute();
    }
1052
1053
    /**
     * Adds a call back entry to the queue (called from hooks typically, see indexed search class "class.crawler.php")
     *
     * @param integer $setId Set ID
     * @param array $params Parameters to pass to call back function; anything but an array is replaced by an empty array
     * @param string $callBack Call back object reference, eg. 'EXT:indexed_search/class.crawler.php:&tx_indexedsearch_crawler'
     * @param integer $page_id Page ID to attach it to
     * @param integer $schedule Time at which to activate; 0 schedules the entry for the current time
     * @return void
     */
    public function addQueueEntry_callBack($setId, $params, $callBack, $page_id = 0, $schedule = 0): void
    {
        $parameters = is_array($params) ? $params : [];
        // The call back reference travels inside the serialized parameters.
        $parameters['_CALLBACKOBJ'] = $callBack;

        $scheduledTime = (int)$schedule;
        if (!$scheduledTime) {
            $scheduledTime = $this->getCurrentTime();
        }

        $queueRecord = [
            'page_id' => (int)$page_id,
            'parameters' => serialize($parameters),
            'scheduled' => $scheduledTime,
            'exec_time' => 0,
            'set_id' => (int)$setId,
            'result_data' => '',
        ];

        $queueConnection = GeneralUtility::makeInstance(ConnectionPool::class)->getConnectionForTable('tx_crawler_queue');
        $queueConnection->insert('tx_crawler_queue', $queueRecord);
    }
1083
1084
    /************************************
1085
     *
1086
     * URL setting
1087
     *
1088
     ************************************/
1089
1090
    /**
     * Setting a URL for crawling:
     *
     * Builds the queue record for the URL and either registers it internally only
     * ($this->registerQueueEntriesInternallyOnly) or inserts it into the queue table,
     * unless an equal entry is already queued.
     *
     * @param integer $id Page ID
     * @param string $url Complete URL
     * @param array $subCfg Sub configuration array (from TS config); the keys "userGroups", "procInstrFilter",
     *                      "procInstrParams." and "key" are read and may be absent
     * @param integer $tstamp Scheduled-time
     * @param string $configurationHash (optional) configuration hash
     * @param bool $skipInnerDuplicationCheck (optional) skip inner duplication check
     * @return bool TRUE only if a new entry was inserted into the queue table
     */
    public function addUrl(
        $id,
        $url,
        array $subCfg,
        $tstamp,
        $configurationHash = '',
        $skipInnerDuplicationCheck = false
    ) {
        // Creating parameters:
        $parameters = [
            'url' => $url,
        ];

        // fe user group simulation:
        $uGs = implode(',', array_unique(GeneralUtility::intExplode(',', $subCfg['userGroups'] ?? '', true)));
        if ($uGs) {
            $parameters['feUserGroupList'] = $uGs;
        }

        // Setting processing instructions
        $parameters['procInstructions'] = GeneralUtility::trimExplode(',', $subCfg['procInstrFilter'] ?? '');
        if (is_array($subCfg['procInstrParams.'] ?? null)) {
            $parameters['procInstrParams'] = $subCfg['procInstrParams.'];
        }

        // Compile value array:
        $parameters_serialized = serialize($parameters);
        $fieldArray = [
            'page_id' => (int)$id,
            'parameters' => $parameters_serialized,
            'parameters_hash' => GeneralUtility::shortMD5($parameters_serialized),
            'configuration_hash' => $configurationHash,
            'scheduled' => $tstamp,
            'exec_time' => 0,
            'set_id' => (int)$this->setID,
            'result_data' => '',
            // An absent key stays null, exactly as before, but no longer raises an undefined-index warning.
            'configuration' => $subCfg['key'] ?? null,
        ];

        if ($this->registerQueueEntriesInternallyOnly) {
            // the entries will only be registered and not stored to the database
            $this->queueEntries[] = $fieldArray;
            return false;
        }

        $rows = [];
        if (!$skipInnerDuplicationCheck) {
            // check if there is already an equal entry
            $rows = $this->getDuplicateRowsIfExist($tstamp, $fieldArray);
        }

        if (!empty($rows)) {
            EventDispatcher::getInstance()->post('duplicateUrlInQueue', strval($this->setID), ['rows' => $rows, 'fieldArray' => $fieldArray]);
            return false;
        }

        $connectionForCrawlerQueue = GeneralUtility::makeInstance(ConnectionPool::class)->getConnectionForTable('tx_crawler_queue');
        $connectionForCrawlerQueue->insert(
            'tx_crawler_queue',
            $fieldArray
        );
        $uid = $connectionForCrawlerQueue->lastInsertId('tx_crawler_queue', 'qid');
        EventDispatcher::getInstance()->post('urlAddedToQueue', strval($this->setID), ['uid' => $uid, 'fieldArray' => $fieldArray]);

        return true;
    }
1169
1170
    /**
     * This method determines duplicates for a queue entry with the same parameters and this timestamp.
     * If the timestamp is not in the future, it checks for any unprocessed queue entry that is due
     * (or, with the "enableTimeslot" setting, also scheduled within 100 seconds around the current time).
     * If the timestamp is in the future, the queued entry must have exactly the same timestamp.
     *
     * Only entries without exec_time and process_id that match the page_id and parameters_hash
     * of $fieldArray are considered.
     *
     * @param int $tstamp
     * @param array $fieldArray Queue record; "page_id" and "parameters_hash" are read
     *
     * @return array The qid values of the duplicate entries
     */
    protected function getDuplicateRowsIfExist($tstamp, $fieldArray)
    {
        // The parameter is untyped; normalise it before it is compared and bound to the query.
        $tstamp = (int)$tstamp;
        $currentTime = $this->getCurrentTime();

        $queryBuilder = GeneralUtility::makeInstance(ConnectionPool::class)->getQueryBuilderForTable($this->tableName);
        $queryBuilder
            ->select('qid')
            ->from('tx_crawler_queue');

        if ($tstamp > $currentTime) {
            // entry with a timestamp in the future need to have the same schedule time
            $queryBuilder
                ->where(
                    $queryBuilder->expr()->eq('scheduled', $queryBuilder->createNamedParameter($tstamp, \PDO::PARAM_INT))
                );
        } elseif (!empty($this->extensionSettings['enableTimeslot'])) {
            // this entry is scheduled with "now" and the timeslot setting is active
            $timeBegin = $currentTime - 100;
            $timeEnd = $currentTime + 100;
            $queryBuilder
                ->where(
                    'scheduled BETWEEN ' . $timeBegin . ' AND ' . $timeEnd . ''
                )
                ->orWhere(
                    $queryBuilder->expr()->lte('scheduled', $currentTime)
                );
        } else {
            // this entry is scheduled with "now"
            $queryBuilder
                ->where(
                    $queryBuilder->expr()->lte('scheduled', $currentTime)
                );
        }

        $queryBuilder
            ->andWhere('NOT exec_time')
            ->andWhere('NOT process_id')
            ->andWhere($queryBuilder->expr()->eq('page_id', $queryBuilder->createNamedParameter($fieldArray['page_id'], \PDO::PARAM_INT)))
            ->andWhere($queryBuilder->expr()->eq('parameters_hash', $queryBuilder->createNamedParameter($fieldArray['parameters_hash'], \PDO::PARAM_STR)));

        $statement = $queryBuilder->execute();

        $rows = [];
        while ($row = $statement->fetch()) {
            $rows[] = $row['qid'];
        }

        return $rows;
    }
1230
1231
    /**
     * Provides the current system time as Unix timestamp.
     *
     * @return int
     */
    public function getCurrentTime()
    {
        $now = time();

        return $now;
    }
1240
1241
    /************************************
1242
     *
1243
     * URL reading
1244
     *
1245
     ************************************/
1246
1247
    /**
     * Read URL for single queue entry
     *
     * Locks the entry by setting its exec_time, executes it through the queue executor and
     * stores the serialized result in the entry. Signals are emitted before and after processing.
     *
     * @param integer $queueId
     * @param boolean $force If set, will process even if exec_time has been set!
     * @return integer 0, or self::CLI_STATUS_POLLABLE_PROCESSED if a registered pollable instruction reported success.
     *                 0 is also returned when no matching queue entry was found.
     */
    public function readUrl($queueId, $force = false)
    {
        $ret = 0;
        $this->logger->debug('crawler-readurl start ' . microtime(true));

        // Get entry:
        $queryBuilder = GeneralUtility::makeInstance(ConnectionPool::class)->getQueryBuilderForTable($this->tableName);
        $queryBuilder
            ->select('*')
            ->from('tx_crawler_queue')
            ->where(
                $queryBuilder->expr()->eq('qid', $queryBuilder->createNamedParameter($queueId, \PDO::PARAM_INT))
            );
        if (!$force) {
            $queryBuilder
                ->andWhere('exec_time = 0')
                ->andWhere('process_scheduled > 0');
        }
        $queueRec = $queryBuilder->execute()->fetch();

        if (!is_array($queueRec)) {
            // Nothing to process; return the documented integer instead of null.
            return $ret;
        }

        SignalSlotUtility::emitSignal(
            __CLASS__,
            SignalSlotUtility::SIGNAL_QUEUEITEM_PREPROCESS,
            [$queueId, &$queueRec]
        );

        // Set exec_time to lock record:
        $field_array = ['exec_time' => $this->getCurrentTime()];

        if (isset($this->processID)) {
            //if multiprocessing is used we need to store the id of the process which has handled this entry
            $field_array['process_id_completed'] = $this->processID;
        }

        GeneralUtility::makeInstance(ConnectionPool::class)->getConnectionForTable('tx_crawler_queue')
            ->update(
                'tx_crawler_queue',
                $field_array,
                ['qid' => (int)$queueId]
            );

        $result = $this->queueExecutor->executeQueueItem($queueRec, $this);

        // Normalise the result data to an array: a missing or non-array payload must not be
        // indexed with string keys below (that fails on a string and throws on PHP 8).
        $resultData = [];
        $content = is_array($result) ? ($result['content'] ?? null) : null;
        if ($content !== null) {
            // NOTE(review): unserialize() is applied to the content delivered by the queue executor;
            // if that content can come from an untrusted source this needs hardening - confirm.
            $unserialized = unserialize($content);
            if (is_array($unserialized)) {
                $resultData = $unserialized;
            }
        }

        //atm there's no need to point to specific pollable extensions
        $pollables = $GLOBALS['TYPO3_CONF_VARS']['EXTCONF']['crawler']['pollSuccess'] ?? null;
        $procInstructions = $resultData['parameters']['procInstructions'] ?? null;
        if (is_array($pollables) && is_array($procInstructions)) {
            foreach ($pollables as $pollable) {
                // only check the success value if the instruction is running
                // it is important to name the pollSuccess key same as the procInstructions key
                if (in_array($pollable, $procInstructions, true) && !empty($resultData['success'][$pollable])) {
                    $ret |= self::CLI_STATUS_POLLABLE_PROCESSED;
                }
            }
        }

        // Set result in log which also denotes the end of the processing of this entry.
        $field_array = ['result_data' => serialize($result)];

        SignalSlotUtility::emitSignal(
            __CLASS__,
            SignalSlotUtility::SIGNAL_QUEUEITEM_POSTPROCESS,
            [$queueId, &$field_array]
        );

        GeneralUtility::makeInstance(ConnectionPool::class)->getConnectionForTable('tx_crawler_queue')
            ->update(
                'tx_crawler_queue',
                $field_array,
                ['qid' => (int)$queueId]
            );

        $this->logger->debug('crawler-readurl stop ' . microtime(true));
        return $ret;
    }
1342
1343
    /**
     * Read URL for not-yet-inserted log-entry
     *
     * Inserts the entry into the queue table with exec_time set, executes it through the
     * queue executor and stores the serialized result in the new entry.
     *
     * @param array $field_array Queue field array,
     *
     * @return array|bool The result as returned by the queue executor (static analysis reports array|boolean)
     */
    public function readUrlFromArray($field_array)
    {
        // Set exec_time to lock record:
        $field_array['exec_time'] = $this->getCurrentTime();

        $queueConnection = GeneralUtility::makeInstance(ConnectionPool::class)->getConnectionForTable($this->tableName);
        $queueConnection->insert($this->tableName, $field_array);

        $queueId = $queueConnection->lastInsertId($this->tableName, 'qid');
        $field_array['qid'] = $queueId;

        $result = $this->queueExecutor->executeQueueItem($field_array, $this);

        // Set result in log which also denotes the end of the processing of this entry.
        // The array is handed to the signal by reference, so slots can adjust what gets stored.
        $resultFields = ['result_data' => serialize($result)];
        SignalSlotUtility::emitSignal(
            __CLASS__,
            SignalSlotUtility::SIGNAL_QUEUEITEM_POSTPROCESS,
            [$queueId, &$resultFields]
        );

        $queueConnection->update($this->tableName, $resultFields, ['qid' => $queueId]);

        return $result;
    }
1380
1381
    /*****************************
1382
     *
1383
     * Compiling URLs to crawl - tools
1384
     *
1385
     *****************************/
1386
1387
    /**
     * Builds the page tree below a root page and compiles the crawler rows for every page in it.
     *
     * The given options are stored on the controller first, because
     * drawURLs_addRowsForPage() reads them from there. For pages of type
     * "mount point" the rows of the mounted page tree are added as well.
     *
     * @param int $id Root page id to start from.
     * @param int $depth Depth of tree, 0=only id-page, 1= on sublevel, 99 = infinite
     * @param int $scheduledTime Unix Time when the URL is timed to be visited when put in queue
     * @param int $reqMinute Number of requests per minute (creates the interleave between requests)
     * @param bool $submitCrawlUrls If set, submits the URLs to queue in database (real crawling)
     * @param bool $downloadCrawlUrls If set (and submitcrawlUrls is false) will fill $downloadUrls with entries)
     * @param array $incomingProcInstructions Array of processing instructions
     * @param array $configurationSelection Array of configuration keys
     * @return string Concatenated output of drawURLs_addRowsForPage() for all pages of the tree
     */
    public function getPageTreeAndUrls(
        $id,
        $depth,
        $scheduledTime,
        $reqMinute,
        $submitCrawlUrls,
        $downloadCrawlUrls,
        array $incomingProcInstructions,
        array $configurationSelection
    ) {
        $this->scheduledTime = $scheduledTime;
        $this->reqMinute = $reqMinute;
        $this->submitCrawlUrls = $submitCrawlUrls;
        $this->downloadCrawlUrls = $downloadCrawlUrls;
        $this->incomingProcInstructions = $incomingProcInstructions;
        $this->incomingConfigurationSelection = $configurationSelection;

        $this->duplicateTrack = [];
        $this->downloadUrls = [];

        // Drawing tree:
        /* @var PageTreeView $tree */
        $tree = GeneralUtility::makeInstance(PageTreeView::class);
        $perms_clause = $this->getBackendUser()->getPagePermsClause(Permission::PAGE_SHOW);
        $tree->init('AND ' . $perms_clause);

        $pageInfo = BackendUtility::readPageAccess($id, $perms_clause);
        if (is_array($pageInfo)) {
            // Set root row:
            $tree->tree[] = [
                'row' => $pageInfo,
                'HTML' => $this->iconFactory->getIconForRecord('pages', $pageInfo, Icon::SIZE_SMALL),
            ];
        }

        // Get branch beneath:
        if ($depth) {
            $tree->getTree($id, $depth, '');
        }

        // Traverse page tree:
        $code = '';

        foreach ($tree->tree as $data) {
            $this->MP = false;

            // recognize mount points
            if ($data['row']['doktype'] == PageRepository::DOKTYPE_MOUNTPOINT) {
                $queryBuilder = GeneralUtility::makeInstance(ConnectionPool::class)->getQueryBuilderForTable('pages');
                $queryBuilder->getRestrictions()->removeAll()->add(GeneralUtility::makeInstance(DeletedRestriction::class));
                $mountpage = $queryBuilder
                    ->select('*')
                    ->from('pages')
                    ->where(
                        $queryBuilder->expr()->eq('uid', $queryBuilder->createNamedParameter($data['row']['uid'], \PDO::PARAM_INT))
                    )
                    ->execute()
                    ->fetchAll();
                $queryBuilder->resetRestrictions();

                // The lookup can come back empty (e.g. the record was removed in the meantime).
                // Without this guard every access below would read an undefined offset; in that
                // case the page is simply handled like a regular page with MP left at false.
                if (!empty($mountpage[0])) {
                    // fetch mounted pages
                    $this->MP = $mountpage[0]['mount_pid'] . '-' . $data['row']['uid'];

                    $mountTree = GeneralUtility::makeInstance(PageTreeView::class);
                    $mountTree->init('AND ' . $perms_clause);
                    $mountTree->getTree($mountpage[0]['mount_pid'], $depth);

                    foreach ($mountTree->tree as $mountData) {
                        $code .= $this->drawURLs_addRowsForPage(
                            $mountData['row'],
                            $mountData['HTML'] . BackendUtility::getRecordTitle('pages', $mountData['row'], true)
                        );
                    }

                    // replace page when mount_pid_ol is enabled
                    if ($mountpage[0]['mount_pid_ol']) {
                        $data['row']['uid'] = $mountpage[0]['mount_pid'];
                    } else {
                        // if the mount_pid_ol is not set the MP must not be used for the mountpoint page
                        $this->MP = false;
                    }
                }
            }

            $code .= $this->drawURLs_addRowsForPage(
                $data['row'],
                $data['HTML'] . BackendUtility::getRecordTitle('pages', $data['row'], true)
            );
        }

        return $code;
    }
1489
1490
    /**
     * Expands an exclude string into the list of page ids it covers.
     *
     * The string is a comma separated list of parts in the form "pid" or
     * "pid+depth". A part without depth covers only the page itself; a part
     * with a "+" but no (or a zero) depth is expanded with depth 99. Results
     * are cached per exclude string, and fetched trees per pid/depth, in
     * static variables for the lifetime of the request.
     *
     * @param string $excludeString Exclude string
     * @return array Unique list of page ids (the given pids plus the uids found in their sub trees)
     */
    public function expandExcludeString($excludeString)
    {
        // internal static caches;
        static $expandedExcludeStringCache;
        static $treeCache;

        if (empty($expandedExcludeStringCache[$excludeString])) {
            $pidList = [];

            if (!empty($excludeString)) {
                /** @var PageTreeView $tree */
                $tree = GeneralUtility::makeInstance(PageTreeView::class);
                $tree->init('AND ' . $this->getBackendUser()->getPagePermsClause(Permission::PAGE_SHOW));

                $excludeParts = GeneralUtility::trimExplode(',', $excludeString);

                foreach ($excludeParts as $excludePart) {
                    // Pad to two elements: a part without "+" yields a single element only and
                    // destructuring it directly would read an undefined offset for $depth.
                    [$pid, $depth] = array_pad(GeneralUtility::trimExplode('+', $excludePart), 2, '');

                    // default is "page only" = "depth=0"
                    if (empty($depth)) {
                        $depth = strpos($excludePart, '+') !== false ? 99 : 0;
                    }
                    // Normalise to int so the comparison and the cache key below are unambiguous
                    $depth = (int)$depth;

                    $pidList[] = $pid;

                    if ($depth > 0) {
                        if (empty($treeCache[$pid][$depth])) {
                            $tree->reset();
                            $tree->getTree($pid, $depth);
                            $treeCache[$pid][$depth] = $tree->tree;
                        }

                        foreach ($treeCache[$pid][$depth] as $data) {
                            $pidList[] = $data['row']['uid'];
                        }
                    }
                }
            }

            $expandedExcludeStringCache[$excludeString] = array_unique($pidList);
        }

        return $expandedExcludeStringCache[$excludeString];
    }
1541
1542
    /**
     * Renders the table rows shown for a single page of the page tree.
     *
     * One row is produced per crawler configuration that applies to the page
     * (optionally narrowed to the incoming configuration selection). Each row
     * lists the parsed and expanded GET parameters, the resulting URL list and
     * the configuration options. A page without any configuration gets a single
     * "No entries" row.
     *
     * @param array $pageRow Page record the rows are built for
     * @param string $pageTitle Ready-to-output title cell content (inserted as is)
     * @return string HTML <tr> rows
     */
    public function drawURLs_addRowsForPage(array $pageRow, string $pageTitle): string
    {
        $skipMessage = '';
        $configurations = $this->getUrlsForPageRow($pageRow, $skipMessage);

        // Drop every configuration that is not part of the current selection
        if (!empty($this->incomingConfigurationSelection)) {
            foreach ($configurations as $configurationKey => $ignored) {
                if (!in_array($configurationKey, $this->incomingConfigurationSelection)) {
                    unset($configurations[$configurationKey]);
                }
            }
        }

        // Nothing to show for this page: single informational row
        if (empty($configurations)) {
            $message = '';
            if (!empty($skipMessage)) {
                $message = ' (' . $skipMessage . ')';
            }

            return '
                <tr>
                    <td>' . $pageTitle . '</td>
                    <td colspan="6"><em>No entries</em>' . $message . '</td>
                </tr>';
        }

        $rows = '';
        $rowIndex = 0;
        $rowCount = count($configurations);

        foreach ($configurations as $confKey => $confArray) {
            // The title cell spans all rows of the page and is only emitted with the first one
            $titleCell = $rowIndex === 0
                ? '<td rowspan="' . $rowCount . '">' . $pageTitle . '</td>'
                : '';

            if (in_array($pageRow['uid'], $this->expandExcludeString($confArray['subCfg']['exclude']))) {
                $rows .= '<tr>
                            ' . $titleCell . '
                            <td>' . htmlspecialchars($confKey) . '</td>
                            <td colspan="5"><em>No entries</em> (Page is excluded in this configuration)</td>
                        </tr>';
                $rowIndex++;
                continue;
            }

            // URL list:
            $urlList = $this->urlListFromUrlArray(
                $confArray,
                $pageRow,
                $this->scheduledTime,
                $this->reqMinute,
                $this->submitCrawlUrls,
                $this->downloadCrawlUrls,
                $this->duplicateTrack,
                $this->downloadUrls,
                $this->incomingProcInstructions // if empty the urls won't be filtered by processing instructions
            );

            // Expanded parameters, plus the number of combinations they multiply out to
            $expandedRows = '';
            $valueCounts = [];
            $combinations = 1;
            foreach ($confArray['paramExpanded'] as $gVar => $gVal) {
                $valueCount = count($gVal);
                $expandedRows .= '
                            <tr>
                                <td>' . htmlspecialchars('&' . $gVar . '=') . '<br/>' .
                    '(' . $valueCount . ')' .
                    '</td>
                                <td nowrap="nowrap">' . nl2br(htmlspecialchars(implode(chr(10), $gVal))) . '</td>
                            </tr>
                        ';
                $combinations *= $valueCount;
                $valueCounts[] = $valueCount;
            }
            $paramExpanded = '<table>' . $expandedRows . '</table>'
                . 'Comb: ' . implode('*', $valueCounts) . '=' . $combinations;

            // Options
            $optionValues = '';
            if ($confArray['subCfg']['userGroups']) {
                $optionValues .= 'User Groups: ' . $confArray['subCfg']['userGroups'] . '<br/>';
            }
            if ($confArray['subCfg']['procInstrFilter']) {
                $optionValues .= 'ProcInstr: ' . $confArray['subCfg']['procInstrFilter'] . '<br/>';
            }

            // Compile row:
            $rows .= '
                        <tr>
                            ' . $titleCell . '
                            <td>' . htmlspecialchars($confKey) . '</td>
                            <td>' . nl2br(htmlspecialchars(rawurldecode(trim(str_replace('&', chr(10) . '&', GeneralUtility::implodeArrayForUrl('', $confArray['paramParsed'])))))) . '</td>
                            <td>' . $paramExpanded . '</td>
                            <td nowrap="nowrap">' . $urlList . '</td>
                            <td nowrap="nowrap">' . $optionValues . '</td>
                            <td nowrap="nowrap">' . DebugUtility::viewArray($confArray['subCfg']['procInstrParams.']) . '</td>
                        </tr>';

            $rowIndex++;
        }

        return $rows;
    }
1652
1653
    /*****************************
1654
     *
1655
     * CLI functions
1656
     *
1657
     *****************************/
1658
1659
    /**
     * Processes a batch of queue entries from the command line.
     *
     * Runs the CLI hooks, cleans up the queue, claims up to $countInARun
     * entries for the current process id and crawls them one by one via
     * readUrl(). The run stops early when the crawler gets disabled or the
     * process is no longer active.
     *
     * @param int $countInARun Maximum number of queue entries to fetch
     * @param int $sleepTime Microseconds passed to usleep() after each entry
     * @param int $sleepAfterFinish Seconds passed to sleep() after the batch
     * @return int Bitmask: the OR-ed readUrl() results combined with the CLI_STATUS_* flags
     */
    public function CLI_run(int $countInARun, int $sleepTime, int $sleepAfterFinish): int
    {
        $result = 0;
        $counter = 0;

        // First, run hooks:
        $this->CLI_runHooks();

        // Clean up the queue
        $this->queueRepository->cleanupQueue();

        // Select entries:
        $rows = $this->queueRepository->fetchRecordsToBeCrawled($countInARun);

        if (empty($rows)) {
            $this->CLI_debug("Nothing within queue which needs to be processed (" . $this->CLI_buildProcessId() . ")");
            return $result;
        }

        $queueIds = [];
        foreach ($rows as $row) {
            $queueIds[] = $row['qid'];
        }

        $processId = $this->CLI_buildProcessId();

        // Remember how many entries were assigned, to determine later how many have been processed
        $assignedCount = $this->queueRepository->updateProcessIdAndSchedulerForQueueIds($queueIds, $processId);
        $this->processRepository->updateProcessAssignItemsCount($assignedCount, $processId);

        if ($assignedCount !== count($queueIds)) {
            $this->CLI_debug("Nothing processed due to multi-process collision (" . $this->CLI_buildProcessId() . ")");
            return ($result | self::CLI_STATUS_ABORTED);
        }

        foreach ($rows as $row) {
            $result |= $this->readUrl($row['qid']);

            $counter++;
            usleep($sleepTime); // Just to relax the system

            // If the CLI got disabled while this entry was read, leave right away
            // and do NOT mark the process as ended.
            if ($this->getDisabled()) {
                return ($result | self::CLI_STATUS_ABORTED);
            }

            if (!$this->processRepository->isProcessActive($this->CLI_buildProcessId())) {
                $this->CLI_debug("conflict / timeout (" . $this->CLI_buildProcessId() . ")");

                //TODO might need an additional returncode
                $result |= self::CLI_STATUS_ABORTED;
                break; //possible timeout
            }
        }

        sleep($sleepAfterFinish);

        $this->CLI_debug('Rows: ' . $counter . " (" . $this->CLI_buildProcessId() . ")");

        if ($counter > 0) {
            $result |= self::CLI_STATUS_PROCESSED;
        }

        return $result;
    }
1729
1730
    /**
     * Instantiates every class registered under
     * $GLOBALS['TYPO3_CONF_VARS']['EXTCONF']['crawler']['cli_hooks'] and calls
     * its crawler_init() method with this controller.
     *
     * @return void
     */
    public function CLI_runHooks(): void
    {
        $registeredHooks = $GLOBALS['TYPO3_CONF_VARS']['EXTCONF']['crawler']['cli_hooks'] ?? [];

        foreach ($registeredHooks as $hookReference) {
            $hook = GeneralUtility::makeInstance($hookReference);
            if (!is_object($hook)) {
                continue;
            }
            $hook->crawler_init($this);
        }
    }
1744
1745
    /**
     * Tries to register a new process with the given id.
     *
     * Active, non-deleted processes whose ttl has not passed yet are counted
     * against the configured process limit; a new process record is only
     * inserted while the limit is not reached. Processes with an expired ttl
     * are treated as orphans and handed to CLI_releaseProcesses() afterwards.
     *
     * @param string $id identification string for the process
     * @return boolean true if the process was registered, false if the limit is reached or no system pid is available
     * @todo preemption might not be the most elegant way to clean up
     */
    public function CLI_checkAndAcquireNewProcess($id)
    {
        $queryBuilder = GeneralUtility::makeInstance(ConnectionPool::class)->getQueryBuilderForTable($this->tableName);

        $systemProcessId = getmypid();
        if ($systemProcessId < 1) {
            return false;
        }

        $statement = $queryBuilder
            ->select('process_id', 'ttl')
            ->from('tx_crawler_process')
            ->where(
                'active = 1 AND deleted = 0'
            )
            ->execute();

        $currentTime = $this->getCurrentTime();

        // Split the active processes into still running ones and expired (orphaned) ones
        $runningProcesses = 0;
        $expiredProcessIds = [];
        while ($process = $statement->fetch()) {
            if ($process['ttl'] < $currentTime) {
                $expiredProcessIds[] = $process['process_id'];
                continue;
            }
            $runningProcesses++;
        }

        if ($runningProcesses >= (int)$this->extensionSettings['processLimit']) {
            $this->CLI_debug("Processlimit reached (" . ($runningProcesses) . "/" . (int)$this->extensionSettings['processLimit'] . ")");
            $acquired = false;
        } else {
            // there are less than allowed active processes, so add a new one
            $this->CLI_debug("add process " . $this->CLI_buildProcessId() . " (" . ($runningProcesses + 1) . "/" . (int)$this->extensionSettings['processLimit'] . ")");

            GeneralUtility::makeInstance(ConnectionPool::class)->getConnectionForTable('tx_crawler_process')->insert(
                'tx_crawler_process',
                [
                    'process_id' => $id,
                    'active' => 1,
                    'ttl' => $currentTime + (int)$this->extensionSettings['processMaxRunTime'],
                    'system_process_id' => $systemProcessId,
                ]
            );
            $acquired = true;
        }

        $this->processRepository->deleteProcessesMarkedAsDeleted();
        $this->CLI_releaseProcesses($expiredProcessIds);

        return $acquired;
    }
1806
1807
    /**
     * Releases processes and the queue entries assigned to them.
     *
     * Before the requested processes are marked as not active, two cleanup
     * statements run: queue entries that belong to inactive processes get
     * their process assignment removed, and inactive processes that are
     * referenced by queue entries with exec_time = 0 get their
     * system_process_id reset.
     *
     * @param mixed $releaseIds string with a single process-id or array with multiple process-ids
     * @return boolean false if there was nothing to release, true otherwise
     */
    public function CLI_releaseProcesses($releaseIds)
    {
        $queryBuilder = GeneralUtility::makeInstance(ConnectionPool::class)->getQueryBuilderForTable($this->tableName);

        $releaseIds = is_array($releaseIds) ? $releaseIds : [$releaseIds];
        if (empty($releaseIds)) {
            //nothing to release
            return false;
        }

        // some kind of 2nd chance algo - this way you need at least 2 processes to have a real cleanup
        // this ensures that a single process can't mess up the entire process table

        // detach all queue entries from processes which are not active
        $queryBuilder
            ->update($this->tableName, 'q')
            ->where(
                'q.process_id IN(SELECT p.process_id FROM tx_crawler_process as p WHERE p.active = 0)'
            )
            ->set('q.process_scheduled', 0)
            ->set('q.process_id', '')
            ->execute();

        // The builder is reused for the second statement, so the SET part of the first one is dropped here.
        // FIXME: Not entirely sure that this is equivalent to the previous version
        $queryBuilder->resetQueryPart('set');

        $queryBuilder
            ->update('tx_crawler_process')
            ->where(
                $queryBuilder->expr()->eq('active', 0),
                'process_id IN(SELECT q.process_id FROM tx_crawler_queue as q WHERE q.exec_time = 0)'
            )
            ->set('system_process_id', 0)
            ->execute();

        $this->processRepository->markRequestedProcessesAsNotActive($releaseIds);
        $this->queueRepository->unsetProcessScheduledAndProcessIdForQueueEntries($releaseIds);

        return true;
    }
1856
1857
    /**
     * Returns the id of the current process.
     *
     * The id is generated once from the current microtime and then kept in
     * $this->processID, so repeated calls return the same value.
     *
     * @return string  the ID
     */
    public function CLI_buildProcessId()
    {
        if ($this->processID) {
            return $this->processID;
        }

        $this->processID = GeneralUtility::shortMD5(microtime(true));

        return $this->processID;
    }
1869
1870
    /**
     * Echoes a message followed by a newline and flushes the output,
     * but only if the "processDebug" extension setting is enabled.
     *
     * @param string $msg the message
     */
    public function CLI_debug($msg): void
    {
        if (!(int)$this->extensionSettings['processDebug']) {
            return;
        }

        echo $msg . "\n";
        flush();
    }
1882
1883
    /**
     * Flushes entries that stayed in the queue for too long. These are:
     * - processed entries (exec_time set) older than the "cleanUpProcessedAge" setting
     * - entries scheduled longer ago than the "cleanUpScheduledAge" setting
     *
     * Both settings are multiplied by 86400, i.e. they are given in days.
     *
     * @return void
     */
    public function cleanUpOldQueueEntries(): void
    {
        $secondsPerDay = 86400; // 24*60*60 Seconds in 24 hours

        $processedAge = $this->extensionSettings['cleanUpProcessedAge'] * $secondsPerDay;
        $scheduledAge = $this->extensionSettings['cleanUpScheduledAge'] * $secondsPerDay;

        $now = time();
        $processedBefore = $now - $processedAge;
        $scheduledBefore = $now - $scheduledAge;

        $this->flushQueue(
            '(exec_time<>0 AND exec_time<' . $processedBefore . ') OR scheduled<=' . $scheduledBefore
        );
    }
1899
1900
    /**
     * Returns the md5 hash of the serialized configuration array.
     *
     * The keys "paramExpanded" and "URLs" are left out of the hash.
     *
     * @param array $configuration
     *
     * @return string
     */
    protected function getConfigurationHash(array $configuration)
    {
        $hashRelevantConfiguration = array_diff_key(
            $configuration,
            ['paramExpanded' => true, 'URLs' => true]
        );

        return md5(serialize($hashRelevantConfiguration));
    }
1913
1914
    /**
     * Build a URL from a Page and the Query String. If the page has a Site configuration, it can be built by using
     * the Site instance.
     *
     * With a Site the "id" parameter is dropped, "L" is handed to the router as "_language" and host, scheme,
     * port and user info of $alternativeBaseUrl (if given) replace those of the generated URL. Without a Site
     * an index.php URL with a cHash is built on top of the alternative base URL or TYPO3_SITE_URL.
     *
     * @param int $pageId
     * @param string $queryString
     * @param string|null $alternativeBaseUrl
     * @param int $httpsOrHttp see tx_crawler_configuration.force_ssl (-1 forces http, 1 forces https, anything else keeps the scheme)
     * @return UriInterface
     * @throws \TYPO3\CMS\Core\Exception\SiteNotFoundException
     * @throws \TYPO3\CMS\Core\Routing\InvalidRouteArgumentsException
     */
    protected function getUrlFromPageAndQueryParameters(int $pageId, string $queryString, ?string $alternativeBaseUrl, int $httpsOrHttp): UriInterface
    {
        $site = GeneralUtility::makeInstance(SiteMatcher::class)->matchByPageId($pageId);
        if ($site instanceof Site) {
            $queryString = ltrim($queryString, '?&');
            $queryParts = [];
            parse_str($queryString, $queryParts);
            unset($queryParts['id']);
            // workaround as long as we don't have native language support in crawler configurations
            // NOTE(review): the language is presumably resolved by the router from "_language" -
            // the site language object that used to be looked up here was never used.
            if (isset($queryParts['L'])) {
                $queryParts['_language'] = $queryParts['L'];
                unset($queryParts['L']);
            }
            $url = $site->getRouter()->generateUri($pageId, $queryParts);
            if (!empty($alternativeBaseUrl)) {
                $alternativeUri = new Uri($alternativeBaseUrl);
                $url = $url->withHost($alternativeUri->getHost());
                $url = $url->withScheme($alternativeUri->getScheme());
                $url = $url->withPort($alternativeUri->getPort());
                if ($userInfo = $alternativeUri->getUserInfo()) {
                    $url = $url->withUserInfo($userInfo);
                }
            }
        } else {
            // Technically this is not possible with site handling, but kept for backwards-compatibility reasons
            // Once EXT:crawler is v10-only compatible, this should be removed completely
            $baseUrl = ($alternativeBaseUrl ?: GeneralUtility::getIndpEnv('TYPO3_SITE_URL'));
            $cacheHashCalculator = GeneralUtility::makeInstance(CacheHashCalculator::class);
            $queryString .= '&cHash=' . $cacheHashCalculator->generateForParameters($queryString);
            $url = rtrim($baseUrl, '/') . '/index.php' . $queryString;
            $url = new Uri($url);
        }

        if ($httpsOrHttp === -1) {
            $url = $url->withScheme('http');
        } elseif ($httpsOrHttp === 1) {
            $url = $url->withScheme('https');
        }

        return $url;
    }
1970
1971 1
    /**
     * Makes sure that $reg[1] is not larger than $reg[2] by exchanging the two
     * values if necessary. All other elements are returned untouched.
     *
     * @param array $reg Array with the values to compare at index 1 and 2
     * @return array The array with index 1 and 2 in ascending order
     */
    protected function swapIfFirstIsLargerThanSecond(array $reg): array
    {
        if ($reg[1] > $reg[2]) {
            [$reg[1], $reg[2]] = [$reg[2], $reg[1]];
        }

        return $reg;
    }
1982
}
1983