Passed
Push — Cleanup/misc ( 4cc982 )
by Tomas Norre
06:26
created

CrawlerController::getUrlsForPageId()   C

Complexity

Conditions 16
Paths 96

Size

Total Lines 93
Code Lines 51

Duplication

Lines 0
Ratio 0 %

Code Coverage

Tests 46
CRAP Score 16.0585

Importance

Changes 3
Bugs 0 Features 0
Metric Value
cc 16
eloc 51
c 3
b 0
f 0
nc 96
nop 1
dl 0
loc 93
ccs 46
cts 49
cp 0.9388
crap 16.0585
rs 5.5666

How to fix   Long Method    Complexity   

Long Method

Small methods make your code easier to understand, in particular if combined with a good name. Besides, if your method is small, finding a good name is usually much easier.

For example, if you find yourself adding comments to a method's body, this is usually a good sign to extract the commented part to a new method, and use the comment as a starting point when coming up with a good name for this new method.

Commonly applied refactorings include:

1
<?php
2
3
declare(strict_types=1);
4
5
namespace AOE\Crawler\Controller;
6
7
/***************************************************************
8
 *  Copyright notice
9
 *
10
 *  (c) 2020 AOE GmbH <[email protected]>
11
 *
12
 *  All rights reserved
13
 *
14
 *  This script is part of the TYPO3 project. The TYPO3 project is
15
 *  free software; you can redistribute it and/or modify
16
 *  it under the terms of the GNU General Public License as published by
17
 *  the Free Software Foundation; either version 3 of the License, or
18
 *  (at your option) any later version.
19
 *
20
 *  The GNU General Public License can be found at
21
 *  http://www.gnu.org/copyleft/gpl.html.
22
 *
23
 *  This script is distributed in the hope that it will be useful,
24
 *  but WITHOUT ANY WARRANTY; without even the implied warranty of
25
 *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
26
 *  GNU General Public License for more details.
27
 *
28
 *  This copyright notice MUST APPEAR in all copies of the script!
29
 ***************************************************************/
30
31
use AOE\Crawler\Configuration\ExtensionConfigurationProvider;
32
use AOE\Crawler\Converter\JsonCompatibilityConverter;
33
use AOE\Crawler\Domain\Repository\ConfigurationRepository;
34
use AOE\Crawler\Domain\Repository\ProcessRepository;
35
use AOE\Crawler\Domain\Repository\QueueRepository;
36
use AOE\Crawler\QueueExecutor;
37
use AOE\Crawler\Utility\HookUtility;
38
use AOE\Crawler\Utility\SignalSlotUtility;
39
use Psr\Http\Message\UriInterface;
40
use Psr\Log\LoggerAwareInterface;
41
use Psr\Log\LoggerAwareTrait;
42
use TYPO3\CMS\Backend\Tree\View\PageTreeView;
43
use TYPO3\CMS\Backend\Utility\BackendUtility;
44
use TYPO3\CMS\Core\Authentication\BackendUserAuthentication;
45
use TYPO3\CMS\Core\Compatibility\PublicMethodDeprecationTrait;
46
use TYPO3\CMS\Core\Core\Bootstrap;
47
use TYPO3\CMS\Core\Core\Environment;
48
use TYPO3\CMS\Core\Database\Connection;
49
use TYPO3\CMS\Core\Database\ConnectionPool;
50
use TYPO3\CMS\Core\Database\Query\Restriction\DeletedRestriction;
51
use TYPO3\CMS\Core\Http\Uri;
52
use TYPO3\CMS\Core\Imaging\Icon;
53
use TYPO3\CMS\Core\Imaging\IconFactory;
54
use TYPO3\CMS\Core\Routing\SiteMatcher;
55
use TYPO3\CMS\Core\Site\Entity\Site;
56
use TYPO3\CMS\Core\Type\Bitmask\Permission;
57
use TYPO3\CMS\Core\TypoScript\Parser\TypoScriptParser;
58
use TYPO3\CMS\Core\Utility\DebugUtility;
59
use TYPO3\CMS\Core\Utility\GeneralUtility;
60
use TYPO3\CMS\Core\Utility\MathUtility;
61
use TYPO3\CMS\Extbase\Object\ObjectManager;
62
use TYPO3\CMS\Frontend\Page\CacheHashCalculator;
63
use TYPO3\CMS\Frontend\Page\PageRepository;
64
65
/**
66
 * Class CrawlerController
67
 *
68
 * @package AOE\Crawler\Controller
69
 */
70
class CrawlerController implements LoggerAwareInterface
71
{
72
    use LoggerAwareTrait;
73
    use PublicMethodDeprecationTrait;
74
75
    public const CLI_STATUS_NOTHING_PROCCESSED = 0;
76
77
    public const CLI_STATUS_REMAIN = 1; //queue not empty
78
79
    public const CLI_STATUS_PROCESSED = 2; //(some) queue items were processed
80
81
    public const CLI_STATUS_ABORTED = 4; //instance didn't finish
82
83
    public const CLI_STATUS_POLLABLE_PROCESSED = 8;
84
85
    /**
86
     * @var integer
87
     */
88
    public $setID = 0;
89
90
    /**
91
     * @var string
92
     */
93
    public $processID = '';
94
95
    /**
96
     * @var array
97
     */
98
    public $duplicateTrack = [];
99
100
    /**
101
     * @var array
102
     */
103
    public $downloadUrls = [];
104
105
    /**
106
     * @var array
107
     */
108
    public $incomingProcInstructions = [];
109
110
    /**
111
     * @var array
112
     */
113
    public $incomingConfigurationSelection = [];
114
115
    /**
116
     * @var bool
117
     */
118
    public $registerQueueEntriesInternallyOnly = false;
119
120
    /**
121
     * @var array
122
     */
123
    public $queueEntries = [];
124
125
    /**
126
     * @var array
127
     */
128
    public $urlList = [];
129
130
    /**
131
     * @var array
132
     */
133
    public $extensionSettings = [];
134
135
    /**
136
     * Mount Point
137
     *
138
     * @var bool
139
     * Todo: Check what this is used for and adjust the type hint or code, as bool doesn't match the current code.
140
     */
141
    public $MP = false;
142
143
    /**
144
     * @var string
145
     */
146
    protected $processFilename;
147
148
    /**
149
     * Holds the internal access mode can be 'gui','cli' or 'cli_im'
150
     *
151
     * @var string
152
     */
153
    protected $accessMode;
154
155
    /**
156
     * @var QueueRepository
157
     */
158
    protected $queueRepository;
159
160
    /**
161
     * @var ProcessRepository
162
     */
163
    protected $processRepository;
164
165
    /**
166
     * @var ConfigurationRepository
167
     */
168
    protected $configurationRepository;
169
170
    /**
171
     * @var string
172
     */
173
    protected $tableName = 'tx_crawler_queue';
174
175
    /**
176
     * @var QueueExecutor
177
     */
178
    protected $queueExecutor;
179
180
    /**
181
     * @var int
182
     */
183
    protected $maximumUrlsToCompile = 10000;
184
185
    /**
186
     * @var IconFactory
187
     */
188
    protected $iconFactory;
189
190
    /**
191
     * @var string[]
192
     */
193
    private $deprecatedPublicMethods = [
0 ignored issues
show
introduced by
The private property $deprecatedPublicMethods is not used, and could be removed.
Loading history...
194
        'cleanUpOldQueueEntries' => 'Using CrawlerController::cleanUpOldQueueEntries() is deprecated since 9.0.1 and will be removed in v11.x, please use QueueRepository->cleanUpOldQueueEntries() instead.',
195
        'CLI_runHooks' => 'Using CrawlerController::CLI_runHooks() is deprecated since 9.0.1 and will be removed in v11.x',
196
        'flushQueue' => 'Using CrawlerController::flushQueue() is deprecated since 9.0.1 and will be removed in v11.x, please use QueueRepository->flushQueue() instead.',
197
        'getLogEntriesForSetId' => 'Using crawlerController::getLogEntriesForSetId() is deprecated since 9.0.1 and will be removed in v11.x',
198
    ];
199
200
    /**
201
     * @var BackendUserAuthentication|null
202
     */
203
    private $backendUser;
204
205
    /**
206
     * @var integer
207
     */
208
    private $scheduledTime = 0;
209
210
    /**
211
     * @var integer
212
     */
213
    private $reqMinute = 0;
214
215
    /**
216
     * @var bool
217
     */
218
    private $submitCrawlUrls = false;
219
220
    /**
221
     * @var bool
222
     */
223
    private $downloadCrawlUrls = false;
224
225
    /************************************
226
     *
227
     * Getting URLs based on Page TSconfig
228
     *
229
     ************************************/
230
231 43
    /**
     * Resolves the repositories, queue executor and icon factory, sets the
     * default process lock file and loads the extension settings.
     *
     * After loading, three settings are normalised:
     * - countInARun falls back to 100 when it is missing or not a positive integer
     * - processLimit is forced into the range 1..99 (default 1)
     * - maxCompileUrls is forced into the range 1..1000000000 (default 10000)
     */
    public function __construct()
    {
        $objectManager = GeneralUtility::makeInstance(ObjectManager::class);
        $this->queueRepository = $objectManager->get(QueueRepository::class);
        $this->processRepository = $objectManager->get(ProcessRepository::class);
        $this->configurationRepository = $objectManager->get(ConfigurationRepository::class);
        $this->queueExecutor = $objectManager->get(QueueExecutor::class);
        $this->iconFactory = GeneralUtility::makeInstance(IconFactory::class);

        $this->processFilename = Environment::getVarPath() . '/lock/tx_crawler.proc';

        /** @var ExtensionConfigurationProvider $configurationProvider */
        $configurationProvider = GeneralUtility::makeInstance(ExtensionConfigurationProvider::class);
        $settings = $configurationProvider->getExtensionConfiguration();
        $this->extensionSettings = is_array($settings) ? $settings : [];

        // Set defaults. Every key is read through "??" so that a missing
        // setting falls back to its default instead of raising an
        // undefined-index notice.
        if (MathUtility::convertToPositiveInteger($this->extensionSettings['countInARun'] ?? 0) === 0) {
            $this->extensionSettings['countInARun'] = 100;
        }

        $this->extensionSettings['processLimit'] = MathUtility::forceIntegerInRange(
            $this->extensionSettings['processLimit'] ?? 0,
            1,
            99,
            1
        );
        $this->maximumUrlsToCompile = MathUtility::forceIntegerInRange(
            $this->extensionSettings['maxCompileUrls'] ?? 0,
            1,
            1000000000,
            10000
        );
    }
255
256
    /**
     * Returns the current access mode.
     *
     * According to the property documentation this is one of
     * 'gui', 'cli' or 'cli_im'.
     *
     * @return string
     */
    public function getAccessMode()
    {
        return $this->accessMode;
    }
265
266
    /**
     * Stores the access mode that getAccessMode() hands back.
     *
     * @param string $accessMode New access mode; the value is stored as given, without validation
     */
    public function setAccessMode($accessMode): void
    {
        $this->accessMode = $accessMode;
    }
273
274
    /**
     * Switches the disabled status, which is represented by the existence of
     * the process file (see getDisabled()).
     *
     * Disabling writes an empty process file; enabling removes the file when
     * it is present.
     *
     * @param bool $disabled (optional, defaults to true)
     */
    public function setDisabled($disabled = true): void
    {
        if ($disabled) {
            GeneralUtility::writeFile($this->processFilename, '');

            return;
        }

        // Enabling: drop the marker file, but only when there is one.
        if (is_file($this->processFilename)) {
            unlink($this->processFilename);
        }
    }
289
290
    /**
     * Reports the disabled status by checking whether the process file exists.
     *
     * @return bool true if disabled (the process file is present)
     */
    public function getDisabled()
    {
        $processFileExists = is_file($this->processFilename);

        return $processFileExists;
    }
299
300
    /**
     * Overrides the location of the process file that is used by
     * setDisabled() and getDisabled().
     *
     * @param string $filenameWithPath Path of the process file, including the file name
     */
    public function setProcessFilename($filenameWithPath): void
    {
        $this->processFilename = $filenameWithPath;
    }
307
308
    /**
     * Returns the path of the process file currently in use.
     *
     * @return string
     */
    public function getProcessFilename()
    {
        return $this->processFilename;
    }
315
316
    /**
     * Sets the extension settings (unserialized counterpart of $TYPO3_CONF_VARS['EXT']['extConf']['crawler']).
     *
     * The given array replaces the current settings as a whole; the defaults
     * applied in the constructor are not re-applied here.
     */
    public function setExtensionSettings(array $extensionSettings): void
    {
        $this->extensionSettings = $extensionSettings;
    }
323
324
    /**
     * Check if the given page should be crawled.
     *
     * The checks run in this order and the first one that matches decides:
     * 1. the page is hidden and the 'crawlHiddenPages' setting is not enabled
     * 2. the doktype is 3, 4 or >= 199
     * 3. the doktype is listed in an EXTCONF 'excludeDoktype' entry
     * 4. an EXTCONF 'pageVeto' hook returns anything other than false
     *
     * @param array $pageRow Page record; 'doktype' is read, 'hidden' is optional
     * @return false|string false if the page should be crawled (not excluded), otherwise the skip message
     */
    public function checkIfPageShouldBeSkipped(array $pageRow)
    {
        // if page is hidden
        if (! ($this->extensionSettings['crawlHiddenPages'] ?? false) && ($pageRow['hidden'] ?? false)) {
            return 'Because page is hidden';
        }

        if (GeneralUtility::inList('3,4', $pageRow['doktype']) || $pageRow['doktype'] >= 199) {
            return 'Because doktype is not allowed';
        }

        foreach ($GLOBALS['TYPO3_CONF_VARS']['EXTCONF']['crawler']['excludeDoktype'] ?? [] as $key => $doktypeList) {
            if (GeneralUtility::inList($doktypeList, $pageRow['doktype'])) {
                return 'Doktype was excluded by "' . $key . '"';
            }
        }

        // veto hook
        foreach ($GLOBALS['TYPO3_CONF_VARS']['EXTCONF']['crawler']['pageVeto'] ?? [] as $key => $func) {
            $params = [
                'pageRow' => $pageRow,
            ];
            // expects "false" if page is ok and "true" or a skipMessage if this page should _not_ be crawled
            $veto = GeneralUtility::callUserFunction($func, $params, $this);
            if ($veto === false) {
                continue;
            }

            // No need to execute other hooks once one of them returned a veto.
            if (is_string($veto)) {
                return $veto;
            }

            // Hooks registered with "[] =" have integer keys; under strict_types
            // htmlspecialchars() rejects an int, so cast the key first.
            return 'Veto from hook "' . htmlspecialchars((string) $key) . '"';
        }

        return false;
    }
378
379
    /**
     * Wrapper method for getUrlsForPageId()
     * It returns an array of configurations and no urls!
     *
     * The page is first passed to checkIfPageShouldBeSkipped(); when it is
     * skipped, no configurations are looked up.
     *
     * @param array $pageRow Page record with at least dok-type and uid columns.
     * @param string $skipMessage Filled by reference: the skip reason, or an empty string when the page is not skipped
     * @return array Configurations for the page, or an empty array when the page is skipped
     * @see getUrlsForPageId()
     */
    public function getUrlsForPageRow(array $pageRow, &$skipMessage = '')
    {
        $message = $this->checkIfPageShouldBeSkipped($pageRow);

        if ($message !== false) {
            $skipMessage = $message;

            return [];
        }

        // getUrlsForPageId() is declared with an int parameter and this file
        // uses strict_types, so a uid that arrives as a numeric string would
        // raise a TypeError without the cast.
        $res = $this->getUrlsForPageId((int) $pageRow['uid']);
        $skipMessage = '';

        return $res;
    }
402
403
    /**
     * Creates a list of URLs from input array (and submits them to queue if asked for)
     * See Web > Info module script + "indexed_search"'s crawler hook-client using this!
     *
     * @param array $vv Information about URLs from pageRow to crawl.
     * @param array $pageRow Page row
     * @param int $scheduledTime Unix time to schedule indexing to, typically time()
     * @param int $reqMinute Number of requests per minute (creates the interleave between requests); values <= 0 disable the interleave
     * @param bool $submitCrawlUrls If set, submits the URLs to queue
     * @param bool $downloadCrawlUrls If set (and submitcrawlUrls is false) will fill $downloadUrls with entries)
     * @param array $duplicateTrack Array which is passed by reference and contains an id per url to make sure duplicates are not crawled
     * @param array $downloadUrls Array which will be filled with URLS for download if flag is set.
     * @param array $incomingProcInstructions Array of processing instructions
     * @return string List of URLs (meant for display in backend module)
     */
    public function urlListFromUrlArray(
        array $vv,
        array $pageRow,
        $scheduledTime,
        $reqMinute,
        $submitCrawlUrls,
        $downloadCrawlUrls,
        array &$duplicateTrack,
        array &$downloadUrls,
        array $incomingProcInstructions
    ) {
        if (! is_array($vv['URLs'] ?? null)) {
            return 'ERROR - no URL generated';
        }

        $subCfg = $vv['subCfg'] ?? [];
        // Not every configuration carries these keys; default them so that
        // building the uniqueness key below does not raise notices.
        $procInstrFilter = $subCfg['procInstrFilter'] ?? '';
        $userGroups = $subCfg['userGroups'] ?? '';

        $urlLog = [];
        $pageId = (int) $pageRow['uid'];
        $configurationHash = $this->getConfigurationHash($vv);
        $skipInnerCheck = $this->queueRepository->noUnprocessedQueueEntriesForPageWithConfigurationHashExist($pageId, $configurationHash);

        // The filter depends only on the configuration, not on the single URL,
        // so it is evaluated once; a failing filter yields an empty list.
        if (! $this->drawURLs_PIfilter($procInstrFilter, $incomingProcInstructions)) {
            return '';
        }

        // Seconds between two requests. Guard against a division by zero when
        // no (or a non-positive) requests-per-minute value was given.
        $secondsPerRequest = $reqMinute > 0 ? 60 / $reqMinute : 0;

        foreach ($vv['URLs'] as $urlQuery) {
            $url = (string) $this->getUrlFromPageAndQueryParameters(
                $pageId,
                $urlQuery,
                $subCfg['baseUrl'] ?? null,
                $subCfg['force_ssl'] ?? 0
            );

            // Create key by which to determine unique-ness:
            $uKey = $url . '|' . $userGroups . '|' . $procInstrFilter;

            if (isset($duplicateTrack[$uKey])) {
                // If the url key is registered, just display it and do not resubmit it
                $urlLog[] = '<em><span class="text-muted">' . htmlspecialchars($url) . '</span></em>';
            } else {
                // Scheduled time, rounded down to the full minute:
                $schTime = $scheduledTime + round(count($duplicateTrack) * $secondsPerRequest);
                $schTime = (int) ($schTime / 60) * 60;
                $formattedDate = BackendUtility::datetime($schTime);
                $this->urlList[] = '[' . $formattedDate . '] ' . $url;
                $urlList = '[' . $formattedDate . '] ' . htmlspecialchars($url);

                // Submit for crawling!
                // NOTE(review): the queue entry is added with $scheduledTime while
                // the displayed time is the interleaved $schTime - confirm this is intended.
                if ($submitCrawlUrls) {
                    $added = $this->addUrl(
                        $pageId,
                        $url,
                        $subCfg,
                        $scheduledTime,
                        $configurationHash,
                        $skipInnerCheck
                    );
                    if ($added === false) {
                        $urlList .= ' (URL already existed)';
                    }
                } elseif ($downloadCrawlUrls) {
                    $downloadUrls[$url] = $url;
                }
                $urlLog[] = $urlList;
            }
            $duplicateTrack[$uKey] = true;
        }

        return implode('<br>', $urlLog);
    }
485
486
    /**
     * Tells whether a configuration passes the processing instruction filter.
     *
     * Without any incoming processing instructions everything passes.
     * Otherwise at least one incoming instruction has to be contained in the
     * comma-separated list $piString.
     *
     * @param string $piString PI to test
     * @param array $incomingProcInstructions Processing instructions
     * @return boolean
     */
    public function drawURLs_PIfilter($piString, array $incomingProcInstructions)
    {
        // An empty instruction set means "no filtering"; the loop below is
        // then skipped and the initial value is returned.
        $passesFilter = empty($incomingProcInstructions);

        foreach ($incomingProcInstructions as $instruction) {
            if (GeneralUtility::inList($piString, $instruction)) {
                $passesFilter = true;
                break;
            }
        }

        return $passesFilter;
    }
506
507 5
    /**
     * Returns the page TSconfig for a page and lets the registered
     * 'getPageTSconfigForId' hooks alter it.
     *
     * When $this->MP is set, the TSconfig is read for the page id found after
     * the first '-' of the MP value instead of for $id.
     *
     * @param int $id Page uid
     * @return array Page TSconfig, possibly modified by hooks
     */
    public function getPageTSconfigForId($id): array
    {
        if (! $this->MP) {
            $pageTSconfig = BackendUtility::getPagesTSconfig($id);
        } else {
            // $this->MP is documented as bool but is treated as a string of the
            // form "<a>-<mountPointId>" here. The string cast keeps explode()
            // from throwing under strict_types, and a value without '-' falls
            // back to page id 0 instead of reading an undefined offset.
            $mountPointParts = explode('-', (string) $this->MP);
            $mountPointId = (int) ($mountPointParts[1] ?? 0);
            $pageTSconfig = BackendUtility::getPagesTSconfig($mountPointId);
        }

        // Call a hook to alter configuration. The lookup uses "??" like the
        // other hook lookups in this class, so an unregistered hook does not
        // raise an undefined-index notice.
        $hooks = $GLOBALS['TYPO3_CONF_VARS']['EXTCONF']['crawler']['getPageTSconfigForId'] ?? null;
        if (is_array($hooks)) {
            $params = [
                'pageId' => $id,
                // passed by reference so that hooks can modify the configuration
                'pageTSConfig' => &$pageTSconfig,
            ];
            foreach ($hooks as $userFunc) {
                GeneralUtility::callUserFunction($userFunc, $params, $this);
            }
        }

        return $pageTSconfig;
    }
529
530
    /**
     * This method returns an array of configurations.
     * Adds no urls!
     *
     * Configurations are collected from two sources, in this order:
     * 1. page TSconfig (tx_crawler.crawlerCfg.paramSets) of the page
     * 2. the records returned by
     *    ConfigurationRepository::getCrawlerConfigurationRecordsFromRootLine();
     *    a record never overwrites a paramSet of the same name
     *
     * Finally every EXTCONF 'processUrls' hook receives the result by reference.
     *
     * @param int $pageId Page uid to collect the configurations for
     * @return array Configurations keyed by name, each with the entries
     *               'subCfg', 'paramParsed', 'paramExpanded', 'URLs' and 'origin'
     */
    public function getUrlsForPageId(int $pageId): array
    {
        // Get page TSconfig for page ID
        $pageTSconfig = $this->getPageTSconfigForId($pageId);

        $res = [];

        // Fetch Crawler Configuration from pageTSconfig
        $crawlerCfg = $pageTSconfig['tx_crawler.']['crawlerCfg.']['paramSets.'] ?? [];
        foreach ($crawlerCfg as $key => $values) {
            if (! is_array($values)) {
                continue;
            }
            $key = str_replace('.', '', (string) $key);
            // Sub configuration for a single configuration string:
            $subCfg = (array) $crawlerCfg[$key . '.'];
            $subCfg['key'] = $key;

            // Normalise the filter list. The value is cast first because a
            // missing option would otherwise hand null to a string function,
            // which is a TypeError under strict_types.
            $procInstrFilter = (string) ($subCfg['procInstrFilter'] ?? '');
            if ($procInstrFilter !== '') {
                $subCfg['procInstrFilter'] = implode(',', GeneralUtility::trimExplode(',', $procInstrFilter));
            }

            // process configuration if it is not page-specific or if the specific page is the current page:
            // TODO: Check if $pidOnlyList can be kept as Array instead of imploded
            $pidsOnly = (string) ($subCfg['pidsOnly'] ?? '');
            $pidOnlyList = implode(',', GeneralUtility::trimExplode(',', $pidsOnly, true));
            if ($pidsOnly !== '' && ! GeneralUtility::inList($pidOnlyList, (string) $pageId)) {
                continue;
            }

            // Explode, process etc.:
            $paramParsed = GeneralUtility::explodeUrl2Array($crawlerCfg[$key] ?? '');
            $paramExpanded = $this->expandParameters($paramParsed, $pageId);

            // recognize MP value
            $baseQuery = '?id=' . $pageId;
            if ($this->MP) {
                $baseQuery .= '&MP=' . $this->MP;
            }

            $res[$key] = [
                'subCfg' => $subCfg,
                'paramParsed' => $paramParsed,
                'paramExpanded' => $paramExpanded,
                'origin' => 'pagets',
                'URLs' => $this->compileUrls($paramExpanded, [$baseQuery]),
            ];
        }

        // Get configuration from tx_crawler_configuration records up the rootline
        $crawlerConfigurations = $this->configurationRepository->getCrawlerConfigurationRecordsFromRootLine($pageId);
        foreach ($crawlerConfigurations as $configurationRecord) {
            // check access to the configuration record
            $hasAccess = empty($configurationRecord['begroups'])
                || $this->getBackendUser()->isAdmin()
                || $this->hasGroupAccess(
                    $this->getBackendUser()->user['usergroup_cached_list'],
                    $configurationRecord['begroups']
                );
            if (! $hasAccess) {
                continue;
            }

            // process configuration if it is not page-specific or if the specific page is the current page:
            // TODO: Check if $pidOnlyList can be kept as Array instead of imploded
            $pidsOnly = (string) ($configurationRecord['pidsonly'] ?? '');
            $pidOnlyList = implode(',', GeneralUtility::trimExplode(',', $pidsOnly, true));
            if ($pidsOnly !== '' && ! GeneralUtility::inList($pidOnlyList, (string) $pageId)) {
                continue;
            }

            // don't overwrite previously defined paramSets
            $key = $configurationRecord['name'];
            if (isset($res[$key])) {
                continue;
            }

            /* @var $TSparserObject \TYPO3\CMS\Core\TypoScript\Parser\TypoScriptParser */
            $TSparserObject = GeneralUtility::makeInstance(TypoScriptParser::class);
            $TSparserObject->parse($configurationRecord['processing_instruction_parameters_ts']);

            $subCfg = [
                'procInstrFilter' => $configurationRecord['processing_instruction_filter'],
                'procInstrParams.' => $TSparserObject->setup,
                'baseUrl' => $configurationRecord['base_url'],
                'force_ssl' => (int) $configurationRecord['force_ssl'],
                'userGroups' => $configurationRecord['fegroups'],
                'exclude' => $configurationRecord['exclude'],
                'key' => $key,
            ];

            // Skip the record when the current page is on its exclude list.
            if (in_array($pageId, $this->expandExcludeString($subCfg['exclude']), true)) {
                continue;
            }

            $paramParsed = GeneralUtility::explodeUrl2Array($configurationRecord['configuration']);
            $paramExpanded = $this->expandParameters($paramParsed, $pageId);
            $res[$key] = [
                'subCfg' => $subCfg,
                'paramParsed' => $paramParsed,
                'paramExpanded' => $paramExpanded,
                'URLs' => $this->compileUrls($paramExpanded, ['?id=' . $pageId]),
                'origin' => 'tx_crawler_configuration_' . $configurationRecord['uid'],
            ];
        }

        foreach ($GLOBALS['TYPO3_CONF_VARS']['EXTCONF']['crawler']['processUrls'] ?? [] as $func) {
            $params = [
                // passed by reference so that hooks can modify the result
                'res' => &$res,
            ];
            GeneralUtility::callUserFunction($func, $params, $this);
        }

        return $res;
    }
628
629
    /**
     * Find all configurations of subpages of a page
     *
     * The result holds the names of the page TSconfig paramSets of the root
     * page followed by the names of all tx_crawler_configuration records that
     * are stored on a page of the root line or of the page tree below the
     * root page (down to the given depth).
     *
     * TODO: Write Functional Tests
     *
     * @param int $rootid Uid of the page the branch starts at
     * @param int $depth Number of tree levels passed to PageTreeView::getTree()
     * @return array List of configuration names
     */
    public function getConfigurationsForBranch(int $rootid, int $depth): array
    {
        $configurationNames = [];

        // Names of the paramSets defined in page TSconfig; only the array
        // entries ("name.") are used and one trailing dot is stripped.
        $tsConfig = $this->getPageTSconfigForId($rootid);
        $paramSets = $tsConfig['tx_crawler.']['crawlerCfg.']['paramSets.'] ?? [];
        foreach ($paramSets as $setName => $setConfiguration) {
            if (is_array($setConfiguration)) {
                if (substr($setName, -1) === '.') {
                    $configurationNames[] = substr($setName, 0, -1);
                } else {
                    $configurationNames[] = $setName;
                }
            }
        }

        // Page ids of the root line ...
        $pageIds = [];
        $rootLine = BackendUtility::BEgetRootLine($rootid);
        foreach ($rootLine as $rootLineEntry) {
            $pageIds[] = $rootLineEntry['uid'];
        }

        // ... and of the pages of the tree below the root page.
        /* @var PageTreeView $pageTree */
        $pageTree = GeneralUtility::makeInstance(PageTreeView::class);
        $permissionClause = $this->getBackendUser()->getPagePermsClause(Permission::PAGE_SHOW);
        $pageTree->init(empty($permissionClause) ? '' : ('AND ' . $permissionClause));
        $pageTree->getTree($rootid, $depth, '');
        foreach ($pageTree->tree as $treeEntry) {
            $pageIds[] = $treeEntry['row']['uid'];
        }

        // Names of the configuration records stored on any of these pages.
        $queryBuilder = $this->getQueryBuilder('tx_crawler_configuration');
        $pidConstraint = $queryBuilder->expr()->in(
            'pid',
            $queryBuilder->createNamedParameter($pageIds, Connection::PARAM_INT_ARRAY)
        );
        $statement = $queryBuilder
            ->select('name')
            ->from('tx_crawler_configuration')
            ->where($pidConstraint)
            ->execute();

        while ($record = $statement->fetch()) {
            $configurationNames[] = $record['name'];
        }

        return $configurationNames;
    }
672
673
    /**
     * Check if a user has access to an item
     * (e.g. get the group list of the current logged in user from $GLOBALS['TSFE']->gr_list)
     *
     * @param string $groupList Comma-separated list of (fe_)group UIDs from a user
     * @param string $accessList Comma-separated list of (fe_)group UIDs of the item to access
     * @return bool TRUE if at least one of the users group UIDs is in the access list or the access list is empty
     * @see \TYPO3\CMS\Frontend\Page\PageRepository::getMultipleGroupsWhereClause()
     */
    public function hasGroupAccess($groupList, $accessList)
    {
        // An item without access restriction is available to everybody.
        if (empty($accessList)) {
            return true;
        }

        $accessGranted = false;
        $userGroupUids = GeneralUtility::intExplode(',', $groupList);
        foreach ($userGroupUids as $userGroupUid) {
            if (GeneralUtility::inList($accessList, $userGroupUid)) {
                // One matching group is sufficient.
                $accessGranted = true;
                break;
            }
        }

        return $accessGranted;
    }
694
695
    /**
696
     * Will expand the parameters configuration to individual values. This follows a certain syntax of the value of each parameter.
697
     * Syntax of values:
698
     * - Basically: If the value is wrapped in [...] it will be expanded according to the following syntax, otherwise the value is taken literally
699
     * - Configuration is split by "|" and the parts are processed individually and finally added together
700
     * - For each configuration part:
701
     *         - "[int]-[int]" = Integer range, will be expanded to all values in between, values included, starting from low to high (max. 1000). Example "1-34" or "-40--30"
702
     *         - "_TABLE:[TCA table name];[_PID:[optional page id, default is current page]];[_ENABLELANG:1]" = Look up of table records from PID, filtering out deleted records. Example "_TABLE:tt_content; _PID:123"
703
     *        _ENABLELANG:1 picks only original records without their language overlays
704
     *         - Default: Literal value
705
     *
706
     * @param array $paramArray Array with key (GET var name) and values (value of GET var which is configuration for expansion)
707
     * @param integer $pid Current page ID
708
     * @return array
709
     *
710
     * TODO: Write Functional Tests
711
     */
712 11
    public function expandParameters($paramArray, $pid)
713
    {
714
        // Traverse parameter names:
715 11
        foreach ($paramArray as $p => $v) {
716 11
            $v = trim($v);
717
718
            // If value is encapsulated in square brackets it means there are some ranges of values to find, otherwise the value is literal
719 11
            if (strpos($v, '[') === 0 && substr($v, -1) === ']') {
720
                // So, find the value inside brackets and reset the paramArray value as an array.
721 11
                $v = substr($v, 1, -1);
722 11
                $paramArray[$p] = [];
723
724
                // Explode parts and traverse them:
725 11
                $parts = explode('|', $v);
726 11
                foreach ($parts as $pV) {
727
728
                    // Look for integer range: (fx. 1-34 or -40--30 // reads minus 40 to minus 30)
729 11
                    if (preg_match('/^(-?[0-9]+)\s*-\s*(-?[0-9]+)$/', trim($pV), $reg)) {
730 1
                        $reg = $this->swapIfFirstIsLargerThanSecond($reg);
731
732
                        // Traverse range, add values:
733 1
                        $runAwayBrake = 1000; // Limit to size of range!
734 1
                        for ($a = $reg[1]; $a <= $reg[2]; $a++) {
735 1
                            $paramArray[$p][] = $a;
736 1
                            $runAwayBrake--;
737 1
                            if ($runAwayBrake <= 0) {
738
                                break;
739
                            }
740
                        }
741 10
                    } elseif (strpos(trim($pV), '_TABLE:') === 0) {
742
743
                        // Parse parameters:
744 6
                        $subparts = GeneralUtility::trimExplode(';', $pV);
745 6
                        $subpartParams = [];
746 6
                        foreach ($subparts as $spV) {
747 6
                            [$pKey, $pVal] = GeneralUtility::trimExplode(':', $spV);
748 6
                            $subpartParams[$pKey] = $pVal;
749
                        }
750
751
                        // Table exists:
752 6
                        if (isset($GLOBALS['TCA'][$subpartParams['_TABLE']])) {
753 6
                            $lookUpPid = isset($subpartParams['_PID']) ? intval($subpartParams['_PID']) : intval($pid);
754 6
                            $recursiveDepth = isset($subpartParams['_RECURSIVE']) ? intval($subpartParams['_RECURSIVE']) : 0;
755 6
                            $pidField = isset($subpartParams['_PIDFIELD']) ? trim($subpartParams['_PIDFIELD']) : 'pid';
756 6
                            $where = $subpartParams['_WHERE'] ?? '';
757 6
                            $addTable = $subpartParams['_ADDTABLE'] ?? '';
758
759 6
                            $fieldName = $subpartParams['_FIELD'] ? $subpartParams['_FIELD'] : 'uid';
760 6
                            if ($fieldName === 'uid' || $GLOBALS['TCA'][$subpartParams['_TABLE']]['columns'][$fieldName]) {
761 6
                                $queryBuilder = $this->getQueryBuilder($subpartParams['_TABLE']);
762
763 6
                                if ($recursiveDepth > 0) {
764
                                    /** @var \TYPO3\CMS\Core\Database\QueryGenerator $queryGenerator */
765 2
                                    $queryGenerator = GeneralUtility::makeInstance(\TYPO3\CMS\Core\Database\QueryGenerator::class);
766 2
                                    $pidList = $queryGenerator->getTreeList($lookUpPid, $recursiveDepth, 0, 1);
767 2
                                    $pidArray = GeneralUtility::intExplode(',', $pidList);
768
                                } else {
769 4
                                    $pidArray = [(string) $lookUpPid];
770
                                }
771
772 6
                                $queryBuilder->getRestrictions()
773 6
                                    ->removeAll()
774 6
                                    ->add(GeneralUtility::makeInstance(DeletedRestriction::class));
775
776
                                $queryBuilder
777 6
                                    ->select($fieldName)
778 6
                                    ->from($subpartParams['_TABLE'])
779 6
                                    ->where(
780 6
                                        $queryBuilder->expr()->in($pidField, $queryBuilder->createNamedParameter($pidArray, Connection::PARAM_INT_ARRAY)),
781 6
                                        $where
782
                                    );
783
784 6
                                if (! empty($addTable)) {
785
                                    // TODO: Check if this works as intended!
786
                                    $queryBuilder->add('from', $addTable);
787
                                }
788 6
                                $transOrigPointerField = $GLOBALS['TCA'][$subpartParams['_TABLE']]['ctrl']['transOrigPointerField'];
789
790 6
                                if ($subpartParams['_ENABLELANG'] && $transOrigPointerField) {
791
                                    $queryBuilder->andWhere(
792
                                        $queryBuilder->expr()->lte(
793
                                            $transOrigPointerField,
794
                                            0
795
                                        )
796
                                    );
797
                                }
798
799 6
                                $statement = $queryBuilder->execute();
800
801 6
                                $rows = [];
802 6
                                while ($row = $statement->fetch()) {
803 6
                                    $rows[$row[$fieldName]] = $row;
804
                                }
805
806 6
                                if (is_array($rows)) {
807 6
                                    $paramArray[$p] = array_merge($paramArray[$p], array_keys($rows));
808
                                }
809
                            }
810
                        }
811
                    } else { // Just add value:
812 4
                        $paramArray[$p][] = $pV;
813
                    }
814
                    // Hook for processing own expandParameters place holder
815 11
                    if (is_array($GLOBALS['TYPO3_CONF_VARS']['SC_OPTIONS']['crawler/class.tx_crawler_lib.php']['expandParameters'])) {
816
                        $_params = [
817
                            'pObj' => &$this,
818
                            'paramArray' => &$paramArray,
819
                            'currentKey' => $p,
820
                            'currentValue' => $pV,
821
                            'pid' => $pid,
822
                        ];
823
                        foreach ($GLOBALS['TYPO3_CONF_VARS']['SC_OPTIONS']['crawler/class.tx_crawler_lib.php']['expandParameters'] as $_funcRef) {
824
                            GeneralUtility::callUserFunction($_funcRef, $_params, $this);
825
                        }
826
                    }
827
                }
828
829
                // Make unique set of values and sort array by key:
830 11
                $paramArray[$p] = array_unique($paramArray[$p]);
831 11
                ksort($paramArray);
832
            } else {
833
                // Set the literal value as only value in array:
834 4
                $paramArray[$p] = [$v];
835
            }
836
        }
837
838 11
        return $paramArray;
839
    }
840
841
    /**
     * Compiles URLs from a parameter array (output of expandParameters()).
     *
     * Every URL in $urls is combined with every value of the first parameter, then the method recurses with the
     * remaining parameters. The number of URLs is therefore the product of the number of values of each key,
     * capped per recursion level at $this->maximumUrlsToCompile. Empty values add nothing to the URL.
     *
     * @param array $paramArray Output of expandParameters(): Array with keys (GET var names) and for each an array of values
     * @param array $urls URLs accumulated in this array (for recursion)
     * @return array
     */
    public function compileUrls($paramArray, array $urls)
    {
        if (empty($paramArray)) {
            return $urls;
        }

        // Shift the first parameter (name and value set) off the stack:
        reset($paramArray);
        $varName = (string) key($paramArray);
        $valueSet = array_shift($paramArray);

        // Traverse value set:
        $newUrls = [];
        foreach ($urls as $url) {
            foreach ($valueSet as $val) {
                // Leave BOTH loops once the limit is reached; leaving only the inner loop would still add
                // one more URL for every remaining base URL.
                if (count($newUrls) >= $this->maximumUrlsToCompile) {
                    break 2;
                }

                $value = (string) $val;
                $newUrls[] = $value !== ''
                    ? $url . '&' . rawurlencode($varName) . '=' . rawurlencode($value)
                    : $url;
            }
        }

        return $this->compileUrls($paramArray, $newUrls);
    }
872
873
    /************************************
874
     *
875
     * Crawler log
876
     *
877
     ************************************/
878
879
    /**
     * Returns the records of the crawler queue for the given page ID, newest scheduled first.
     *
     * Note that a requested flush is delegated to the queue repository with the filter only (the page ID is not
     * passed on), and that the entries are queried after the flush has been performed.
     *
     * @param integer $id Page ID for which to look up log entries.
     * @param string $filter Filter: "all" => all entries, "pending" => all that is not yet run, "finished" => all complete ones
     * @param boolean $doFlush If TRUE, the queue is flushed (using $filter) before the entries are read
     * @param boolean $doFullFlush If TRUE (and $doFlush is TRUE), the queue is flushed with filter "all" instead of $filter
     * @param integer $itemsPerPage Limit the amount of entries per page default is 10; values <= 0 disable the limit
     * @return array
     */
    public function getLogEntriesForPageId($id, $filter = '', $doFlush = false, $doFullFlush = false, $itemsPerPage = 10)
    {
        $queryBuilder = GeneralUtility::makeInstance(ConnectionPool::class)->getQueryBuilderForTable($this->tableName);
        $queryBuilder
            ->select('*')
            ->from($this->tableName)
            ->where(
                $queryBuilder->expr()->eq('page_id', $queryBuilder->createNamedParameter($id, \PDO::PARAM_INT))
            )
            ->orderBy('scheduled', 'DESC');

        // Any other filter value (e.g. "all" or "") leaves the selection unrestricted.
        switch ($filter) {
            case 'pending':
                $queryBuilder->andWhere($queryBuilder->expr()->eq('exec_time', 0));
                break;
            case 'finished':
                $queryBuilder->andWhere($queryBuilder->expr()->gt('exec_time', 0));
                break;
        }

        if ($doFlush) {
            $this->queueRepository->flushQueue($doFullFlush ? 'all' : $filter);
        }

        if ($itemsPerPage > 0) {
            $queryBuilder->setMaxResults((int) $itemsPerPage);
        }

        return $queryBuilder->execute()->fetchAll();
    }
930
931
    /**
     * Returns the records of the crawler queue for the given set ID, newest scheduled first,
     * or flushes the matching entries instead.
     *
     * @param int $set_id Set ID for which to look up log entries.
     * @param string $filter Filter: "all" => all entries, "pending" => all that is not yet run, "finished" => all complete ones
     * @param bool $doFlush If TRUE, the entries are DELETED(!) instead of selected and an empty array is returned
     * @param bool $doFullFlush If TRUE (and $doFlush is TRUE), flushQueue() is called with an empty condition
     * @param int $itemsPerPage Limit the amount of entries per page default is 10; values <= 0 disable the limit
     * @return array
     *
     * @deprecated
     */
    public function getLogEntriesForSetId(int $set_id, string $filter = '', bool $doFlush = false, bool $doFullFlush = false, int $itemsPerPage = 10)
    {
        if ($doFlush) {
            $flushWhere = '';
            if (! $doFullFlush) {
                // Build "<filter condition> AND set_id = <set id>" for the flush.
                $expressionBuilder = GeneralUtility::makeInstance(ConnectionPool::class)
                    ->getConnectionForTable($this->tableName)
                    ->getExpressionBuilder();
                $flushWhere = $expressionBuilder->andX();
                switch ($filter) {
                    case 'pending':
                        $flushWhere->add($expressionBuilder->eq('exec_time', 0));
                        break;
                    case 'finished':
                        $flushWhere->add($expressionBuilder->gt('exec_time', 0));
                        break;
                }
                $flushWhere->add($expressionBuilder->eq('set_id', $set_id));
            }

            $this->flushQueue($flushWhere);
            return [];
        }

        $queryBuilder = GeneralUtility::makeInstance(ConnectionPool::class)->getQueryBuilderForTable($this->tableName);
        $queryBuilder
            ->select('*')
            ->from($this->tableName)
            ->where(
                $queryBuilder->expr()->eq('set_id', $queryBuilder->createNamedParameter($set_id, \PDO::PARAM_INT))
            )
            ->orderBy('scheduled', 'DESC');

        // Any other filter value (e.g. "all" or "") leaves the selection unrestricted.
        switch ($filter) {
            case 'pending':
                $queryBuilder->andWhere($queryBuilder->expr()->eq('exec_time', 0));
                break;
            case 'finished':
                $queryBuilder->andWhere($queryBuilder->expr()->gt('exec_time', 0));
                break;
        }

        if ($itemsPerPage > 0) {
            $queryBuilder->setMaxResults($itemsPerPage);
        }

        return $queryBuilder->execute()->fetchAll();
    }
983
984
    /**
     * Adds a call back entry to the queue (typically called from hooks, see indexed search class "class.crawler.php").
     *
     * The call back reference is stored under the "_CALLBACKOBJ" key of the JSON encoded parameters.
     *
     * @param integer $setId Set ID
     * @param array $params Parameters to pass to call back function; any non-array value is replaced by an empty array
     * @param string $callBack Call back object reference, eg. 'EXT:indexed_search/class.crawler.php:&tx_indexedsearch_crawler'
     * @param integer $page_id Page ID to attach it to
     * @param integer $schedule Time at which to activate; 0 (or any empty value) means "now"
     */
    public function addQueueEntry_callBack($setId, $params, $callBack, $page_id = 0, $schedule = 0): void
    {
        $callBackParameters = is_array($params) ? $params : [];
        $callBackParameters['_CALLBACKOBJ'] = $callBack;

        $connection = GeneralUtility::makeInstance(ConnectionPool::class)->getConnectionForTable('tx_crawler_queue');

        // Fall back to the current time when no schedule time was given.
        $scheduledTime = (int) $schedule;
        if (! $scheduledTime) {
            $scheduledTime = $this->getCurrentTime();
        }

        $queueRecord = [
            'page_id' => (int) $page_id,
            'parameters' => json_encode($callBackParameters),
            'scheduled' => $scheduledTime,
            'exec_time' => 0,
            'set_id' => (int) $setId,
            'result_data' => '',
        ];

        $connection->insert('tx_crawler_queue', $queueRecord);
    }
1013
1014
    /************************************
1015
     *
1016
     * URL setting
1017
     *
1018
     ************************************/
1019
1020
    /**
     * Sets a URL for crawling.
     *
     * When $this->registerQueueEntriesInternallyOnly is set, the entry is only appended to $this->queueEntries
     * and nothing is written to the database. Otherwise the entry is inserted into tx_crawler_queue unless the
     * duplication check finds an equal entry; a signal is emitted in both cases.
     *
     * @param integer $id Page ID
     * @param string $url Complete URL
     * @param array $subCfg Sub configuration array (from TS config); the keys "userGroups", "procInstrFilter",
     *                      "procInstrParams." and "key" are read and may be absent
     * @param integer $tstamp Scheduled-time
     * @param string $configurationHash (optional) configuration hash
     * @param bool $skipInnerDuplicationCheck (optional) skip inner duplication check
     * @return bool TRUE only if a new entry has been inserted into the database
     */
    public function addUrl(
        $id,
        $url,
        array $subCfg,
        $tstamp,
        $configurationHash = '',
        $skipInnerDuplicationCheck = false
    ) {
        $urlAdded = false;
        $rows = [];

        // Creating parameters:
        $parameters = [
            'url' => $url,
        ];

        // fe user group simulation:
        $uGs = implode(',', array_unique(GeneralUtility::intExplode(',', $subCfg['userGroups'] ?? '', true)));
        if ($uGs) {
            $parameters['feUserGroupList'] = $uGs;
        }

        // Setting processing instructions
        $parameters['procInstructions'] = GeneralUtility::trimExplode(',', $subCfg['procInstrFilter'] ?? '');
        if (is_array($subCfg['procInstrParams.'] ?? null)) {
            $parameters['procInstrParams'] = $subCfg['procInstrParams.'];
        }

        // Compile value array:
        $parameters_serialized = json_encode($parameters);
        $fieldArray = [
            'page_id' => (int) $id,
            'parameters' => $parameters_serialized,
            'parameters_hash' => GeneralUtility::shortMD5($parameters_serialized),
            'configuration_hash' => $configurationHash,
            'scheduled' => $tstamp,
            'exec_time' => 0,
            'set_id' => (int) $this->setID,
            'result_data' => '',
            'configuration' => $subCfg['key'] ?? null,
        ];

        if ($this->registerQueueEntriesInternallyOnly) {
            // The entries will only be registered and not stored to the database
            $this->queueEntries[] = $fieldArray;

            return $urlAdded;
        }

        if (! $skipInnerDuplicationCheck) {
            // Check if there is already an equal entry
            $rows = $this->getDuplicateRowsIfExist($tstamp, $fieldArray);
        }

        if (empty($rows)) {
            $connectionForCrawlerQueue = GeneralUtility::makeInstance(ConnectionPool::class)->getConnectionForTable('tx_crawler_queue');
            $connectionForCrawlerQueue->insert(
                'tx_crawler_queue',
                $fieldArray
            );
            $uid = $connectionForCrawlerQueue->lastInsertId('tx_crawler_queue', 'qid');
            $rows[] = $uid;
            $urlAdded = true;

            $signalPayload = ['uid' => $uid, 'fieldArray' => $fieldArray];
            SignalSlotUtility::emitSignal(
                self::class,
                SignalSlotUtility::SIGNAL_URL_ADDED_TO_QUEUE,
                $signalPayload
            );
        } else {
            $signalPayload = ['rows' => $rows, 'fieldArray' => $fieldArray];
            SignalSlotUtility::emitSignal(
                self::class,
                SignalSlotUtility::SIGNAL_DUPLICATE_URL_IN_QUEUE,
                $signalPayload
            );
        }

        return $urlAdded;
    }
1110
1111
    /**
     * Provides the current system time as a Unix timestamp.
     *
     * @return int Result of time()
     */
    public function getCurrentTime()
    {
        $currentTimestamp = time();

        return $currentTimestamp;
    }
1120
1121
    /************************************
1122
     *
1123
     * URL reading
1124
     *
1125
     ************************************/
1126
1127
    /**
     * Reads the URL for a single queue entry.
     *
     * The entry is locked by writing exec_time, executed through the queue executor and finally updated with the
     * JSON encoded result. Pre- and post-process signals are emitted around the execution.
     *
     * @param integer $queueId
     * @param boolean $force If set, will process even if exec_time has been set!
     * @return integer 0, or self::CLI_STATUS_POLLABLE_PROCESSED if a registered pollable instruction reported success;
     *                 0 is also returned when no matching queue entry is found
     */
    public function readUrl($queueId, $force = false)
    {
        $queryBuilder = GeneralUtility::makeInstance(ConnectionPool::class)->getQueryBuilderForTable($this->tableName);
        $ret = 0;
        $this->logger->debug('crawler-readurl start ' . microtime(true));
        // Get entry:
        $queryBuilder
            ->select('*')
            ->from('tx_crawler_queue')
            ->where(
                $queryBuilder->expr()->eq('qid', $queryBuilder->createNamedParameter($queueId, \PDO::PARAM_INT))
            );
        if (! $force) {
            // Without force only entries that are assigned to a process and not yet executed are picked up.
            $queryBuilder
                ->andWhere('exec_time = 0')
                ->andWhere('process_scheduled > 0');
        }
        $queueRec = $queryBuilder->execute()->fetch();

        if (! is_array($queueRec)) {
            // Nothing to process: return the documented integer instead of an implicit NULL.
            return $ret;
        }

        SignalSlotUtility::emitSignal(
            self::class,
            SignalSlotUtility::SIGNAL_QUEUEITEM_PREPROCESS,
            [$queueId, &$queueRec]
        );

        // Set exec_time to lock record:
        $field_array = ['exec_time' => $this->getCurrentTime()];

        if (isset($this->processID)) {
            // If multiprocessing is used we need to store the id of the process which has handled this entry
            $field_array['process_id_completed'] = $this->processID;
        }

        GeneralUtility::makeInstance(ConnectionPool::class)->getConnectionForTable('tx_crawler_queue')
            ->update(
                'tx_crawler_queue',
                $field_array,
                ['qid' => (int) $queueId]
            );

        $result = $this->queueExecutor->executeQueueItem($queueRec, $this);
        if (($result['content'] ?? null) === null) {
            $resultData = 'An errors happened';
        } else {
            /** @var JsonCompatibilityConverter $jsonCompatibilityConverter */
            $jsonCompatibilityConverter = GeneralUtility::makeInstance(JsonCompatibilityConverter::class);
            $resultData = $jsonCompatibilityConverter->convert($result['content']);
        }

        // Atm there's no need to point to specific pollable extensions.
        // $resultData is only inspected when it is an array: it may also be the error string set above.
        $pollables = $GLOBALS['TYPO3_CONF_VARS']['EXTCONF']['crawler']['pollSuccess'] ?? null;
        $procInstructions = is_array($resultData) ? ($resultData['parameters']['procInstructions'] ?? null) : null;
        if (is_array($pollables) && is_array($procInstructions)) {
            foreach ($pollables as $pollable) {
                // Only check the success value if the instruction is running;
                // it is important to name the pollSuccess key same as the procInstructions key
                if (in_array($pollable, $procInstructions, true) && ! empty($resultData['success'][$pollable])) {
                    $ret |= self::CLI_STATUS_POLLABLE_PROCESSED;
                }
            }
        }

        // Set result in log which also denotes the end of the processing of this entry.
        $field_array = ['result_data' => json_encode($result)];

        SignalSlotUtility::emitSignal(
            self::class,
            SignalSlotUtility::SIGNAL_QUEUEITEM_POSTPROCESS,
            [$queueId, &$field_array]
        );

        GeneralUtility::makeInstance(ConnectionPool::class)->getConnectionForTable('tx_crawler_queue')
            ->update(
                'tx_crawler_queue',
                $field_array,
                ['qid' => (int) $queueId]
            );

        $this->logger->debug('crawler-readurl stop ' . microtime(true));
        return $ret;
    }
1224
1225
    /**
     * Reads the URL for a log entry that has not been inserted yet.
     *
     * The entry is inserted with exec_time set (which locks it), executed through the queue executor and then
     * updated with the JSON encoded result after the post-process signal has been emitted.
     *
     * @param array $field_array Queue field array,
     *
     * @return mixed The unmodified result of the queue executor. NOTE(review): static analysis reports
     *               array|bool here although this method was formerly documented as string - confirm.
     */
    public function readUrlFromArray($field_array)
    {
        // Set exec_time to lock record:
        $field_array['exec_time'] = $this->getCurrentTime();

        $queueTable = $this->tableName;
        $queueConnection = GeneralUtility::makeInstance(ConnectionPool::class)->getConnectionForTable($queueTable);
        $queueConnection->insert($queueTable, $field_array);

        // The executor receives the record including its freshly assigned queue id.
        $queueId = $queueConnection->lastInsertId($queueTable, 'qid');
        $field_array['qid'] = $queueId;

        $result = $this->queueExecutor->executeQueueItem($field_array, $this);

        // Set result in log which also denotes the end of the processing of this entry.
        $field_array = ['result_data' => json_encode($result)];

        SignalSlotUtility::emitSignal(
            self::class,
            SignalSlotUtility::SIGNAL_QUEUEITEM_POSTPROCESS,
            [$queueId, &$field_array]
        );

        $queueConnection->update($queueTable, $field_array, ['qid' => $queueId]);

        return $result;
    }
1262
1263
    /*****************************
1264
     *
1265
     * Compiling URLs to crawl - tools
1266
     *
1267
     *****************************/
1268
1269
    /**
     * Traverses the page tree below (and including) the given page and collects the output of
     * drawURLs_addRowsForPage() for every page; mount point pages additionally contribute their mounted tree.
     *
     * The arguments are stored on the controller before the traversal starts, and the duplicate tracking and
     * download URL lists are reset.
     *
     * @param integer $id Root page id to start from.
     * @param integer $depth Depth of tree, 0=only id-page, 1= on sublevel, 99 = infinite
     * @param integer $scheduledTime Unix Time when the URL is timed to be visited when put in queue
     * @param integer $reqMinute Number of requests per minute (creates the interleave between requests)
     * @param boolean $submitCrawlUrls If set, submits the URLs to queue in database (real crawling)
     * @param boolean $downloadCrawlUrls If set (and submitcrawlUrls is false) will fill $downloadUrls with entries)
     * @param array $incomingProcInstructions Array of processing instructions
     * @param array $configurationSelection Array of configuration keys
     * @return string Concatenated results of drawURLs_addRowsForPage() for all traversed pages
     */
    public function getPageTreeAndUrls(
        $id,
        $depth,
        $scheduledTime,
        $reqMinute,
        $submitCrawlUrls,
        $downloadCrawlUrls,
        array $incomingProcInstructions,
        array $configurationSelection
    ) {
        $this->scheduledTime = $scheduledTime;
        $this->reqMinute = $reqMinute;
        $this->submitCrawlUrls = $submitCrawlUrls;
        $this->downloadCrawlUrls = $downloadCrawlUrls;
        $this->incomingProcInstructions = $incomingProcInstructions;
        $this->incomingConfigurationSelection = $configurationSelection;

        $this->duplicateTrack = [];
        $this->downloadUrls = [];

        // Drawing tree:
        /* @var PageTreeView $tree */
        $tree = GeneralUtility::makeInstance(PageTreeView::class);
        $perms_clause = $this->getBackendUser()->getPagePermsClause(Permission::PAGE_SHOW);
        $tree->init('AND ' . $perms_clause);

        $pageInfo = BackendUtility::readPageAccess($id, $perms_clause);
        if (is_array($pageInfo)) {
            // Set root row:
            $tree->tree[] = [
                'row' => $pageInfo,
                'HTML' => $this->iconFactory->getIconForRecord('pages', $pageInfo, Icon::SIZE_SMALL),
            ];
        }

        // Get branch beneath:
        if ($depth) {
            $tree->getTree($id, $depth, '');
        }

        // Traverse page tree:
        $code = '';

        foreach ($tree->tree as $data) {
            $this->MP = false;

            // Recognize mount points. The doktype is cast because the row value may be a numeric string,
            // which would never be identical to the integer constant.
            if ((int) $data['row']['doktype'] === PageRepository::DOKTYPE_MOUNTPOINT) {
                $queryBuilder = GeneralUtility::makeInstance(ConnectionPool::class)->getQueryBuilderForTable('pages');
                $queryBuilder->getRestrictions()->removeAll()->add(GeneralUtility::makeInstance(DeletedRestriction::class));
                $mountpage = $queryBuilder
                    ->select('*')
                    ->from('pages')
                    ->where(
                        $queryBuilder->expr()->eq('uid', $queryBuilder->createNamedParameter($data['row']['uid'], \PDO::PARAM_INT))
                    )
                    ->execute()
                    ->fetchAll();
                $queryBuilder->resetRestrictions();

                // Fetch mounted pages; MP is "<mounted page uid>-<mount point page uid>"
                $this->MP = $mountpage[0]['mount_pid'] . '-' . $data['row']['uid'];

                $mountTree = GeneralUtility::makeInstance(PageTreeView::class);
                $mountTree->init('AND ' . $perms_clause);
                $mountTree->getTree($mountpage[0]['mount_pid'], $depth);

                foreach ($mountTree->tree as $mountData) {
                    $code .= $this->drawURLs_addRowsForPage(
                        $mountData['row'],
                        $mountData['HTML'] . BackendUtility::getRecordTitle('pages', $mountData['row'], true)
                    );
                }

                // Replace page when mount_pid_ol is enabled
                if ($mountpage[0]['mount_pid_ol']) {
                    $data['row']['uid'] = $mountpage[0]['mount_pid'];
                } else {
                    // If the mount_pid_ol is not set the MP must not be used for the mountpoint page
                    $this->MP = false;
                }
            }

            $code .= $this->drawURLs_addRowsForPage(
                $data['row'],
                $data['HTML'] . BackendUtility::getRecordTitle('pages', $data['row'], true)
            );
        }

        return $code;
    }
1371
1372
    /**
     * Expands an exclude string into the list of page uids it covers.
     *
     * The string is a comma-separated list of entries of the form "<pid>" or
     * "<pid>+<depth>". An entry containing "+" without a (non-zero) depth is
     * expanded with depth 99; an entry without "+" covers the page only.
     * For entries with a depth > 0 the sub pages are collected through a
     * PageTreeView initialised with the backend user's PAGE_SHOW permission clause.
     *
     * All uids are returned as integers so that callers can compare them
     * strictly against integer page uids.
     *
     * Results are memoised per exclude string (and trees per pid/depth) in
     * static variables for the lifetime of the request.
     *
     * @param string $excludeString Exclude string
     * @return int[] Unique page uids (keys are not re-indexed)
     */
    public function expandExcludeString($excludeString)
    {
        // internal static caches;
        static $expandedExcludeStringCache;
        static $treeCache;

        if (empty($expandedExcludeStringCache[$excludeString])) {
            $pidList = [];

            if (! empty($excludeString)) {
                /** @var PageTreeView $tree */
                $tree = GeneralUtility::makeInstance(PageTreeView::class);
                $tree->init('AND ' . $this->getBackendUser()->getPagePermsClause(Permission::PAGE_SHOW));

                // Third argument removes empty entries (e.g. caused by a trailing comma)
                $excludeParts = GeneralUtility::trimExplode(',', $excludeString, true);

                foreach ($excludeParts as $excludePart) {
                    $pidAndDepth = GeneralUtility::trimExplode('+', $excludePart);
                    $pid = (int) $pidAndDepth[0];
                    // The depth part is optional; do not rely on offset 1 being present
                    $depth = (int) ($pidAndDepth[1] ?? 0);

                    // default is "page only" = "depth=0"; a bare "+" means "all levels"
                    if ($depth === 0 && strpos($excludePart, '+') !== false) {
                        $depth = 99;
                    }

                    $pidList[] = $pid;

                    if ($depth > 0) {
                        if (empty($treeCache[$pid][$depth])) {
                            $tree->reset();
                            $tree->getTree($pid, $depth);
                            $treeCache[$pid][$depth] = $tree->tree;
                        }

                        foreach ($treeCache[$pid][$depth] as $data) {
                            $pidList[] = (int) $data['row']['uid'];
                        }
                    }
                }
            }

            $expandedExcludeStringCache[$excludeString] = array_unique($pidList);
        }

        return $expandedExcludeStringCache[$excludeString];
    }
1423
1424
    /**
     * Create the rows for display of the page tree
     * For each page a number of rows are shown displaying GET variable configuration
     *
     * One table row is produced per configuration returned by getUrlsForPageRow()
     * (optionally filtered by $this->incomingConfigurationSelection). If no
     * configuration remains, a single "No entries" row is returned instead.
     *
     * @param array $pageRow Page record; its "uid" is checked against the configuration's exclude list
     * @param string $pageTitle HTML for the title cell; it is inserted as-is, without further escaping
     * @return string HTML table rows
     */
    public function drawURLs_addRowsForPage(array $pageRow, string $pageTitle): string
    {
        $skipMessage = '';

        // Get list of configurations
        $configurations = $this->getUrlsForPageRow($pageRow, $skipMessage);

        if (! empty($this->incomingConfigurationSelection)) {
            // remove configuration that does not match the current selection
            foreach ($configurations as $confKey => $confArray) {
                if (! in_array($confKey, $this->incomingConfigurationSelection, true)) {
                    unset($configurations[$confKey]);
                }
            }
        }

        // Traverse parameter combinations:
        $c = 0;
        $content = '';
        if (! empty($configurations)) {
            foreach ($configurations as $confKey => $confArray) {
                // Numeric configuration names become integer array keys; htmlspecialchars()
                // requires a string under strict_types.
                $confKeyHtml = htmlspecialchars((string) $confKey);
                $subCfg = $confArray['subCfg'] ?? [];

                // Title column:
                if (! $c) {
                    $titleClm = '<td rowspan="' . count($configurations) . '">' . $pageTitle . '</td>';
                } else {
                    $titleClm = '';
                }

                // Normalise both sides to int so the strict comparison does not depend on
                // whether uids arrive as strings or integers.
                $excludedPageIds = array_map('intval', $this->expandExcludeString($subCfg['exclude'] ?? ''));

                if (! in_array((int) $pageRow['uid'], $excludedPageIds, true)) {

                    // URL list:
                    $urlList = $this->urlListFromUrlArray(
                        $confArray,
                        $pageRow,
                        $this->scheduledTime,
                        $this->reqMinute,
                        $this->submitCrawlUrls,
                        $this->downloadCrawlUrls,
                        $this->duplicateTrack,
                        $this->downloadUrls,
                        $this->incomingProcInstructions // if empty the urls won't be filtered by processing instructions
                    );

                    // Expanded parameters:
                    $paramExpanded = '';
                    $calcAccu = [];
                    $calcRes = 1;
                    foreach ($confArray['paramExpanded'] as $gVar => $gVal) {
                        $paramExpanded .= '
                            <tr>
                                <td>' . htmlspecialchars('&' . $gVar . '=') . '<br/>' .
                            '(' . count($gVal) . ')' .
                            '</td>
                                <td nowrap="nowrap">' . nl2br(htmlspecialchars(implode(chr(10), $gVal))) . '</td>
                            </tr>
                        ';
                        $calcRes *= count($gVal);
                        $calcAccu[] = count($gVal);
                    }
                    $paramExpanded = '<table>' . $paramExpanded . '</table>';
                    $paramExpanded .= 'Comb: ' . implode('*', $calcAccu) . '=' . $calcRes;

                    // Options (both keys are optional; values are escaped before output)
                    $optionValues = '';
                    if (! empty($subCfg['userGroups'])) {
                        $optionValues .= 'User Groups: ' . htmlspecialchars((string) $subCfg['userGroups']) . '<br/>';
                    }
                    if (! empty($subCfg['procInstrFilter'])) {
                        $optionValues .= 'ProcInstr: ' . htmlspecialchars((string) $subCfg['procInstrFilter']) . '<br/>';
                    }

                    // Compile row:
                    $content .= '
                        <tr>
                            ' . $titleClm . '
                            <td>' . $confKeyHtml . '</td>
                            <td>' . nl2br(htmlspecialchars(rawurldecode(trim(str_replace('&', chr(10) . '&', GeneralUtility::implodeArrayForUrl('', $confArray['paramParsed'])))))) . '</td>
                            <td>' . $paramExpanded . '</td>
                            <td nowrap="nowrap">' . $urlList . '</td>
                            <td nowrap="nowrap">' . $optionValues . '</td>
                            <td nowrap="nowrap">' . DebugUtility::viewArray($subCfg['procInstrParams.'] ?? null) . '</td>
                        </tr>';
                } else {
                    $content .= '<tr>
                            ' . $titleClm . '
                            <td>' . $confKeyHtml . '</td>
                            <td colspan="5"><em>No entries</em> (Page is excluded in this configuration)</td>
                        </tr>';
                }

                $c++;
            }
        } else {
            $message = ! empty($skipMessage) ? ' (' . $skipMessage . ')' : '';

            // Compile row:
            $content .= '
                <tr>
                    <td>' . $pageTitle . '</td>
                    <td colspan="6"><em>No entries</em>' . $message . '</td>
                </tr>';
        }

        return $content;
    }
1534
1535
    /*****************************
1536
     *
1537
     * CLI functions
1538
     *
1539
     *****************************/
1540
1541
    /**
     * Running the functionality of the CLI (crawling URLs from queue)
     *
     * Triggers the CLI hooks, cleans up the queue, fetches up to $countInARun
     * entries, assigns them to the current process id and reads them one by one.
     *
     * @param int $countInARun Maximum number of queue entries fetched for this run
     * @param int $sleepTime Pause after each processed entry, passed to usleep()
     * @param int $sleepAfterFinish Pause after the batch, passed to sleep()
     * @return int Bitmask built from the readUrl() results and the self::CLI_STATUS_* flags
     */
    public function CLI_run(int $countInARun, int $sleepTime, int $sleepAfterFinish): int
    {
        $status = 0;
        $processedRows = 0;

        // First, run hooks:
        GeneralUtility::makeInstance(HookUtility::class)->triggerCliHooks();

        // Clean up the queue
        $this->queueRepository->cleanupQueue();

        // Select entries:
        $queueRows = $this->queueRepository->fetchRecordsToBeCrawled($countInARun);

        if (empty($queueRows)) {
            $this->CLI_debug('Nothing within queue which needs to be processed (' . $this->CLI_buildProcessId() . ')');

            return $status;
        }

        $queueIds = [];
        foreach ($queueRows as $queueRow) {
            $queueIds[] = $queueRow['qid'];
        }

        $processId = $this->CLI_buildProcessId();

        // Remember how many entries were assigned, to determine later how many have been processed
        $assignedRows = $this->queueRepository->updateProcessIdAndSchedulerForQueueIds($queueIds, $processId);
        $this->processRepository->updateProcessAssignItemsCount($assignedRows, $processId);

        // Fewer rows assigned than fetched: give up on the whole batch
        if ($assignedRows !== count($queueIds)) {
            $this->CLI_debug('Nothing processed due to multi-process collision (' . $this->CLI_buildProcessId() . ')');

            return $status | self::CLI_STATUS_ABORTED;
        }

        foreach ($queueRows as $queueRow) {
            $status |= $this->readUrl($queueRow['qid']);
            $processedRows++;

            // Just to relax the system
            usleep($sleepTime);

            // If the CLI has been disabled in the meantime, return right away
            // and do NOT mark the process as ended.
            if ($this->getDisabled()) {
                return $status | self::CLI_STATUS_ABORTED;
            }

            // possible timeout
            if (! $this->processRepository->isProcessActive($this->CLI_buildProcessId())) {
                $this->CLI_debug('conflict / timeout (' . $this->CLI_buildProcessId() . ')');
                $status |= self::CLI_STATUS_ABORTED;
                break;
            }
        }

        sleep($sleepAfterFinish);

        $this->CLI_debug('Rows: ' . $processedRows . ' (' . $this->CLI_buildProcessId() . ')');

        if ($processedRows > 0) {
            $status |= self::CLI_STATUS_PROCESSED;
        }

        return $status;
    }
1612
1613
    /**
     * Triggers the CLI hooks through the HookUtility.
     *
     * @deprecated
     */
    public function CLI_runHooks(): void
    {
        GeneralUtility::makeInstance(HookUtility::class)->triggerCliHooks();
    }
1622
1623
    /**
     * Try to acquire a new process with the given id
     * also performs some auto-cleanup for orphan processes
     *
     * Active, non-deleted rows of tx_crawler_process whose ttl lies before the
     * current time are treated as orphans and released; all others count against
     * the "processLimit" extension setting.
     *
     * @param string $id identification string for the process
     * @return boolean true if a process row was inserted, false if the limit is reached
     *                 or getmypid() did not yield a process id of at least 1
     * @todo preemption might not be the most elegant way to clean up
     */
    public function CLI_checkAndAcquireNewProcess($id)
    {
        $queryBuilder = GeneralUtility::makeInstance(ConnectionPool::class)->getQueryBuilderForTable($this->tableName);

        $systemProcessId = getmypid();
        if ($systemProcessId < 1) {
            return false;
        }

        $activeProcesses = $queryBuilder
            ->select('process_id', 'ttl')
            ->from('tx_crawler_process')
            ->where(
                'active = 1 AND deleted = 0'
            )
            ->execute();

        $now = $this->getCurrentTime();

        $runningCount = 0;
        $expiredProcessIds = [];
        while ($process = $activeProcesses->fetch()) {
            if ($process['ttl'] < $now) {
                $expiredProcessIds[] = $process['process_id'];
                continue;
            }
            $runningCount++;
        }

        $processLimit = (int) $this->extensionSettings['processLimit'];

        // if there are less than allowed active processes then add a new one
        $acquired = $runningCount < $processLimit;
        if ($acquired) {
            $this->CLI_debug('add process ' . $this->CLI_buildProcessId() . ' (' . ($runningCount + 1) . '/' . $processLimit . ')');

            GeneralUtility::makeInstance(ConnectionPool::class)
                ->getConnectionForTable('tx_crawler_process')
                ->insert(
                    'tx_crawler_process',
                    [
                        'process_id' => $id,
                        'active' => 1,
                        'ttl' => $now + (int) $this->extensionSettings['processMaxRunTime'],
                        'system_process_id' => $systemProcessId,
                    ]
                );
        } else {
            $this->CLI_debug('Processlimit reached (' . $runningCount . '/' . $processLimit . ')');
        }

        $this->processRepository->deleteProcessesMarkedAsDeleted();
        $this->CLI_releaseProcesses($expiredProcessIds);

        return $acquired;
    }
1684
1685
    /**
     * Release a process and the required resources
     *
     * Besides releasing the given ids, two clean-up statements run first: queue
     * rows pointing to an inactive process get process_scheduled/process_id reset,
     * and inactive processes that still own queue rows with exec_time = 0 get
     * their system_process_id reset.
     *
     * @param mixed $releaseIds string with a single process-id or array with multiple process-ids
     * @return boolean false if there was nothing to release, true otherwise
     */
    public function CLI_releaseProcesses($releaseIds)
    {
        $queryBuilder = GeneralUtility::makeInstance(ConnectionPool::class)->getQueryBuilderForTable($this->tableName);

        $processIds = is_array($releaseIds) ? $releaseIds : [$releaseIds];

        // nothing to release
        if (empty($processIds)) {
            return false;
        }

        // some kind of 2nd chance algo - this way you need at least 2 processes to have a real cleanup
        // this ensures that a single process can't mess up the entire process table

        // detach queue entries from processes which are not active anymore
        $queryBuilder
            ->update($this->tableName, 'q')
            ->where(
                'q.process_id IN(SELECT p.process_id FROM tx_crawler_process as p WHERE p.active = 0)'
            )
            ->set('q.process_scheduled', 0)
            ->set('q.process_id', '')
            ->execute();

        // FIXME: Not entirely sure that this is equivalent to the previous version
        // The same builder is reused below, so the SET part of the first statement is dropped here.
        $queryBuilder->resetQueryPart('set');

        $queryBuilder
            ->update('tx_crawler_process')
            ->where(
                $queryBuilder->expr()->eq('active', 0),
                'process_id IN(SELECT q.process_id FROM tx_crawler_queue as q WHERE q.exec_time = 0)'
            )
            ->set('system_process_id', 0)
            ->execute();

        $this->processRepository->markRequestedProcessesAsNotActive($processIds);
        $this->queueRepository->unsetProcessScheduledAndProcessIdForQueueEntries($processIds);

        return true;
    }
1734
1735
    /**
     * Create a unique Id for the current process
     *
     * The id is derived once from microtime() and then kept in $this->processID,
     * so repeated calls on the same instance return the same value.
     *
     * @return string  the ID
     */
    public function CLI_buildProcessId()
    {
        if ($this->processID) {
            return $this->processID;
        }

        $this->processID = GeneralUtility::shortMD5(microtime(true));

        return $this->processID;
    }
1747
1748
    /**
     * Prints a message to the stdout (only if debug-mode is enabled)
     *
     * Debug mode is the "processDebug" extension setting, evaluated as integer.
     *
     * @param string $msg the message
     */
    public function CLI_debug($msg): void
    {
        $debugEnabled = (int) $this->extensionSettings['processDebug'];
        if (! $debugEnabled) {
            return;
        }

        echo $msg . "\n";
        flush();
    }
1760
1761
    /**
     * Cleans up entries that stayed for too long in the queue. These are:
     * - processed entries older than the "cleanUpProcessedAge" extension setting (in days)
     * - scheduled entries older than the "cleanUpScheduledAge" extension setting (in days)
     *
     * @deprecated
     */
    public function cleanUpOldQueueEntries(): void
    {
        // 24*60*60 Seconds in 24 hours
        $secondsPerDay = 86400;
        $now = time();

        $processedBefore = $now - $this->extensionSettings['cleanUpProcessedAge'] * $secondsPerDay;
        $scheduledBefore = $now - $this->extensionSettings['cleanUpScheduledAge'] * $secondsPerDay;

        /** @scrutinizer ignore-deprecated */
        $this->flushQueue(
            '(exec_time<>0 AND exec_time<' . $processedBefore . ') OR scheduled<=' . $scheduledBefore
        );
    }
1777
1778
    /**
     * Removes queue entries
     *
     * Before deleting, a SIGNAL_QUEUE_ENTRY_FLUSH signal is emitted once per
     * distinct set_id among the matching rows, carrying the affected rows
     * (qid, set_id) as "subSet" payload.
     *
     * Every statement uses its own QueryBuilder so that no query parts
     * (statement type, FROM, WHERE) leak from one statement into the next.
     *
     * @param string $where SQL related filter for the entries which should be removed;
     *                      an empty value removes all entries
     *
     * @deprecated
     */
    protected function flushQueue($where = ''): void
    {
        $realWhere = strlen((string) $where) > 0 ? $where : '1=1';

        $groups = $this->getQueryBuilder($this->tableName)
            ->selectLiteral('DISTINCT set_id')
            ->from($this->tableName)
            ->where($realWhere)
            ->execute()
            ->fetchAll();

        foreach ($groups as $group) {
            $subSetQueryBuilder = $this->getQueryBuilder($this->tableName);
            $subSet = $subSetQueryBuilder
                ->select('qid', 'set_id')
                ->from($this->tableName)
                ->where(
                    $realWhere,
                    // bind the value instead of concatenating it into the SQL
                    $subSetQueryBuilder->expr()->eq(
                        'set_id',
                        $subSetQueryBuilder->createNamedParameter($group['set_id'], \PDO::PARAM_INT)
                    )
                )
                ->execute()
                ->fetchAll();

            $payLoad = ['subSet' => $subSet];
            SignalSlotUtility::emitSignal(
                self::class,
                SignalSlotUtility::SIGNAL_QUEUE_ENTRY_FLUSH,
                $payLoad
            );
        }

        $this->getQueryBuilder($this->tableName)
            ->delete($this->tableName)
            ->where($realWhere)
            ->execute();
    }
1823
1824
    /**
     * This method determines duplicates for a queue entry with the same parameters and this timestamp.
     * If the timestamp is in the past, it will check if there is any unprocessed queue entry in the past.
     * If the timestamp is in the future it will check, if the queued entry has exactly the same timestamp
     *
     * Only rows with falsy exec_time and process_id and with matching page_id and
     * parameters_hash are considered.
     *
     * @param int $tstamp
     * @param array $fieldArray Reads the keys "page_id" and "parameters_hash"
     *
     * @return array List of qid values of the duplicates found
     */
    protected function getDuplicateRowsIfExist($tstamp, $fieldArray)
    {
        $now = $this->getCurrentTime();

        $queryBuilder = GeneralUtility::makeInstance(ConnectionPool::class)->getQueryBuilderForTable($this->tableName);
        $queryBuilder
            ->select('qid')
            ->from('tx_crawler_queue');

        if ($tstamp <= $now) {
            // this entry is scheduled with "now"
            if ($this->extensionSettings['enableTimeslot']) {
                $slotStart = $now - 100;
                $slotEnd = $now + 100;
                $queryBuilder
                    ->where('scheduled BETWEEN ' . $slotStart . ' AND ' . $slotEnd)
                    ->orWhere($queryBuilder->expr()->lte('scheduled', $now));
            } else {
                $queryBuilder->where($queryBuilder->expr()->lte('scheduled', $now));
            }
        } elseif ($tstamp > $now) {
            // entry with a timestamp in the future need to have the same schedule time
            $queryBuilder->where($queryBuilder->expr()->eq('scheduled', $tstamp));
        }

        $samePage = $queryBuilder->expr()->eq(
            'page_id',
            $queryBuilder->createNamedParameter($fieldArray['page_id'], \PDO::PARAM_INT)
        );
        $sameParameters = $queryBuilder->expr()->eq(
            'parameters_hash',
            $queryBuilder->createNamedParameter($fieldArray['parameters_hash'], \PDO::PARAM_STR)
        );

        $queryBuilder
            ->andWhere('NOT exec_time')
            ->andWhere('NOT process_id')
            ->andWhere($samePage)
            ->andWhere($sameParameters);

        $statement = $queryBuilder->execute();

        $duplicateIds = [];
        while ($duplicate = $statement->fetch()) {
            $duplicateIds[] = $duplicate['qid'];
        }

        return $duplicateIds;
    }
1884
1885
    /**
     * Returns a md5 hash generated from a serialized configuration array.
     *
     * The keys "paramExpanded" and "URLs" are removed before hashing, so they
     * do not influence the result.
     *
     * @return string
     */
    protected function getConfigurationHash(array $configuration)
    {
        unset($configuration['paramExpanded'], $configuration['URLs']);

        $serialized = serialize($configuration);

        return md5($serialized);
    }
1896
1897
    /**
     * Build a URL from a Page and the Query String. If the page has a Site configuration, it can be built by using
     * the Site instance.
     *
     * With a Site, the "id" query parameter is dropped, "L" is renamed to "_language"
     * and the URI is generated by the site router; host, scheme, port and user info
     * are then taken from $alternativeBaseUrl if one is given. Without a Site, the URL
     * is assembled as "<base>/index.php<queryString>&cHash=<hash>".
     *
     * @param int $pageId Page uid the URL is built for
     * @param string $queryString Query string; a leading "?" or "&" is stripped in the Site case
     * @param string|null $alternativeBaseUrl Optional base URL overriding the generated one
     * @param int $httpsOrHttp see tx_crawler_configuration.force_ssl (-1 forces http, 1 forces https, other values keep the scheme)
     * @throws \TYPO3\CMS\Core\Exception\SiteNotFoundException
     * @throws \TYPO3\CMS\Core\Routing\InvalidRouteArgumentsException
     */
    protected function getUrlFromPageAndQueryParameters(int $pageId, string $queryString, ?string $alternativeBaseUrl, int $httpsOrHttp): UriInterface
    {
        $site = GeneralUtility::makeInstance(SiteMatcher::class)->matchByPageId($pageId);
        if ($site instanceof Site) {
            $queryParts = [];
            parse_str(ltrim($queryString, '?&'), $queryParts);
            unset($queryParts['id']);
            // workaround as long as we don't have native language support in crawler configurations
            if (isset($queryParts['L'])) {
                $queryParts['_language'] = $queryParts['L'];
                unset($queryParts['L']);
                // The result is not needed here. The lookup is kept so behaviour stays unchanged;
                // NOTE(review): presumably it rejects language ids unknown to the site - confirm.
                $site->getLanguageById((int) $queryParts['_language']);
            }
            $url = $site->getRouter()->generateUri($pageId, $queryParts);
            if (! empty($alternativeBaseUrl)) {
                $alternativeBaseUrl = new Uri($alternativeBaseUrl);
                $url = $url->withHost($alternativeBaseUrl->getHost());
                $url = $url->withScheme($alternativeBaseUrl->getScheme());
                $url = $url->withPort($alternativeBaseUrl->getPort());
                if ($userInfo = $alternativeBaseUrl->getUserInfo()) {
                    $url = $url->withUserInfo($userInfo);
                }
            }
        } else {
            // Technically this is not possible with site handling, but kept for backwards-compatibility reasons
            // Once EXT:crawler is v10-only compatible, this should be removed completely
            $baseUrl = ($alternativeBaseUrl ?: GeneralUtility::getIndpEnv('TYPO3_SITE_URL'));
            $cacheHashCalculator = GeneralUtility::makeInstance(CacheHashCalculator::class);
            $queryString .= '&cHash=' . $cacheHashCalculator->generateForParameters($queryString);
            $url = rtrim($baseUrl, '/') . '/index.php' . $queryString;
            $url = new Uri($url);
        }

        if ($httpsOrHttp === -1) {
            $url = $url->withScheme('http');
        } elseif ($httpsOrHttp === 1) {
            $url = $url->withScheme('https');
        }

        return $url;
    }
1949
1950 1
    /**
     * Makes sure that $reg[1] is not larger than $reg[2] by exchanging the two
     * values if necessary. All other entries of $reg are left untouched.
     *
     * @param array $reg Array holding the two values to order at the keys 1 and 2
     * @return array The array with the values at keys 1 and 2 in ascending order
     */
    protected function swapIfFirstIsLargerThanSecond(array $reg): array
    {
        if (! ($reg[1] > $reg[2])) {
            return $reg;
        }

        // Swap, as first is larger than last
        [$reg[1], $reg[2]] = [$reg[2], $reg[1]];

        return $reg;
    }
1961
1962
    /**
     * Returns the backend user, taken from $GLOBALS['BE_USER'] on first use and
     * kept in $this->backendUser afterwards.
     *
     * Backend authentication is (re-)initialized on every call before the user is looked up.
     *
     * @return BackendUserAuthentication
     */
    private function getBackendUser()
    {
        // Make sure the _cli_ user is loaded
        Bootstrap::initializeBackendAuthentication();

        if ($this->backendUser !== null) {
            return $this->backendUser;
        }

        $this->backendUser = $GLOBALS['BE_USER'];

        return $this->backendUser;
    }
1974
1975
    /**
     * Get querybuilder for given table
     *
     * A builder is requested from the ConnectionPool on every call.
     *
     * @param string $table Name of the table the builder is requested for
     * @return \TYPO3\CMS\Core\Database\Query\QueryBuilder
     */
    private function getQueryBuilder(string $table)
    {
        $connectionPool = GeneralUtility::makeInstance(ConnectionPool::class);

        return $connectionPool->getQueryBuilderForTable($table);
    }
1984
}
1985