Passed
Push — cleanup/crawlercontroller ( 49e992 )
by Tomas Norre
08:00
created

CrawlerController::getLogEntriesForPageId()   A

Complexity

Conditions 2
Paths 2

Size

Total Lines 25
Code Lines 16

Duplication

Lines 0
Ratio 0 %

Code Coverage

Tests 15
CRAP Score 2

Importance

Changes 2
Bugs 0 Features 0
Metric Value
cc 2
eloc 16
c 2
b 0
f 0
nc 2
nop 5
dl 0
loc 25
ccs 15
cts 15
cp 1
crap 2
rs 9.7333
1
<?php
2
3
declare(strict_types=1);
4
5
namespace AOE\Crawler\Controller;
6
7
/***************************************************************
8
 *  Copyright notice
9
 *
10
 *  (c) 2020 AOE GmbH <[email protected]>
11
 *
12
 *  All rights reserved
13
 *
14
 *  This script is part of the TYPO3 project. The TYPO3 project is
15
 *  free software; you can redistribute it and/or modify
16
 *  it under the terms of the GNU General Public License as published by
17
 *  the Free Software Foundation; either version 3 of the License, or
18
 *  (at your option) any later version.
19
 *
20
 *  The GNU General Public License can be found at
21
 *  http://www.gnu.org/copyleft/gpl.html.
22
 *
23
 *  This script is distributed in the hope that it will be useful,
24
 *  but WITHOUT ANY WARRANTY; without even the implied warranty of
25
 *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
26
 *  GNU General Public License for more details.
27
 *
28
 *  This copyright notice MUST APPEAR in all copies of the script!
29
 ***************************************************************/
30
31
use AOE\Crawler\Configuration\ExtensionConfigurationProvider;
32
use AOE\Crawler\Domain\Repository\ConfigurationRepository;
33
use AOE\Crawler\Domain\Repository\ProcessRepository;
34
use AOE\Crawler\Domain\Repository\QueueRepository;
35
use AOE\Crawler\Event\EventDispatcher;
36
use AOE\Crawler\QueueExecutor;
37
use AOE\Crawler\Utility\SignalSlotUtility;
38
use Psr\Http\Message\UriInterface;
39
use Psr\Log\LoggerAwareInterface;
40
use Psr\Log\LoggerAwareTrait;
41
use TYPO3\CMS\Backend\Tree\View\PageTreeView;
42
use TYPO3\CMS\Backend\Utility\BackendUtility;
43
use TYPO3\CMS\Core\Authentication\BackendUserAuthentication;
44
use TYPO3\CMS\Core\Compatibility\PublicMethodDeprecationTrait;
45
use TYPO3\CMS\Core\Core\Bootstrap;
46
use TYPO3\CMS\Core\Core\Environment;
47
use TYPO3\CMS\Core\Database\Connection;
48
use TYPO3\CMS\Core\Database\ConnectionPool;
49
use TYPO3\CMS\Core\Database\Query\Restriction\DeletedRestriction;
50
use TYPO3\CMS\Core\Http\Uri;
51
use TYPO3\CMS\Core\Imaging\Icon;
52
use TYPO3\CMS\Core\Imaging\IconFactory;
53
use TYPO3\CMS\Core\Routing\SiteMatcher;
54
use TYPO3\CMS\Core\Site\Entity\Site;
55
use TYPO3\CMS\Core\Type\Bitmask\Permission;
56
use TYPO3\CMS\Core\TypoScript\Parser\TypoScriptParser;
57
use TYPO3\CMS\Core\Utility\DebugUtility;
58
use TYPO3\CMS\Core\Utility\GeneralUtility;
59
use TYPO3\CMS\Core\Utility\MathUtility;
60
use TYPO3\CMS\Extbase\Object\ObjectManager;
61
use TYPO3\CMS\Frontend\Page\CacheHashCalculator;
62
use TYPO3\CMS\Frontend\Page\PageRepository;
63
64
/**
65
 * Class CrawlerController
66
 *
67
 * @package AOE\Crawler\Controller
68
 */
69
class CrawlerController implements LoggerAwareInterface
70
{
71
    use LoggerAwareTrait;
72
    use PublicMethodDeprecationTrait;
73
74
    /**
75
     * @var string[]
76
     */
77
    private $deprecatedPublicMethods = [
0 ignored issues
show
introduced by
The private property $deprecatedPublicMethods is not used, and could be removed.
Loading history...
78
        'getLogEntriesForSetId' => 'Using crawlerController::getLogEntriesForSetId() is deprecated since 9.0.1 and will be removed in v10.x',
79
        'flushQueue' => 'Using CrawlerController::flushQueue() is deprecated since 9.0.1 and will be removed in v10.x, please use QueueRepository->flushQueue() instead.',
80
        'cleanUpOldQueueEntries' => 'Using CrawlerController::cleanUpOldQueueEntries() is deprecated since 9.0.1 and will be removed in v10.x, please use QueueRepository->cleanUpOldQueueEntries() instead.',
81
    ];
82
83
    public const CLI_STATUS_NOTHING_PROCCESSED = 0;
84
85
    public const CLI_STATUS_REMAIN = 1; //queue not empty
86
87
    public const CLI_STATUS_PROCESSED = 2; //(some) queue items where processed
88
89
    public const CLI_STATUS_ABORTED = 4; //instance didn't finish
90
91
    public const CLI_STATUS_POLLABLE_PROCESSED = 8;
92
93
    /**
94
     * @var integer
95
     */
96
    public $setID = 0;
97
98
    /**
99
     * @var string
100
     */
101
    public $processID = '';
102
103
    /**
104
     * @var array
105
     */
106
    public $duplicateTrack = [];
107
108
    /**
109
     * @var array
110
     */
111
    public $downloadUrls = [];
112
113
    /**
114
     * @var array
115
     */
116
    public $incomingProcInstructions = [];
117
118
    /**
119
     * @var array
120
     */
121
    public $incomingConfigurationSelection = [];
122
123
    /**
124
     * @var bool
125
     */
126
    public $registerQueueEntriesInternallyOnly = false;
127
128
    /**
129
     * @var array
130
     */
131
    public $queueEntries = [];
132
133
    /**
134
     * @var array
135
     */
136
    public $urlList = [];
137
138
    /**
139
     * @var array
140
     */
141
    public $extensionSettings = [];
142
143
    /**
144
     * Mount Point
145
     *
146
     * @var bool
147
     * Todo: Check what this is used for and adjust the type hint or code, as bool doesn't match the current code.
148
     */
149
    public $MP = false;
150
151
    /**
152
     * @var string
153
     */
154
    protected $processFilename;
155
156
    /**
157
     * Holds the internal access mode can be 'gui','cli' or 'cli_im'
158
     *
159
     * @var string
160
     */
161
    protected $accessMode;
162
163
    /**
164
     * @var QueueRepository
165
     */
166
    protected $queueRepository;
167
168
    /**
169
     * @var ProcessRepository
170
     */
171
    protected $processRepository;
172
173
    /**
174
     * @var ConfigurationRepository
175
     */
176
    protected $configurationRepository;
177
178
    /**
179
     * @var string
180
     */
181
    protected $tableName = 'tx_crawler_queue';
182
183
    /**
184
     * @var QueueExecutor
185
     */
186
    protected $queueExecutor;
187
188
    /**
189
     * @var int
190
     */
191
    protected $maximumUrlsToCompile = 10000;
192
193
    /**
194
     * @var IconFactory
195
     */
196
    protected $iconFactory;
197
198
    /**
199
     * @var BackendUserAuthentication|null
200
     */
201
    private $backendUser;
202
203
    /**
204
     * @var integer
205
     */
206
    private $scheduledTime = 0;
207
208
    /**
209
     * @var integer
210
     */
211
    private $reqMinute = 0;
212
213
    /**
214
     * @var bool
215
     */
216
    private $submitCrawlUrls = false;
217
218
    /**
219
     * @var bool
220
     */
221
    private $downloadCrawlUrls = false;
222
223
    /************************************
224
     *
225
     * Getting URLs based on Page TSconfig
226
     *
227
     ************************************/
228
229 41
    /**
     * Wires up the repositories and helpers and loads the extension settings.
     *
     * Settings that are missing or evaluate to 0 fall back to defaults:
     * - `countInARun` becomes 100
     * - `processLimit` is forced into 1..99 with a default of 1
     * - `maxCompileUrls` is forced into 1..1000000000 with a default of 10000
     */
    public function __construct()
    {
        $objectManager = GeneralUtility::makeInstance(ObjectManager::class);
        $this->queueRepository = $objectManager->get(QueueRepository::class);
        $this->processRepository = $objectManager->get(ProcessRepository::class);
        $this->configurationRepository = $objectManager->get(ConfigurationRepository::class);
        $this->queueExecutor = $objectManager->get(QueueExecutor::class);
        $this->iconFactory = GeneralUtility::makeInstance(IconFactory::class);

        $this->processFilename = Environment::getVarPath() . '/lock/tx_crawler.proc';

        /** @var ExtensionConfigurationProvider $configurationProvider */
        $configurationProvider = GeneralUtility::makeInstance(ExtensionConfigurationProvider::class);
        $settings = $configurationProvider->getExtensionConfiguration();
        $this->extensionSettings = is_array($settings) ? $settings : [];

        // Absent keys are read as 0 so that the defaults below kick in
        // without triggering an "undefined index" notice.
        if (MathUtility::convertToPositiveInteger($this->extensionSettings['countInARun'] ?? 0) === 0) {
            $this->extensionSettings['countInARun'] = 100;
        }

        $this->extensionSettings['processLimit'] = MathUtility::forceIntegerInRange(
            $this->extensionSettings['processLimit'] ?? 0,
            1,
            99,
            1
        );
        $this->maximumUrlsToCompile = MathUtility::forceIntegerInRange(
            $this->extensionSettings['maxCompileUrls'] ?? 0,
            1,
            1000000000,
            10000
        );
    }
253
254
    /**
     * Returns the current access mode.
     *
     * The property is documented as holding 'gui', 'cli' or 'cli_im'. It has no
     * default value, so the result is null until setAccessMode() has been called.
     *
     * @return string
     */
    public function getAccessMode()
    {
        $currentMode = $this->accessMode;

        return $currentMode;
    }
263
264
    /**
     * Stores the access mode; no validation is applied to the value.
     *
     * @param string $accessMode Documented values are 'gui', 'cli' and 'cli_im'
     */
    public function setAccessMode($accessMode): void
    {
        $this->accessMode = $accessMode;
    }
271
272
    /**
     * Switches the "disabled" marker on or off.
     *
     * The marker is the file at $this->processFilename: disabling writes it as an
     * empty file, enabling deletes it if it exists. getDisabled() reads the same marker.
     *
     * @param bool $disabled true (default) to create the marker file, false to remove it
     */
    public function setDisabled($disabled = true): void
    {
        if ($disabled) {
            GeneralUtility::writeFile($this->processFilename, '');
            return;
        }

        // Enabling: only remove the marker when it is actually there.
        if (is_file($this->processFilename)) {
            unlink($this->processFilename);
        }
    }
287
288
    /**
     * Reports whether the "disabled" marker file exists.
     *
     * @return bool true when the file at $this->processFilename exists (i.e. disabled)
     */
    public function getDisabled()
    {
        $markerFileExists = is_file($this->processFilename);

        return $markerFileExists;
    }
297
298
    /**
     * Overrides the path of the marker file used by setDisabled() / getDisabled().
     *
     * @param string $filenameWithPath Full path including the file name
     */
    public function setProcessFilename($filenameWithPath): void
    {
        $this->processFilename = $filenameWithPath;
    }
305
306
    /**
     * Returns the path of the marker file used by setDisabled() / getDisabled().
     *
     * @return string
     */
    public function getProcessFilename()
    {
        $markerFile = $this->processFilename;

        return $markerFile;
    }
313
314
    /**
     * Replaces the extension settings wholesale
     * (unserialized pendant of $TYPO3_CONF_VARS['EXT']['extConf']['crawler']).
     *
     * The array is stored as given; the defaults applied in the constructor are not re-applied.
     *
     * @param array $extensionSettings
     */
    public function setExtensionSettings(array $extensionSettings): void
    {
        $this->extensionSettings = $extensionSettings;
    }
321
322
    /**
     * Decides whether a page must be excluded from crawling.
     *
     * The checks run in this order and the first one that matches wins:
     * 1. the page is hidden and the `crawlHiddenPages` setting is not enabled
     * 2. the doktype is 3 or 4, or is 199 or higher
     * 3. the doktype is listed in one of the `excludeDoktype` registrations
     * 4. a `pageVeto` hook returns anything other than false
     *
     * @param array $pageRow Page record; reads the `hidden` and `doktype` columns
     * @return false|string false if the page should be crawled, otherwise the skip message
     *                      (a veto hook may supply its own message string)
     */
    public function checkIfPageShouldBeSkipped(array $pageRow)
    {
        // Hidden pages are skipped unless crawling them was explicitly enabled.
        if (empty($this->extensionSettings['crawlHiddenPages']) && ! empty($pageRow['hidden'])) {
            return 'Because page is hidden';
        }

        if (GeneralUtility::inList('3,4', $pageRow['doktype']) || $pageRow['doktype'] >= 199) {
            return 'Because doktype is not allowed';
        }

        foreach ($GLOBALS['TYPO3_CONF_VARS']['EXTCONF']['crawler']['excludeDoktype'] ?? [] as $key => $doktypeList) {
            if (GeneralUtility::inList($doktypeList, $pageRow['doktype'])) {
                return 'Doktype was excluded by "' . $key . '"';
            }
        }

        // veto hook
        foreach ($GLOBALS['TYPO3_CONF_VARS']['EXTCONF']['crawler']['pageVeto'] ?? [] as $key => $func) {
            $params = [
                'pageRow' => $pageRow,
            ];
            // expects "false" if page is ok and "true" or a skipMessage if this page should _not_ be crawled
            $veto = GeneralUtility::callUserFunction($func, $params, $this);
            if ($veto === false) {
                continue;
            }

            // The first veto wins, later hooks are not executed.
            if (is_string($veto)) {
                return $veto;
            }

            // Hooks may be registered under numeric keys; htmlspecialchars() only
            // accepts strings under strict_types, hence the explicit cast.
            return 'Veto from hook "' . htmlspecialchars((string) $key) . '"';
        }

        return false;
    }
380
381
    /**
     * Wrapper method for getUrlsForPageId() that first checks whether the page is to be skipped.
     * It returns an array of configurations and no urls!
     *
     * @param array $pageRow Page record with at least dok-type and uid columns.
     * @param string $skipMessage Passed by reference: set to the skip reason when the page is
     *                            skipped, otherwise reset to an empty string
     * @return array Configurations for the page, or an empty array when the page is skipped
     * @see getUrlsForPageId()
     */
    public function getUrlsForPageRow(array $pageRow, &$skipMessage = '')
    {
        $message = $this->checkIfPageShouldBeSkipped($pageRow);

        if ($message !== false) {
            $skipMessage = $message;
            return [];
        }

        // getUrlsForPageId() is declared with an int parameter and this file runs under
        // strict_types, so a uid delivered as a numeric string must be cast first.
        $res = $this->getUrlsForPageId((int) $pageRow['uid']);
        $skipMessage = '';

        return $res;
    }
404
405
    /**
     * Creates a list of URLs from input array (and submits them to queue if asked for)
     * See Web > Info module script + "indexed_search"'s crawler hook-client using this!
     *
     * Every URL is identified by "url|userGroups|procInstrFilter"; a URL whose key is already
     * present in $duplicateTrack is only listed (muted), never submitted again.
     *
     * @param array $vv Information about URLs from pageRow to crawl; reads the `URLs` and `subCfg` keys.
     * @param array $pageRow Page row; reads the `uid` column
     * @param int $scheduledTime Unix time to schedule indexing to, typically time()
     * @param int $reqMinute Number of requests per minute (creates the interleave between requests);
     *                       values of 0 or below disable the interleave
     * @param bool $submitCrawlUrls If set, submits the URLs to queue
     * @param bool $downloadCrawlUrls If set (and submitcrawlUrls is false) will fill $downloadUrls with entries)
     * @param array $duplicateTrack Array which is passed by reference and contains the an id per url to secure we will not crawl duplicates
     * @param array $downloadUrls Array which will be filled with URLS for download if flag is set.
     * @param array $incomingProcInstructions Array of processing instructions
     * @return string List of URLs (meant for display in backend module)
     */
    public function urlListFromUrlArray(
        array $vv,
        array $pageRow,
        $scheduledTime,
        $reqMinute,
        $submitCrawlUrls,
        $downloadCrawlUrls,
        array &$duplicateTrack,
        array &$downloadUrls,
        array $incomingProcInstructions
    ) {
        if (! is_array($vv['URLs'] ?? null)) {
            return 'ERROR - no URL generated';
        }

        // Optional sub-configuration values; a missing key behaves like an empty string.
        $procInstrFilter = $vv['subCfg']['procInstrFilter'] ?? '';
        $userGroups = $vv['subCfg']['userGroups'] ?? '';

        // The filter does not depend on the individual URL, so it is evaluated once:
        // when it rejects the configuration, no URL is listed at all.
        if (! $this->drawURLs_PIfilter($procInstrFilter, $incomingProcInstructions)) {
            return '';
        }

        $urlLog = [];
        $pageId = (int) $pageRow['uid'];
        $configurationHash = $this->getConfigurationHash($vv);
        $skipInnerCheck = $this->queueRepository->noUnprocessedQueueEntriesForPageWithConfigurationHashExist($pageId, $configurationHash);

        // Seconds between two requests; guarded so that $reqMinute = 0 cannot divide by zero.
        $secondsPerRequest = $reqMinute > 0 ? 60 / $reqMinute : 0;

        foreach ($vv['URLs'] as $urlQuery) {
            $url = (string) $this->getUrlFromPageAndQueryParameters(
                $pageId,
                $urlQuery,
                $vv['subCfg']['baseUrl'] ?? null,
                $vv['subCfg']['force_ssl'] ?? 0
            );

            // Create key by which to determine unique-ness:
            $uKey = $url . '|' . $userGroups . '|' . $procInstrFilter;

            if (isset($duplicateTrack[$uKey])) {
                // The URL key is already registered: just display it, do not resubmit it.
                $urlLog[] = '<em><span class="text-muted">' . htmlspecialchars($url) . '</span></em>';
            } else {
                // Scheduled time, spread by the number of URLs tracked so far and rounded down to a full minute:
                $schTime = $scheduledTime + round(count($duplicateTrack) * $secondsPerRequest);
                $schTime = intval($schTime / 60) * 60;
                $formattedDate = BackendUtility::datetime($schTime);
                $this->urlList[] = '[' . $formattedDate . '] ' . $url;
                $urlList = '[' . $formattedDate . '] ' . htmlspecialchars($url);

                // Submit for crawling!
                if ($submitCrawlUrls) {
                    // NOTE(review): the queue entry receives $scheduledTime, not the interleaved
                    // $schTime shown in the list - confirm that this is intended.
                    $added = $this->addUrl(
                        $pageId,
                        $url,
                        $vv['subCfg'],
                        $scheduledTime,
                        $configurationHash,
                        $skipInnerCheck
                    );
                    if ($added === false) {
                        $urlList .= ' (URL already existed)';
                    }
                } elseif ($downloadCrawlUrls) {
                    $downloadUrls[$url] = $url;
                }
                $urlLog[] = $urlList;
            }
            $duplicateTrack[$uKey] = true;
        }

        return implode('<br>', $urlLog);
    }
487
488
    /**
     * Tells whether a configuration passes the processing-instruction filter.
     *
     * With no incoming instructions everything passes. Otherwise at least one of the
     * incoming instructions has to be contained in the comma-separated $piString.
     *
     * @param string $piString Comma-separated list of processing instructions to test against
     * @param array $incomingProcInstructions Processing instructions
     * @return boolean
     */
    public function drawURLs_PIfilter($piString, array $incomingProcInstructions)
    {
        foreach ($incomingProcInstructions as $instruction) {
            if (GeneralUtility::inList($piString, $instruction)) {
                return true;
            }
        }

        // Nothing matched: that is only acceptable when there was nothing to match.
        return $incomingProcInstructions === [];
    }
508
509 3
    /**
     * Loads the Page TSconfig for a page and lets registered hooks modify it.
     *
     * Without a mount point ($this->MP is falsy) the TSconfig of $id is used. Otherwise
     * $this->MP is split at "-" and the TSconfig of the page id found in the second
     * segment is used instead.
     *
     * Hooks registered in $GLOBALS['TYPO3_CONF_VARS']['EXTCONF']['crawler']['getPageTSconfigForId']
     * receive `pageId` and, by reference, `pageTSConfig`.
     *
     * @param int $id Page uid
     * @return array
     */
    public function getPageTSconfigForId($id): array
    {
        if (! $this->MP) {
            $pageTSconfig = BackendUtility::getPagesTSconfig($id);
        } else {
            // $this->MP is declared as bool but is used like a "<id>-<id>" string; the casts keep
            // explode() and getPagesTSconfig() working under strict_types, and a missing second
            // segment resolves to page id 0 instead of raising an "undefined offset" notice.
            // TODO: Check what MP really holds and adjust the property type accordingly.
            $mountPointParts = explode('-', (string) $this->MP);
            $mountPointId = (int) ($mountPointParts[1] ?? 0);
            $pageTSconfig = BackendUtility::getPagesTSconfig($mountPointId);
        }

        // Call a hook to alter configuration
        $hooks = $GLOBALS['TYPO3_CONF_VARS']['EXTCONF']['crawler']['getPageTSconfigForId'] ?? null;
        if (is_array($hooks)) {
            $params = [
                'pageId' => $id,
                'pageTSConfig' => &$pageTSconfig,
            ];
            foreach ($hooks as $userFunc) {
                GeneralUtility::callUserFunction($userFunc, $params, $this);
            }
        }

        return $pageTSconfig;
    }
531
532
    /**
     * Collects the crawler configurations that apply to a page. Adds no urls to the queue!
     *
     * Two sources are merged into one array keyed by configuration name:
     * 1. Page TSconfig `tx_crawler.crawlerCfg.paramSets.*` (origin "pagets")
     * 2. Configuration records the configuration repository returns for the page's rootline
     *    (origin "tx_crawler_configuration_<uid>"); they never replace an entry that
     *    already exists under the same name.
     *
     * Every entry holds the keys `subCfg`, `paramParsed`, `paramExpanded`, `URLs` and `origin`.
     * Finally the `processUrls` hooks receive the result by reference and may change it.
     *
     * @param int $pageId Page uid
     * @return array
     */
    public function getUrlsForPageId(int $pageId): array
    {
        // Get page TSconfig for page ID
        $pageTSconfig = $this->getPageTSconfigForId($pageId);

        $res = [];

        // Fetch Crawler Configuration from pageTSconfig
        $crawlerCfg = $pageTSconfig['tx_crawler.']['crawlerCfg.']['paramSets.'] ?? [];
        foreach ($crawlerCfg as $key => $values) {
            if (! is_array($values)) {
                continue;
            }
            $key = str_replace('.', '', (string) $key);

            // Sub configuration for a single configuration string:
            $subCfg = (array) ($crawlerCfg[$key . '.'] ?? []);
            $subCfg['key'] = $key;

            // Normalise the filter list; the cast keeps a missing value from reaching
            // a string function as null (a TypeError under strict_types).
            $procInstrFilter = (string) ($subCfg['procInstrFilter'] ?? '');
            if ($procInstrFilter !== '') {
                $subCfg['procInstrFilter'] = implode(',', GeneralUtility::trimExplode(',', $procInstrFilter));
            }

            // process configuration if it is not page-specific or if the specific page is the current page:
            // TODO: Check if $pidOnlyList can be kept as Array instead of imploded
            $pidsOnly = (string) ($subCfg['pidsOnly'] ?? '');
            $pidOnlyList = implode(',', GeneralUtility::trimExplode(',', $pidsOnly, true));
            if ($pidsOnly !== '' && ! GeneralUtility::inList($pidOnlyList, (string) $pageId)) {
                continue;
            }

            // Explode, process etc.:
            $paramParsed = GeneralUtility::explodeUrl2Array($crawlerCfg[$key] ?? '');
            $paramExpanded = $this->expandParameters($paramParsed, $pageId);

            // recognize MP value
            $baseQuery = '?id=' . $pageId;
            if ($this->MP) {
                $baseQuery .= '&MP=' . $this->MP;
            }

            $res[$key] = [
                'subCfg' => $subCfg,
                'paramParsed' => $paramParsed,
                'paramExpanded' => $paramExpanded,
                'origin' => 'pagets',
                'URLs' => $this->compileUrls($paramExpanded, [$baseQuery]),
            ];
        }

        // Get configuration from tx_crawler_configuration records up the rootline
        $crawlerConfigurations = $this->configurationRepository->getCrawlerConfigurationRecordsFromRootLine($pageId);
        foreach ($crawlerConfigurations as $configurationRecord) {
            // check access to the configuration record
            $hasAccess = empty($configurationRecord['begroups'])
                || $this->getBackendUser()->isAdmin()
                || $this->hasGroupAccess($this->getBackendUser()->user['usergroup_cached_list'], $configurationRecord['begroups']);
            if (! $hasAccess) {
                continue;
            }

            // process configuration if it is not page-specific or if the specific page is the current page:
            // TODO: Check if $pidOnlyList can be kept as Array instead of imploded
            $pidsOnly = (string) ($configurationRecord['pidsonly'] ?? '');
            $pidOnlyList = implode(',', GeneralUtility::trimExplode(',', $pidsOnly, true));
            if ($pidsOnly !== '' && ! GeneralUtility::inList($pidOnlyList, (string) $pageId)) {
                continue;
            }

            $key = $configurationRecord['name'];

            // don't overwrite previously defined paramSets
            if (isset($res[$key])) {
                continue;
            }

            /** @var TypoScriptParser $typoScriptParser */
            $typoScriptParser = GeneralUtility::makeInstance(TypoScriptParser::class);
            $typoScriptParser->parse($configurationRecord['processing_instruction_parameters_ts']);

            $subCfg = [
                'procInstrFilter' => $configurationRecord['processing_instruction_filter'],
                'procInstrParams.' => $typoScriptParser->setup,
                'baseUrl' => $configurationRecord['base_url'],
                'force_ssl' => (int) $configurationRecord['force_ssl'],
                'userGroups' => $configurationRecord['fegroups'],
                'exclude' => $configurationRecord['exclude'],
                'key' => $key,
            ];

            // Pages named in the record's exclude string get no entry for this configuration.
            if (in_array($pageId, $this->expandExcludeString($subCfg['exclude']), true)) {
                continue;
            }

            $paramParsed = GeneralUtility::explodeUrl2Array($configurationRecord['configuration']);
            $paramExpanded = $this->expandParameters($paramParsed, $pageId);
            $res[$key] = [
                'subCfg' => $subCfg,
                'paramParsed' => $paramParsed,
                'paramExpanded' => $paramExpanded,
                'URLs' => $this->compileUrls($paramExpanded, ['?id=' . $pageId]),
                'origin' => 'tx_crawler_configuration_' . $configurationRecord['uid'],
            ];
        }

        foreach ($GLOBALS['TYPO3_CONF_VARS']['EXTCONF']['crawler']['processUrls'] ?? [] as $func) {
            $params = [
                'res' => &$res,
            ];
            GeneralUtility::callUserFunction($func, $params, $this);
        }

        return $res;
    }
630
631
    /**
     * Lists the names of all crawler configurations available for a page branch.
     *
     * The result is the concatenation of
     * - the paramSets names defined in the Page TSconfig of $rootid, and
     * - the names of tx_crawler_configuration records stored on any page of the
     *   rootline of $rootid or of the page tree below it (down to $depth levels),
     *   limited to pages the backend user is allowed to see.
     * Names are not de-duplicated.
     *
     * TODO: Write Functional Tests
     *
     * @param int $rootid Uid of the page the branch starts at
     * @param int $depth Number of tree levels to descend
     * @return array
     */
    public function getConfigurationsForBranch(int $rootid, int $depth): array
    {
        $names = [];

        // 1) Names coming from Page TSconfig; only array values describe a configuration.
        $tsConfig = $this->getPageTSconfigForId($rootid);
        $paramSets = $tsConfig['tx_crawler.']['crawlerCfg.']['paramSets.'] ?? [];
        foreach ($paramSets as $setName => $setConfiguration) {
            if (is_array($setConfiguration)) {
                if (substr($setName, -1) === '.') {
                    // Strip the trailing dot of the TypoScript array key.
                    $names[] = substr($setName, 0, -1);
                } else {
                    $names[] = $setName;
                }
            }
        }

        // 2) Page ids of the rootline ...
        $pageIds = [];
        foreach (BackendUtility::BEgetRootLine($rootid) as $rootLinePage) {
            $pageIds[] = $rootLinePage['uid'];
        }

        // ... plus those of the visible subtree.
        /* @var PageTreeView $pageTree */
        $pageTree = GeneralUtility::makeInstance(PageTreeView::class);
        $permissionClause = $this->getBackendUser()->getPagePermsClause(Permission::PAGE_SHOW);
        $treeClause = '';
        if (! empty($permissionClause)) {
            $treeClause = 'AND ' . $permissionClause;
        }
        $pageTree->init($treeClause);
        $pageTree->getTree($rootid, $depth, '');
        foreach ($pageTree->tree as $treeNode) {
            $pageIds[] = $treeNode['row']['uid'];
        }

        // 3) Names of the configuration records stored on those pages.
        $queryBuilder = $this->getQueryBuilder('tx_crawler_configuration');
        $pidConstraint = $queryBuilder->expr()->in(
            'pid',
            $queryBuilder->createNamedParameter($pageIds, Connection::PARAM_INT_ARRAY)
        );
        $queryBuilder->select('name');
        $queryBuilder->from('tx_crawler_configuration');
        $queryBuilder->where($pidConstraint);
        $result = $queryBuilder->execute();

        while ($record = $result->fetch()) {
            $names[] = $record['name'];
        }

        return $names;
    }
674
675
    /**
     * Checks whether one of a user's groups grants access to an item.
     * (e.g. get the group list of the current logged in user from $GLOBALS['TSFE']->gr_list)
     *
     * @param string $groupList Comma-separated list of group UIDs the user belongs to
     * @param string $accessList Comma-separated list of group UIDs that may access the item
     * @return bool TRUE if the access list is empty or contains at least one of the user's group UIDs
     * @see \TYPO3\CMS\Frontend\Page\PageRepository::getMultipleGroupsWhereClause()
     */
    public function hasGroupAccess($groupList, $accessList)
    {
        if (! empty($accessList)) {
            $userGroupUids = GeneralUtility::intExplode(',', $groupList);
            foreach ($userGroupUids as $userGroupUid) {
                if (GeneralUtility::inList($accessList, $userGroupUid)) {
                    return true;
                }
            }

            return false;
        }

        // An empty access list means the item is not restricted at all.
        return true;
    }
696
697
    /**
     * Expands the parameter configuration into the individual values of each GET parameter.
     *
     * Syntax of a value:
     * - If the value is wrapped in [...] it is expanded according to the syntax below, otherwise it is taken literally
     * - The content of the brackets is split by "|"; the parts are processed individually and finally added together
     * - For each part:
     *         - "[int]-[int]" = Integer range, expanded to all values in between, bounds included, from low to high
     *           (max. 1000 values). Example "1-34" or "-40--30"
     *         - "_TABLE:[TCA table name];[_PID:[optional page id, default is current page]];[_ENABLELANG:1]" = Look up
     *           of table records from PID, filtering out deleted records. Example "_TABLE:tt_content; _PID:123"
     *           Further sub-parameters read by the code: _RECURSIVE, _PIDFIELD, _WHERE, _ADDTABLE and _FIELD
     *           _ENABLELANG:1 picks only original records without their language overlays
     *         - Default: Literal value
     * - After each part the hooks registered in
     *   $GLOBALS['TYPO3_CONF_VARS']['SC_OPTIONS']['crawler/class.tx_crawler_lib.php']['expandParameters'] are called.
     *
     * @param array $paramArray Array with key (GET var name) and values (value of GET var which is configuration for expansion)
     * @param integer $pid Current page ID
     * @return array The input array where every value has been replaced by the list of its expanded values
     *
     * TODO: Write Functional Tests
     */
    public function expandParameters($paramArray, $pid)
    {
        // Traverse parameter names:
        foreach ($paramArray as $p => $v) {
            // strict_types is enabled for this file, so a non-string value (e.g. an integer) has to be cast
            // before it is handed to trim().
            $v = trim((string) $v);

            // If value is encapsulated in square brackets it means there are some ranges of values to find, otherwise the value is literal
            if (strpos($v, '[') === 0 && substr($v, -1) === ']') {
                // So, find the value inside brackets and reset the paramArray value as an array.
                $v = substr($v, 1, -1);
                $paramArray[$p] = [];

                // Explode parts and traverse them:
                $parts = explode('|', $v);
                foreach ($parts as $pV) {
                    // Look for integer range: (fx. 1-34 or -40--30 // reads minus 40 to minus 30)
                    if (preg_match('/^(-?[0-9]+)\s*-\s*(-?[0-9]+)$/', trim($pV), $reg)) {
                        $reg = $this->swapIfFirstIsLargerThanSecond($reg);

                        // Traverse range, add values:
                        $runAwayBrake = 1000; // Limit to size of range!
                        for ($a = $reg[1]; $a <= $reg[2]; $a++) {
                            $paramArray[$p][] = $a;
                            $runAwayBrake--;
                            if ($runAwayBrake <= 0) {
                                break;
                            }
                        }
                    } elseif (strpos(trim($pV), '_TABLE:') === 0) {
                        // Parse parameters ("_KEY:value" pairs separated by ";"):
                        $subparts = GeneralUtility::trimExplode(';', $pV);
                        $subpartParams = [];
                        foreach ($subparts as $spV) {
                            $keyAndValue = GeneralUtility::trimExplode(':', $spV);
                            // A sub part without ":" carries no value; register its key with null
                            // instead of reading a missing array offset.
                            $subpartParams[$keyAndValue[0]] = $keyAndValue[1] ?? null;
                        }

                        $tableName = (string) ($subpartParams['_TABLE'] ?? '');

                        // Table exists:
                        if (isset($GLOBALS['TCA'][$tableName])) {
                            $lookUpPid = isset($subpartParams['_PID']) ? intval($subpartParams['_PID']) : intval($pid);
                            $recursiveDepth = isset($subpartParams['_RECURSIVE']) ? intval($subpartParams['_RECURSIVE']) : 0;
                            $pidField = isset($subpartParams['_PIDFIELD']) ? trim($subpartParams['_PIDFIELD']) : 'pid';
                            $where = $subpartParams['_WHERE'] ?? '';
                            $addTable = $subpartParams['_ADDTABLE'] ?? '';

                            // _FIELD is optional; fall back to the uid column when it is missing or empty.
                            $fieldName = ! empty($subpartParams['_FIELD']) ? $subpartParams['_FIELD'] : 'uid';
                            // Only fields that are "uid" or configured in TCA are accepted.
                            if ($fieldName === 'uid' || ! empty($GLOBALS['TCA'][$tableName]['columns'][$fieldName])) {
                                $queryBuilder = $this->getQueryBuilder($tableName);

                                if ($recursiveDepth > 0) {
                                    /** @var \TYPO3\CMS\Core\Database\QueryGenerator $queryGenerator */
                                    $queryGenerator = GeneralUtility::makeInstance(\TYPO3\CMS\Core\Database\QueryGenerator::class);
                                    $pidList = $queryGenerator->getTreeList($lookUpPid, $recursiveDepth, 0, 1);
                                    $pidArray = GeneralUtility::intExplode(',', $pidList);
                                } else {
                                    $pidArray = [(string) $lookUpPid];
                                }

                                // Only deleted records are filtered out; all other default restrictions are dropped.
                                $queryBuilder->getRestrictions()
                                    ->removeAll()
                                    ->add(GeneralUtility::makeInstance(DeletedRestriction::class));

                                $queryBuilder
                                    ->select($fieldName)
                                    ->from($tableName)
                                    ->where(
                                        $queryBuilder->expr()->in($pidField, $queryBuilder->createNamedParameter($pidArray, Connection::PARAM_INT_ARRAY)),
                                        $where
                                    );

                                if (! empty($addTable)) {
                                    // TODO: Check if this works as intended!
                                    $queryBuilder->add('from', $addTable);
                                }
                                $transOrigPointerField = $GLOBALS['TCA'][$tableName]['ctrl']['transOrigPointerField'] ?? '';

                                // _ENABLELANG restricts the result to records whose translation pointer is <= 0.
                                if (! empty($subpartParams['_ENABLELANG']) && $transOrigPointerField) {
                                    $queryBuilder->andWhere(
                                        $queryBuilder->expr()->lte(
                                            $transOrigPointerField,
                                            0
                                        )
                                    );
                                }

                                $statement = $queryBuilder->execute();

                                // Index by field value so that every value is collected only once.
                                $rows = [];
                                while ($row = $statement->fetch()) {
                                    $rows[$row[$fieldName]] = $row;
                                }

                                $paramArray[$p] = array_merge($paramArray[$p], array_keys($rows));
                            }
                        }
                    } else { // Just add value:
                        $paramArray[$p][] = $pV;
                    }
                    // Hook for processing own expandParameters place holder
                    $expandParametersHooks = $GLOBALS['TYPO3_CONF_VARS']['SC_OPTIONS']['crawler/class.tx_crawler_lib.php']['expandParameters'] ?? null;
                    if (is_array($expandParametersHooks)) {
                        $_params = [
                            'pObj' => &$this,
                            'paramArray' => &$paramArray,
                            'currentKey' => $p,
                            'currentValue' => $pV,
                            'pid' => $pid,
                        ];
                        foreach ($expandParametersHooks as $_funcRef) {
                            GeneralUtility::callUserFunction($_funcRef, $_params, $this);
                        }
                    }
                }

                // Make unique set of values and sort array by key:
                $paramArray[$p] = array_unique($paramArray[$p]);
                ksort($paramArray);
            } else {
                // Set the literal value as only value in array:
                $paramArray[$p] = [$v];
            }
        }

        return $paramArray;
    }
842
843
    /**
     * Combines a list of URLs with the expanded parameter values (output of expandParameters()).
     *
     * The parameters are consumed one at a time in array order. In every round each URL collected so far is
     * combined with each value of the current parameter, so the number of URLs is the product of the number of
     * values of all parameters. An empty value leaves the URL unchanged.
     *
     * Once more than $this->maximumUrlsToCompile URLs have been collected in a round, the values of the current
     * URL are no longer traversed; note that this check only leaves the inner loop.
     *
     * @param array $paramArray Output of expandParameters(): Array with keys (GET var names) and for each an array of values
     * @param array $urls URLs to start from
     * @return array
     */
    public function compileUrls($paramArray, array $urls)
    {
        while (! empty($paramArray)) {
            // Take the first parameter off the stack.
            reset($paramArray);
            $parameterName = key($paramArray);
            $parameterValues = array_shift($paramArray);

            $combinedUrls = [];
            foreach ($urls as $baseUrl) {
                foreach ($parameterValues as $parameterValue) {
                    $valueAsString = (string) $parameterValue;

                    $querySuffix = '';
                    if ($valueAsString !== '') {
                        $querySuffix = '&' . rawurlencode($parameterName) . '=' . rawurlencode($valueAsString);
                    }
                    $combinedUrls[] = $baseUrl . $querySuffix;

                    if (count($combinedUrls) > $this->maximumUrlsToCompile) {
                        break;
                    }
                }
            }

            // The result of this round is the input of the next parameter.
            $urls = $combinedUrls;
        }

        return $urls;
    }
874
875
    /************************************
876
     *
877
     * Crawler log
878
     *
879
     ************************************/
880
881
    /**
     * Returns the records of the crawler queue for a page, or flushes queue entries when requested.
     *
     * @param integer $id Page ID for which to look up log entries.
     * @param string $filter Filter: "all" => all entries, "pending" => all that is not yet run, "finished" => all complete ones
     * @param boolean $doFlush If TRUE, entries are DELETED(!) instead of selected and an empty array is returned
     * @param boolean $doFullFlush Only evaluated together with $doFlush: flush all entries, regardless of $filter
     * @param integer $itemsPerPage Limit the amount of entries per page, default is 10; a value of 0 or less disables the limit
     * @return array
     */
    public function getLogEntriesForPageId($id, $filter = '', $doFlush = false, $doFullFlush = false, $itemsPerPage = 10)
    {
        // Deleting queue entries must only happen on explicit request, never as a side effect of reading the log.
        if ($doFlush) {
            // NOTE(review): QueueRepository::flushQueue() is not visible from here. It is assumed to accept the
            // same filter names as documented for $filter ("all", "pending", "finished") and it is not limited
            // to the given page - confirm against the repository.
            $this->queueRepository->flushQueue($doFullFlush ? 'all' : $filter);
            return [];
        }

        $queryBuilder = GeneralUtility::makeInstance(ConnectionPool::class)->getQueryBuilderForTable($this->tableName);
        $queryBuilder
            ->select('*')
            ->from($this->tableName)
            ->where(
                $queryBuilder->expr()->eq('page_id', $queryBuilder->createNamedParameter($id, \PDO::PARAM_INT))
            )
            ->orderBy('scheduled', 'DESC');

        // Narrow the selection down the same way getLogEntriesForSetId() does; any other filter selects everything.
        switch ($filter) {
            case 'pending':
                $queryBuilder->andWhere($queryBuilder->expr()->eq('exec_time', 0));
                break;
            case 'finished':
                $queryBuilder->andWhere($queryBuilder->expr()->gt('exec_time', 0));
                break;
        }

        if ($itemsPerPage > 0) {
            $queryBuilder
                ->setMaxResults((int) $itemsPerPage);
        }

        return $queryBuilder->execute()->fetchAll();
    }
917
918
    /**
     * Returns the records of the crawler queue that belong to a set, or deletes them when $doFlush is set.
     *
     * @param int $set_id Set ID for which to look up log entries.
     * @param string $filter Filter: "pending" => all that is not yet run, "finished" => all complete ones, anything else => all entries
     * @param bool $doFlush If TRUE, the matching entries are DELETED(!) instead of selected and an empty array is returned
     * @param bool $doFullFlush Only evaluated together with $doFlush: delete every queue entry instead of the matching ones
     * @param int $itemsPerPage Limit the amount of entries per page, default is 10; a value of 0 or less disables the limit
     * @return array
     *
     * @deprecated
     */
    public function getLogEntriesForSetId(int $set_id, string $filter = '', bool $doFlush = false, bool $doFullFlush = false, int $itemsPerPage = 10)
    {
        $selectQuery = GeneralUtility::makeInstance(ConnectionPool::class)->getQueryBuilderForTable($this->tableName);
        $selectQuery
            ->select('*')
            ->from($this->tableName)
            ->where(
                $selectQuery->expr()->eq('set_id', $selectQuery->createNamedParameter($set_id, \PDO::PARAM_INT))
            )
            ->orderBy('scheduled', 'DESC');

        // The constraints for flushing are collected in a separate AND-expression, as flushQueue() expects
        // a where clause and not a query builder.
        $expressions = GeneralUtility::makeInstance(ConnectionPool::class)
            ->getConnectionForTable($this->tableName)
            ->getExpressionBuilder();
        $flushConstraints = $expressions->andX();

        if ($filter === 'pending') {
            $selectQuery->andWhere($selectQuery->expr()->eq('exec_time', 0));
            $flushConstraints->add($expressions->eq('exec_time', 0));
        } elseif ($filter === 'finished') {
            $selectQuery->andWhere($selectQuery->expr()->gt('exec_time', 0));
            $flushConstraints->add($expressions->gt('exec_time', 0));
        }

        if ($doFlush) {
            $flushWhere = $flushConstraints->add($expressions->eq('set_id', (int) $set_id));
            // An empty where clause makes flushQueue() remove all entries.
            $this->flushQueue($doFullFlush ? '' : $flushWhere);

            return [];
        }

        if ($itemsPerPage > 0) {
            $selectQuery->setMaxResults((int) $itemsPerPage);
        }

        return $selectQuery->execute()->fetchAll();
    }
970
971
    /**
     * Deletes entries from the crawler queue.
     *
     * If an observer is registered for the "queueEntryFlush" event, the event is posted once per affected
     * set_id, together with the uid/set_id rows of that set, before the entries are deleted.
     *
     * @param string $where SQL related filter for the entries which should be removed; an empty filter removes all entries
     * @return void
     *
     * @deprecated
     */
    protected function flushQueue($where = ''): void
    {
        // Without a filter everything is matched.
        $condition = (string) $where !== '' ? $where : '1=1';

        $queryBuilder = $this->getQueryBuilder($this->tableName);

        if (EventDispatcher::getInstance()->hasObserver('queueEntryFlush')) {
            $affectedSets = $queryBuilder
                ->select('DISTINCT set_id')
                ->from($this->tableName)
                ->where($condition)
                ->execute()
                ->fetchAll();

            foreach (is_array($affectedSets) ? $affectedSets : [] as $affectedSet) {
                $entriesOfSet = $queryBuilder
                    ->select('uid', 'set_id')
                    ->from($this->tableName)
                    ->where(
                        $condition,
                        $queryBuilder->expr()->eq('set_id', $affectedSet['set_id'])
                    )
                    ->execute()
                    ->fetchAll();
                EventDispatcher::getInstance()->post('queueEntryFlush', $affectedSet['set_id'], $entriesOfSet);
            }
        }

        $queryBuilder
            ->delete($this->tableName)
            ->where($condition)
            ->execute();
    }
1013
1014
    /**
     * Inserts a call back entry into the crawler queue (typically called from hooks, see indexed search class "class.crawler.php").
     *
     * The call back reference is stored in the serialized parameters under the key "_CALLBACKOBJ".
     *
     * @param integer $setId Set ID
     * @param array $params Parameters to pass to call back function; anything that is not an array is replaced by an empty array
     * @param string $callBack Call back object reference, eg. 'EXT:indexed_search/class.crawler.php:&tx_indexedsearch_crawler'
     * @param integer $page_id Page ID to attach it to
     * @param integer $schedule Time at which to activate; 0 means the current time
     */
    public function addQueueEntry_callBack($setId, $params, $callBack, $page_id = 0, $schedule = 0): void
    {
        $callBackParameters = is_array($params) ? $params : [];
        $callBackParameters['_CALLBACKOBJ'] = $callBack;

        $queueConnection = GeneralUtility::makeInstance(ConnectionPool::class)->getConnectionForTable('tx_crawler_queue');

        $scheduledAt = (int) $schedule;
        if (! $scheduledAt) {
            $scheduledAt = $this->getCurrentTime();
        }

        $queueConnection->insert(
            'tx_crawler_queue',
            [
                'page_id' => (int) $page_id,
                'parameters' => serialize($callBackParameters),
                'scheduled' => $scheduledAt,
                'exec_time' => 0,
                'set_id' => (int) $setId,
                'result_data' => '',
            ]
        );
    }
1043
1044
    /************************************
1045
     *
1046
     * URL setting
1047
     *
1048
     ************************************/
1049
1050
    /**
     * Adds a URL to the crawler queue.
     *
     * Depending on $this->registerQueueEntriesInternallyOnly the entry is either only collected in
     * $this->queueEntries or written to the table tx_crawler_queue. Before writing, an existing equal entry is
     * looked up (unless $skipInnerDuplicationCheck is set); if one is found nothing is inserted. The events
     * "urlAddedToQueue" respectively "duplicateUrlInQueue" are posted accordingly.
     *
     * @param integer $id Page ID
     * @param string $url Complete URL
     * @param array $subCfg Sub configuration array (from TS config); the keys "userGroups", "procInstrFilter",
     *                      "procInstrParams." and "key" are read and are all treated as optional
     * @param integer $tstamp Scheduled-time
     * @param string $configurationHash (optional) configuration hash
     * @param bool $skipInnerDuplicationCheck (optional) skip inner duplication check
     * @return bool TRUE only if a new row has been inserted into the queue table
     */
    public function addUrl(
        $id,
        $url,
        array $subCfg,
        $tstamp,
        $configurationHash = '',
        $skipInnerDuplicationCheck = false
    ) {
        $urlAdded = false;
        $rows = [];

        // Creating parameters:
        $parameters = [
            'url' => $url,
        ];

        // fe user group simulation:
        $uGs = implode(',', array_unique(GeneralUtility::intExplode(',', $subCfg['userGroups'] ?? '', true)));
        if ($uGs) {
            $parameters['feUserGroupList'] = $uGs;
        }

        // Setting processing instructions
        $parameters['procInstructions'] = GeneralUtility::trimExplode(',', $subCfg['procInstrFilter'] ?? '');
        if (is_array($subCfg['procInstrParams.'] ?? null)) {
            $parameters['procInstrParams'] = $subCfg['procInstrParams.'];
        }

        // Compile value array:
        $parameters_serialized = serialize($parameters);
        $fieldArray = [
            'page_id' => (int) $id,
            'parameters' => $parameters_serialized,
            'parameters_hash' => GeneralUtility::shortMD5($parameters_serialized),
            'configuration_hash' => $configurationHash,
            'scheduled' => $tstamp,
            'exec_time' => 0,
            'set_id' => (int) $this->setID,
            'result_data' => '',
            'configuration' => $subCfg['key'] ?? null,
        ];

        if ($this->registerQueueEntriesInternallyOnly) {
            //the entries will only be registered and not stored to the database
            $this->queueEntries[] = $fieldArray;
        } else {
            if (! $skipInnerDuplicationCheck) {
                // check if there is already an equal entry
                $rows = $this->getDuplicateRowsIfExist($tstamp, $fieldArray);
            }

            if (empty($rows)) {
                $connectionForCrawlerQueue = GeneralUtility::makeInstance(ConnectionPool::class)->getConnectionForTable('tx_crawler_queue');
                $connectionForCrawlerQueue->insert(
                    'tx_crawler_queue',
                    $fieldArray
                );
                $uid = $connectionForCrawlerQueue->lastInsertId('tx_crawler_queue', 'qid');
                $rows[] = $uid;
                $urlAdded = true;
                EventDispatcher::getInstance()->post('urlAddedToQueue', strval($this->setID), ['uid' => $uid, 'fieldArray' => $fieldArray]);
            } else {
                EventDispatcher::getInstance()->post('duplicateUrlInQueue', strval($this->setID), ['rows' => $rows, 'fieldArray' => $fieldArray]);
            }
        }

        return $urlAdded;
    }
1129
1130
    /**
     * Returns the current Unix timestamp as reported by time().
     *
     * @return int
     */
    public function getCurrentTime()
    {
        return time();
    }
1139
1140
    /************************************
1141
     *
1142
     * URL reading
1143
     *
1144
     ************************************/
1145
1146
    /**
     * Reads the URL of a single queue entry and stores the result in the queue record.
     *
     * The record is marked as executed (exec_time) before the queue item is handed to the queue executor. The
     * signals SIGNAL_QUEUEITEM_PREPROCESS and SIGNAL_QUEUEITEM_POSTPROCESS are emitted before the execution
     * respectively before the result is written.
     *
     * @param integer $queueId
     * @param boolean $force If set, will process even if exec_time has been set!
     * @return integer Bit mask; self::CLI_STATUS_POLLABLE_PROCESSED is set when a registered "pollSuccess"
     *                 instruction was part of the processing instructions and reported success. 0 is returned
     *                 as well if no matching queue record was found.
     */
    public function readUrl($queueId, $force = false)
    {
        $queryBuilder = GeneralUtility::makeInstance(ConnectionPool::class)->getQueryBuilderForTable($this->tableName);
        $ret = 0;
        $this->logger->debug('crawler-readurl start ' . microtime(true));
        // Get entry:
        $queryBuilder
            ->select('*')
            ->from('tx_crawler_queue')
            ->where(
                $queryBuilder->expr()->eq('qid', $queryBuilder->createNamedParameter($queueId, \PDO::PARAM_INT))
            );
        if (! $force) {
            // Only entries that have not been executed yet and are assigned to a process are picked up.
            $queryBuilder
                ->andWhere('exec_time = 0')
                ->andWhere('process_scheduled > 0');
        }
        $queueRec = $queryBuilder->execute()->fetch();

        if (! is_array($queueRec)) {
            // Nothing to process; return the (empty) status instead of null to honour the documented integer.
            return $ret;
        }

        SignalSlotUtility::emitSignal(
            self::class,
            SignalSlotUtility::SIGNAL_QUEUEITEM_PREPROCESS,
            [$queueId, &$queueRec]
        );

        // Set exec_time to lock record:
        $field_array = ['exec_time' => $this->getCurrentTime()];

        if (isset($this->processID)) {
            //if mulitprocessing is used we need to store the id of the process which has handled this entry
            $field_array['process_id_completed'] = $this->processID;
        }

        GeneralUtility::makeInstance(ConnectionPool::class)->getConnectionForTable('tx_crawler_queue')
            ->update(
                'tx_crawler_queue',
                $field_array,
                ['qid' => (int) $queueId]
            );

        $result = $this->queueExecutor->executeQueueItem($queueRec, $this);
        $content = is_array($result) ? ($result['content'] ?? null) : null;
        if ($content === null) {
            $resultData = 'An errors happened';
        } else {
            // NOTE(review): unserialize() is applied to content delivered by the crawled request. Consider
            // restricting it with ['allowed_classes' => false] if that content cannot be fully trusted.
            $resultData = unserialize($content);
        }

        //atm there's no need to point to specific pollable extensions
        $pollables = $GLOBALS['TYPO3_CONF_VARS']['EXTCONF']['crawler']['pollSuccess'] ?? null;
        // $resultData is only an array if the content could be unserialized; the error marker string above and
        // a failed unserialize() (false) must not be accessed like an array.
        $procInstructions = is_array($resultData) ? ($resultData['parameters']['procInstructions'] ?? null) : null;
        if (is_array($pollables) && is_array($procInstructions)) {
            foreach ($pollables as $pollable) {
                // only check the success value if the instruction is runnig
                // it is important to name the pollSuccess key same as the procInstructions key
                if (in_array($pollable, $procInstructions, true) && ! empty($resultData['success'][$pollable])) {
                    $ret |= self::CLI_STATUS_POLLABLE_PROCESSED;
                }
            }
        }

        // Set result in log which also denotes the end of the processing of this entry.
        $field_array = ['result_data' => serialize($result)];

        SignalSlotUtility::emitSignal(
            self::class,
            SignalSlotUtility::SIGNAL_QUEUEITEM_POSTPROCESS,
            [$queueId, &$field_array]
        );

        GeneralUtility::makeInstance(ConnectionPool::class)->getConnectionForTable('tx_crawler_queue')
            ->update(
                'tx_crawler_queue',
                $field_array,
                ['qid' => (int) $queueId]
            );

        $this->logger->debug('crawler-readurl stop ' . microtime(true));
        return $ret;
    }
1241
1242
    /**
     * Reads the URL of a log entry that has not been inserted into the queue yet.
     *
     * The entry is inserted with exec_time set to the current time, executed by the queue executor and
     * afterwards updated with the serialized result. SIGNAL_QUEUEITEM_POSTPROCESS is emitted before the update.
     *
     * @param array $field_array Queue field array,
     *
     * @return mixed The unmodified result of the queue executor. The previous annotation promised a string,
     *               but static analysis reports that an array or a boolean can be returned as well.
     */
    public function readUrlFromArray($field_array)
    {
        $queueConnection = GeneralUtility::makeInstance(ConnectionPool::class)->getConnectionForTable($this->tableName);

        // Set exec_time to lock record:
        $field_array['exec_time'] = $this->getCurrentTime();
        $queueConnection->insert($this->tableName, $field_array);

        // The executor receives the record including its freshly created queue id.
        $queueId = $queueConnection->lastInsertId($this->tableName, 'qid');
        $field_array['qid'] = $queueId;

        $result = $this->queueExecutor->executeQueueItem($field_array, $this);

        // Set result in log which also denotes the end of the processing of this entry.
        $field_array = ['result_data' => serialize($result)];

        SignalSlotUtility::emitSignal(
            self::class,
            SignalSlotUtility::SIGNAL_QUEUEITEM_POSTPROCESS,
            [$queueId, &$field_array]
        );

        $queueConnection->update($this->tableName, $field_array, ['qid' => $queueId]);

        return $result;
    }
1279
1280
    /*****************************
1281
     *
1282
     * Compiling URLs to crawl - tools
1283
     *
1284
     *****************************/
1285
1286
    /**
     * Traverses the page tree below a page and collects the output of drawURLs_addRowsForPage() for every page.
     *
     * The arguments are stored in the corresponding properties of this object first, $this->duplicateTrack and
     * $this->downloadUrls are reset. For mount point pages the mounted tree is traversed as well and $this->MP
     * is set while the mount point itself is processed, provided that mount_pid_ol is enabled.
     *
     * @param integer $id Root page id to start from.
     * @param integer $depth Depth of tree, 0=only id-page, 1= on sublevel, 99 = infinite
     * @param integer $scheduledTime Unix Time when the URL is timed to be visited when put in queue
     * @param integer $reqMinute Number of requests per minute (creates the interleave between requests)
     * @param boolean $submitCrawlUrls If set, submits the URLs to queue in database (real crawling)
     * @param boolean $downloadCrawlUrls If set (and submitcrawlUrls is false) will fill $downloadUrls with entries)
     * @param array $incomingProcInstructions Array of processing instructions
     * @param array $configurationSelection Array of configuration keys
     * @return string Concatenated output of drawURLs_addRowsForPage() for all traversed pages
     */
    public function getPageTreeAndUrls(
        $id,
        $depth,
        $scheduledTime,
        $reqMinute,
        $submitCrawlUrls,
        $downloadCrawlUrls,
        array $incomingProcInstructions,
        array $configurationSelection
    ) {
        $this->scheduledTime = $scheduledTime;
        $this->reqMinute = $reqMinute;
        $this->submitCrawlUrls = $submitCrawlUrls;
        $this->downloadCrawlUrls = $downloadCrawlUrls;
        $this->incomingProcInstructions = $incomingProcInstructions;
        $this->incomingConfigurationSelection = $configurationSelection;

        $this->duplicateTrack = [];
        $this->downloadUrls = [];

        // Drawing tree:
        /* @var PageTreeView $tree */
        $tree = GeneralUtility::makeInstance(PageTreeView::class);
        $perms_clause = $this->getBackendUser()->getPagePermsClause(Permission::PAGE_SHOW);
        $tree->init('AND ' . $perms_clause);

        $pageInfo = BackendUtility::readPageAccess($id, $perms_clause);
        if (is_array($pageInfo)) {
            // Set root row:
            $tree->tree[] = [
                'row' => $pageInfo,
                'HTML' => $this->iconFactory->getIconForRecord('pages', $pageInfo, Icon::SIZE_SMALL),
            ];
        }

        // Get branch beneath:
        if ($depth) {
            $tree->getTree($id, $depth, '');
        }

        // Traverse page tree:
        $code = '';

        foreach ($tree->tree as $data) {
            $this->MP = false;

            // recognize mount points
            // The row value is cast before the strict comparison: depending on the database driver it can be a
            // numeric string, which would never be identical to the integer doktype constant.
            if ((int) $data['row']['doktype'] === PageRepository::DOKTYPE_MOUNTPOINT) {
                $queryBuilder = GeneralUtility::makeInstance(ConnectionPool::class)->getQueryBuilderForTable('pages');
                $queryBuilder->getRestrictions()->removeAll()->add(GeneralUtility::makeInstance(DeletedRestriction::class));
                $mountpage = $queryBuilder
                    ->select('*')
                    ->from('pages')
                    ->where(
                        $queryBuilder->expr()->eq('uid', $queryBuilder->createNamedParameter($data['row']['uid'], \PDO::PARAM_INT))
                    )
                    ->execute()
                    ->fetchAll();
                $queryBuilder->resetRestrictions();

                // fetch mounted pages
                $this->MP = $mountpage[0]['mount_pid'] . '-' . $data['row']['uid'];

                $mountTree = GeneralUtility::makeInstance(PageTreeView::class);
                $mountTree->init('AND ' . $perms_clause);
                $mountTree->getTree($mountpage[0]['mount_pid'], $depth);

                foreach ($mountTree->tree as $mountData) {
                    $code .= $this->drawURLs_addRowsForPage(
                        $mountData['row'],
                        $mountData['HTML'] . BackendUtility::getRecordTitle('pages', $mountData['row'], true)
                    );
                }

                // replace page when mount_pid_ol is enabled
                if ($mountpage[0]['mount_pid_ol']) {
                    $data['row']['uid'] = $mountpage[0]['mount_pid'];
                } else {
                    // if the mount_pid_ol is not set the MP must not be used for the mountpoint page
                    $this->MP = false;
                }
            }

            $code .= $this->drawURLs_addRowsForPage(
                $data['row'],
                $data['HTML'] . BackendUtility::getRecordTitle('pages', $data['row'], true)
            );
        }

        return $code;
    }
1388
1389
    /**
     * Expands an exclude string into the list of page uids it covers.
     *
     * The exclude string is a comma separated list of parts. Each part is either
     * "<pid>" (that page only), "<pid>+<depth>" (the page plus the tree beneath it,
     * <depth> levels deep) or "<pid>+" (the page plus the tree beneath it, 99 levels deep).
     * Results are memoized in static caches, keyed by the exclude string.
     *
     * @param string $excludeString Exclude string
     * @return int[] Unique page uids (array keys are not re-indexed)
     */
    public function expandExcludeString($excludeString)
    {
        // internal static caches
        static $expandedExcludeStringCache = [];
        static $treeCache = [];

        $cacheKey = (string) $excludeString;

        // isset() rather than empty(): an empty result is a valid cache hit too
        if (isset($expandedExcludeStringCache[$cacheKey])) {
            return $expandedExcludeStringCache[$cacheKey];
        }

        $pidList = [];

        if (! empty($excludeString)) {
            /** @var PageTreeView $tree */
            $tree = GeneralUtility::makeInstance(PageTreeView::class);
            $tree->init('AND ' . $this->getBackendUser()->getPagePermsClause(Permission::PAGE_SHOW));

            foreach (GeneralUtility::trimExplode(',', $cacheKey) as $excludePart) {
                // e.g. a trailing comma produces an empty part; it names no page
                if ($excludePart === '') {
                    continue;
                }

                // A part without "+" yields a single element, so read the depth defensively
                $parts = GeneralUtility::trimExplode('+', $excludePart);
                $pid = (int) $parts[0];
                $depth = (int) ($parts[1] ?? 0);

                // default is "page only" = "depth=0"; a "+" without a usable depth means 99 levels
                if ($depth === 0 && strpos($excludePart, '+') !== false) {
                    $depth = 99;
                }

                // Stored as int so that strict comparisons against page uids can match
                $pidList[] = $pid;

                if ($depth > 0) {
                    if (! isset($treeCache[$pid][$depth])) {
                        $tree->reset();
                        $tree->getTree($pid, $depth);
                        $treeCache[$pid][$depth] = $tree->tree;
                    }

                    foreach ($treeCache[$pid][$depth] as $data) {
                        $pidList[] = (int) $data['row']['uid'];
                    }
                }
            }
        }

        $expandedExcludeStringCache[$cacheKey] = array_unique($pidList);

        return $expandedExcludeStringCache[$cacheKey];
    }
1440
1441
    /**
     * Create the rows for display of the page tree
     * For each page a number of rows are shown displaying GET variable configuration
     *
     * @param array $pageRow Page record; its `uid` is read here and the whole row is handed on to
     *                       getUrlsForPageRow() and urlListFromUrlArray()
     * @param string $pageTitle Ready-made HTML for the title cell (inserted as is, not escaped here)
     * @return string HTML table rows for this page
     */
    public function drawURLs_addRowsForPage(array $pageRow, string $pageTitle): string
    {
        $skipMessage = '';

        // Get list of configurations
        // NOTE(review): $skipMessage is presumably filled by reference in getUrlsForPageRow() - confirm
        $configurations = $this->getUrlsForPageRow($pageRow, $skipMessage);

        if (! empty($this->incomingConfigurationSelection)) {
            // remove configuration that does not match the current selection
            foreach ($configurations as $confKey => $confArray) {
                if (! in_array($confKey, $this->incomingConfigurationSelection, true)) {
                    unset($configurations[$confKey]);
                }
            }
        }

        // Traverse parameter combinations:
        $c = 0;
        $content = '';
        if (! empty($configurations)) {
            foreach ($configurations as $confKey => $confArray) {

                // Title column: only the first row carries it, spanning all configuration rows
                if (! $c) {
                    $titleClm = '<td rowspan="' . count($configurations) . '">' . $pageTitle . '</td>';
                } else {
                    $titleClm = '';
                }

                // Optional sub configuration keys may be absent, so read them defensively
                $subCfg = $confArray['subCfg'] ?? [];

                // The exclude list is derived from a user supplied string and may contain numeric
                // strings; normalise both sides to int so the strict comparison can actually match.
                $excludedPageIds = array_map('intval', $this->expandExcludeString($subCfg['exclude'] ?? ''));

                if (! in_array((int) $pageRow['uid'], $excludedPageIds, true)) {

                    // URL list:
                    $urlList = $this->urlListFromUrlArray(
                        $confArray,
                        $pageRow,
                        $this->scheduledTime,
                        $this->reqMinute,
                        $this->submitCrawlUrls,
                        $this->downloadCrawlUrls,
                        $this->duplicateTrack,
                        $this->downloadUrls,
                        $this->incomingProcInstructions // if empty the urls won't be filtered by processing instructions
                    );

                    // Expanded parameters:
                    $paramExpanded = '';
                    $calcAccu = [];
                    $calcRes = 1;
                    foreach ($confArray['paramExpanded'] as $gVar => $gVal) {
                        $paramExpanded .= '
                            <tr>
                                <td>' . htmlspecialchars('&' . $gVar . '=') . '<br/>' .
                            '(' . count($gVal) . ')' .
                            '</td>
                                <td nowrap="nowrap">' . nl2br(htmlspecialchars(implode(chr(10), $gVal))) . '</td>
                            </tr>
                        ';
                        $calcRes *= count($gVal);
                        $calcAccu[] = count($gVal);
                    }
                    $paramExpanded = '<table>' . $paramExpanded . '</table>';
                    $paramExpanded .= 'Comb: ' . implode('*', $calcAccu) . '=' . $calcRes;

                    // Options (escaped, as they are configuration values rendered into HTML)
                    $optionValues = '';
                    if (! empty($subCfg['userGroups'])) {
                        $optionValues .= 'User Groups: ' . htmlspecialchars((string) $subCfg['userGroups']) . '<br/>';
                    }
                    if (! empty($subCfg['procInstrFilter'])) {
                        $optionValues .= 'ProcInstr: ' . htmlspecialchars((string) $subCfg['procInstrFilter']) . '<br/>';
                    }

                    // Compile row:
                    // $confKey is cast because numeric array keys arrive as int under strict_types
                    $content .= '
                        <tr>
                            ' . $titleClm . '
                            <td>' . htmlspecialchars((string) $confKey) . '</td>
                            <td>' . nl2br(htmlspecialchars(rawurldecode(trim(str_replace('&', chr(10) . '&', GeneralUtility::implodeArrayForUrl('', $confArray['paramParsed'])))))) . '</td>
                            <td>' . $paramExpanded . '</td>
                            <td nowrap="nowrap">' . $urlList . '</td>
                            <td nowrap="nowrap">' . $optionValues . '</td>
                            <td nowrap="nowrap">' . DebugUtility::viewArray($subCfg['procInstrParams.'] ?? null) . '</td>
                        </tr>';
                } else {
                    $content .= '<tr>
                            ' . $titleClm . '
                            <td>' . htmlspecialchars((string) $confKey) . '</td>
                            <td colspan="5"><em>No entries</em> (Page is excluded in this configuration)</td>
                        </tr>';
                }

                $c++;
            }
        } else {
            $message = ! empty($skipMessage) ? ' (' . $skipMessage . ')' : '';

            // Compile row:
            $content .= '
                <tr>
                    <td>' . $pageTitle . '</td>
                    <td colspan="6"><em>No entries</em>' . $message . '</td>
                </tr>';
        }

        return $content;
    }
1551
1552
    /*****************************
1553
     *
1554
     * CLI functions
1555
     *
1556
     *****************************/
1557
1558
    /**
     * Running the functionality of the CLI (crawling URLs from queue)
     *
     * @param int $countInARun Handed to fetchRecordsToBeCrawled() when selecting queue entries
     * @param int $sleepTime Pause after each processed entry, in microseconds (usleep)
     * @param int $sleepAfterFinish Pause after the processing loop, in seconds (sleep)
     * @return int Bit mask combining the readUrl() results with the CLI_STATUS_* flags set here
     */
    public function CLI_run(int $countInARun, int $sleepTime, int $sleepAfterFinish): int
    {
        $status = 0;
        $processedEntries = 0;

        // Hooks first, then tidy the queue before picking entries from it
        $this->CLI_runHooks();
        $this->queueRepository->cleanupQueue();

        $queueRows = $this->queueRepository->fetchRecordsToBeCrawled($countInARun);

        if (empty($queueRows)) {
            $this->CLI_debug('Nothing within queue which needs to be processed (' . $this->CLI_buildProcessId() . ')');

            return $status;
        }

        $queueIds = [];
        foreach ($queueRows as $queueRow) {
            $queueIds[] = $queueRow['qid'];
        }

        $processId = $this->CLI_buildProcessId();

        // Remember how many entries were assigned, to determine later how many have been processed
        $assignedEntries = $this->queueRepository->updateProcessIdAndSchedulerForQueueIds($queueIds, $processId);
        $this->processRepository->updateProcessAssignItemsCount($assignedEntries, $processId);

        // Fewer rows updated than selected: treated as a collision with another process
        if ($assignedEntries !== count($queueIds)) {
            $this->CLI_debug('Nothing processed due to multi-process collision (' . $this->CLI_buildProcessId() . ')');

            return $status | self::CLI_STATUS_ABORTED;
        }

        foreach ($queueRows as $queueRow) {
            $status |= $this->readUrl($queueRow['qid']);
            $processedEntries++;

            // Just to relax the system
            usleep($sleepTime);

            // If the cli has been disabled since the start, return from the function
            // and do NOT mark the process as ended.
            if ($this->getDisabled()) {
                return $status | self::CLI_STATUS_ABORTED;
            }

            if (! $this->processRepository->isProcessActive($this->CLI_buildProcessId())) {
                $this->CLI_debug('conflict / timeout (' . $this->CLI_buildProcessId() . ')');

                //TODO might need an additional returncode
                $status |= self::CLI_STATUS_ABORTED;
                // possible timeout
                break;
            }
        }

        sleep($sleepAfterFinish);

        $this->CLI_debug('Rows: ' . $processedEntries . ' (' . $this->CLI_buildProcessId() . ')');

        if ($processedEntries > 0) {
            $status |= self::CLI_STATUS_PROCESSED;
        }

        return $status;
    }
1628
1629
    /**
     * Activate hooks
     *
     * Instantiates every class registered under
     * $GLOBALS['TYPO3_CONF_VARS']['EXTCONF']['crawler']['cli_hooks'] and calls
     * crawler_init() on it with this controller as argument.
     */
    public function CLI_runHooks(): void
    {
        $registeredHooks = $GLOBALS['TYPO3_CONF_VARS']['EXTCONF']['crawler']['cli_hooks'] ?? [];

        foreach ($registeredHooks as $hookClassName) {
            $hook = GeneralUtility::makeInstance($hookClassName);
            if (! is_object($hook)) {
                continue;
            }

            $hook->crawler_init($this);
        }
    }
1641
1642
    /**
     * Try to acquire a new process with the given id
     * also performs some auto-cleanup for orphan processes
     *
     * Active, non-deleted processes whose ttl lies before the current time are collected as
     * orphans and released at the end; all others count against the `processLimit` setting.
     *
     * @param string $id identification string for the process
     * @return boolean true if a process record was inserted, false if the limit is reached
     *                 or no system process id could be determined
     * @todo preemption might not be the most elegant way to clean up
     */
    public function CLI_checkAndAcquireNewProcess($id)
    {
        $queryBuilder = GeneralUtility::makeInstance(ConnectionPool::class)->getQueryBuilderForTable($this->tableName);

        $systemProcessId = getmypid();
        if ($systemProcessId < 1) {
            return false;
        }

        $activeProcesses = $queryBuilder
            ->select('process_id', 'ttl')
            ->from('tx_crawler_process')
            ->where(
                'active = 1 AND deleted = 0'
            )
            ->execute();

        $currentTime = $this->getCurrentTime();

        $runningProcesses = 0;
        $expiredProcessIds = [];
        while ($process = $activeProcesses->fetch()) {
            if ($process['ttl'] < $currentTime) {
                $expiredProcessIds[] = $process['process_id'];
                continue;
            }

            $runningProcesses++;
        }

        $processLimit = (int) $this->extensionSettings['processLimit'];

        // if there are less than allowed active processes then add a new one
        $acquired = $runningProcesses < $processLimit;

        if ($acquired) {
            $this->CLI_debug('add process ' . $this->CLI_buildProcessId() . ' (' . ($runningProcesses + 1) . '/' . $processLimit . ')');

            $processRecord = [
                'process_id' => $id,
                'active' => 1,
                'ttl' => $currentTime + (int) $this->extensionSettings['processMaxRunTime'],
                'system_process_id' => $systemProcessId,
            ];
            GeneralUtility::makeInstance(ConnectionPool::class)
                ->getConnectionForTable('tx_crawler_process')
                ->insert('tx_crawler_process', $processRecord);
        } else {
            $this->CLI_debug('Processlimit reached (' . ($runningProcesses) . '/' . $processLimit . ')');
        }

        // Clean-up runs regardless of whether a process was acquired
        $this->processRepository->deleteProcessesMarkedAsDeleted();
        $this->CLI_releaseProcesses($expiredProcessIds);

        return $acquired;
    }
1703
1704
    /**
     * Release a process and the required resources
     *
     * @param mixed $releaseIds string with a single process-id or array with multiple process-ids
     * @return boolean false if there was nothing to release, true otherwise
     */
    public function CLI_releaseProcesses($releaseIds)
    {
        $queryBuilder = GeneralUtility::makeInstance(ConnectionPool::class)->getQueryBuilderForTable($this->tableName);

        $processIds = is_array($releaseIds) ? $releaseIds : [$releaseIds];

        if (empty($processIds)) {
            // nothing to release
            return false;
        }

        // some kind of 2nd chance algo - this way you need at least 2 processes to have a real cleanup
        // this ensures that a single process can't mess up the entire process table

        // Step 1: detach queue entries that still point at a process which is no longer active
        $inactiveProcessIds = 'q.process_id IN(SELECT p.process_id FROM tx_crawler_process as p WHERE p.active = 0)';
        $queryBuilder
            ->update($this->tableName, 'q')
            ->where($inactiveProcessIds)
            ->set('q.process_scheduled', 0)
            ->set('q.process_id', '')
            ->execute();

        // FIXME: Not entirely sure that this is equivalent to the previous version
        // The same builder is reused below, so the SET part of step 1 has to be dropped first
        $queryBuilder->resetQueryPart('set');

        // Step 2: clear the system process id of inactive processes that are still referenced
        // by queue entries with exec_time = 0
        $queryBuilder
            ->update('tx_crawler_process')
            ->where(
                $queryBuilder->expr()->eq('active', 0),
                'process_id IN(SELECT q.process_id FROM tx_crawler_queue as q WHERE q.exec_time = 0)'
            )
            ->set('system_process_id', 0)
            ->execute();

        // Step 3: deactivate the requested processes and free their queue entries
        $this->processRepository->markRequestedProcessesAsNotActive($processIds);
        $this->queueRepository->unsetProcessScheduledAndProcessIdForQueueEntries($processIds);

        return true;
    }
1753
1754
    /**
     * Create a unique Id for the current process
     *
     * The id is generated on first use from the current microtime and then kept in
     * $this->processID, so later calls return the same value.
     *
     * @return string  the ID
     */
    public function CLI_buildProcessId()
    {
        if ($this->processID) {
            return $this->processID;
        }

        $this->processID = GeneralUtility::shortMD5(microtime(true));

        return $this->processID;
    }
1766
1767
    /**
     * Prints a message to the stdout (only if debug-mode is enabled)
     *
     * Debug-mode is the `processDebug` extension setting; the output is flushed right away.
     *
     * @param string $msg the message
     */
    public function CLI_debug($msg): void
    {
        $debugEnabled = (int) $this->extensionSettings['processDebug'];
        if (! $debugEnabled) {
            return;
        }

        echo $msg . "\n";
        flush();
    }
1779
1780
    /**
     * Cleans up entries that stayed for too long in the queue. These are:
     * - processed entries (exec_time set) older than the `cleanUpProcessedAge` setting, in days
     * - entries scheduled at or before the cut-off given by the `cleanUpScheduledAge` setting, in days
     *
     * @deprecated
     */
    public function cleanUpOldQueueEntries(): void
    {
        // 24*60*60 Seconds in 24 hours
        $secondsPerDay = 86400;

        $maxProcessedAge = $this->extensionSettings['cleanUpProcessedAge'] * $secondsPerDay;
        $maxScheduledAge = $this->extensionSettings['cleanUpScheduledAge'] * $secondsPerDay;

        $now = time();
        $processedBefore = $now - $maxProcessedAge;
        $scheduledUntil = $now - $maxScheduledAge;

        $this->flushQueue(
            '(exec_time<>0 AND exec_time<' . $processedBefore . ') OR scheduled<=' . $scheduledUntil
        );
    }
1796
1797
    /**
     * This method determines duplicates for a queue entry with the same parameters and this timestamp.
     * If the timestamp is not in the future, any not yet executed, unassigned entry scheduled up to now
     * counts (with `enableTimeslot` additionally anything scheduled within +/- 100 seconds of now).
     * If the timestamp is in the future, the queued entry must have exactly the same timestamp.
     *
     * @param int $tstamp
     * @param array $fieldArray needs the keys `page_id` and `parameters_hash`
     *
     * @return array qids of the matching queue entries
     */
    protected function getDuplicateRowsIfExist($tstamp, $fieldArray)
    {
        $currentTime = $this->getCurrentTime();

        $queryBuilder = GeneralUtility::makeInstance(ConnectionPool::class)->getQueryBuilderForTable($this->tableName);
        $queryBuilder
            ->select('qid')
            ->from('tx_crawler_queue');

        if ($tstamp <= $currentTime) {
            // entry is scheduled with "now" (or earlier)
            $scheduledUntilNow = $queryBuilder->expr()->lte('scheduled', $currentTime);

            if ($this->extensionSettings['enableTimeslot']) {
                $windowStart = $currentTime - 100;
                $windowEnd = $currentTime + 100;
                $queryBuilder
                    ->where('scheduled BETWEEN ' . $windowStart . ' AND ' . $windowEnd . '')
                    ->orWhere($scheduledUntilNow);
            } else {
                $queryBuilder->where($scheduledUntilNow);
            }
        } elseif ($tstamp > $currentTime) {
            // entries with a timestamp in the future need to have the same schedule time
            $queryBuilder->where(
                $queryBuilder->expr()->eq('scheduled', $tstamp)
            );
        }

        // Named parameters are created in the same order as before: page_id, then parameters_hash
        $samePage = $queryBuilder->expr()->eq(
            'page_id',
            $queryBuilder->createNamedParameter($fieldArray['page_id'], \PDO::PARAM_INT)
        );
        $sameParameters = $queryBuilder->expr()->eq(
            'parameters_hash',
            $queryBuilder->createNamedParameter($fieldArray['parameters_hash'], \PDO::PARAM_STR)
        );

        $queryBuilder
            ->andWhere('NOT exec_time')
            ->andWhere('NOT process_id')
            ->andWhere($samePage)
            ->andWhere($sameParameters);

        $statement = $queryBuilder->execute();

        $duplicateQueueIds = [];
        while ($queueRow = $statement->fetch()) {
            $duplicateQueueIds[] = $queueRow['qid'];
        }

        return $duplicateQueueIds;
    }
1857
1858
    /**
     * Returns a md5 hash generated from a serialized configuration array.
     *
     * The keys `paramExpanded` and `URLs` are left out of the hash.
     *
     * @return string
     */
    protected function getConfigurationHash(array $configuration)
    {
        unset($configuration['paramExpanded'], $configuration['URLs']);

        $serialized = serialize($configuration);

        return md5($serialized);
    }
1869
1870
    /**
     * Build a URL from a Page and the Query String. If the page has a Site configuration, it can be built by using
     * the Site instance.
     *
     * With a Site, the `id` query parameter is dropped, `L` is passed to the router as `_language`, and host,
     * scheme, port and user info of a non-empty $alternativeBaseUrl replace those of the generated URL.
     * Without a Site, an index.php URL with an appended cHash is built on $alternativeBaseUrl or TYPO3_SITE_URL.
     *
     * @param int $pageId Page to build the URL for
     * @param string $queryString Query string; leading "?" and "&" are stripped when a Site is found
     * @param string|null $alternativeBaseUrl Optional base URL overriding the generated one
     * @param int $httpsOrHttp see tx_crawler_configuration.force_ssl (-1 forces http, 1 forces https,
     *                         any other value leaves the scheme untouched)
     * @throws \TYPO3\CMS\Core\Exception\SiteNotFoundException
     * @throws \TYPO3\CMS\Core\Routing\InvalidRouteArgumentsException
     */
    protected function getUrlFromPageAndQueryParameters(int $pageId, string $queryString, ?string $alternativeBaseUrl, int $httpsOrHttp): UriInterface
    {
        $site = GeneralUtility::makeInstance(SiteMatcher::class)->matchByPageId($pageId);
        if ($site instanceof Site) {
            $queryString = ltrim($queryString, '?&');
            $queryParts = [];
            parse_str($queryString, $queryParts);
            unset($queryParts['id']);
            // workaround as long as we don't have native language support in crawler configurations
            if (isset($queryParts['L'])) {
                $queryParts['_language'] = $queryParts['L'];
                unset($queryParts['L']);
                // The result was never used, so it is no longer stored.
                // NOTE(review): the lookup itself is kept because it presumably rejects language ids
                // unknown to the site - confirm, and drop it if the router already validates this.
                $site->getLanguageById((int) $queryParts['_language']);
            }
            $url = $site->getRouter()->generateUri($pageId, $queryParts);
            if (! empty($alternativeBaseUrl)) {
                $alternativeBaseUrl = new Uri($alternativeBaseUrl);
                $url = $url->withHost($alternativeBaseUrl->getHost());
                $url = $url->withScheme($alternativeBaseUrl->getScheme());
                $url = $url->withPort($alternativeBaseUrl->getPort());
                if ($userInfo = $alternativeBaseUrl->getUserInfo()) {
                    $url = $url->withUserInfo($userInfo);
                }
            }
        } else {
            // Technically this is not possible with site handling, but kept for backwards-compatibility reasons
            // Once EXT:crawler is v10-only compatible, this should be removed completely
            $baseUrl = ($alternativeBaseUrl ?: GeneralUtility::getIndpEnv('TYPO3_SITE_URL'));
            $cacheHashCalculator = GeneralUtility::makeInstance(CacheHashCalculator::class);
            $queryString .= '&cHash=' . $cacheHashCalculator->generateForParameters($queryString);
            $url = rtrim($baseUrl, '/') . '/index.php' . $queryString;
            $url = new Uri($url);
        }

        if ($httpsOrHttp === -1) {
            $url = $url->withScheme('http');
        } elseif ($httpsOrHttp === 1) {
            $url = $url->withScheme('https');
        }

        return $url;
    }
1922
1923 1
    /**
     * Makes sure the value at index 1 is not larger than the value at index 2.
     *
     * @param array $reg Array whose entries at index 1 and 2 are compared
     * @return array The array with both entries exchanged if the first was larger, unchanged otherwise
     */
    protected function swapIfFirstIsLargerThanSecond(array $reg): array
    {
        if ($reg[1] <= $reg[2]) {
            return $reg;
        }

        // First is larger than last: exchange both entries
        $smaller = $reg[2];
        $larger = $reg[1];
        $reg[2] = $larger;
        $reg[1] = $smaller;

        return $reg;
    }
1934
1935
    /**
     * Returns the backend user, taking it from $GLOBALS['BE_USER'] on first use.
     *
     * @return BackendUserAuthentication
     */
    private function getBackendUser()
    {
        // Make sure the _cli_ user is loaded
        Bootstrap::initializeBackendAuthentication();

        if ($this->backendUser !== null) {
            return $this->backendUser;
        }

        $this->backendUser = $GLOBALS['BE_USER'];

        return $this->backendUser;
    }
1947
1948
    /**
     * Get querybuilder for given table
     *
     * @param string $table Table name handed to the connection pool
     * @return \TYPO3\CMS\Core\Database\Query\QueryBuilder
     */
    private function getQueryBuilder(string $table)
    {
        $connectionPool = GeneralUtility::makeInstance(ConnectionPool::class);

        return $connectionPool->getQueryBuilderForTable($table);
    }
1957
}
1958