Passed
Push — issue/729 ( 98ba2c...3e0346 )
by Tomas Norre
87:46 queued 72:59
created

Classes/Controller/CrawlerController.php (17 issues)

1
<?php
2
3
declare(strict_types=1);
4
5
namespace AOE\Crawler\Controller;
6
7
/***************************************************************
8
 *  Copyright notice
9
 *
10
 *  (c) 2020 AOE GmbH <[email protected]>
11
 *
12
 *  All rights reserved
13
 *
14
 *  This script is part of the TYPO3 project. The TYPO3 project is
15
 *  free software; you can redistribute it and/or modify
16
 *  it under the terms of the GNU General Public License as published by
17
 *  the Free Software Foundation; either version 3 of the License, or
18
 *  (at your option) any later version.
19
 *
20
 *  The GNU General Public License can be found at
21
 *  http://www.gnu.org/copyleft/gpl.html.
22
 *
23
 *  This script is distributed in the hope that it will be useful,
24
 *  but WITHOUT ANY WARRANTY; without even the implied warranty of
25
 *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
26
 *  GNU General Public License for more details.
27
 *
28
 *  This copyright notice MUST APPEAR in all copies of the script!
29
 ***************************************************************/
30
31
use AOE\Crawler\Configuration\ExtensionConfigurationProvider;
32
use AOE\Crawler\Converter\JsonCompatibilityConverter;
33
use AOE\Crawler\Crawler;
34
use AOE\Crawler\CrawlStrategy\CrawlStrategyFactory;
35
use AOE\Crawler\Domain\Model\Process;
36
use AOE\Crawler\Domain\Repository\ConfigurationRepository;
37
use AOE\Crawler\Domain\Repository\ProcessRepository;
38
use AOE\Crawler\Domain\Repository\QueueRepository;
39
use AOE\Crawler\QueueExecutor;
40
use AOE\Crawler\Service\ConfigurationService;
41
use AOE\Crawler\Service\UrlService;
42
use AOE\Crawler\Service\UserService;
43
use AOE\Crawler\Utility\SignalSlotUtility;
44
use AOE\Crawler\Value\QueueFilter;
45
use PDO;
46
use Psr\Http\Message\UriInterface;
47
use Psr\Log\LoggerAwareInterface;
48
use Psr\Log\LoggerAwareTrait;
49
use TYPO3\CMS\Backend\Tree\View\PageTreeView;
50
use TYPO3\CMS\Backend\Utility\BackendUtility;
51
use TYPO3\CMS\Core\Authentication\BackendUserAuthentication;
52
use TYPO3\CMS\Core\Compatibility\PublicMethodDeprecationTrait;
53
use TYPO3\CMS\Core\Compatibility\PublicPropertyDeprecationTrait;
54
use TYPO3\CMS\Core\Core\Bootstrap;
55
use TYPO3\CMS\Core\Core\Environment;
56
use TYPO3\CMS\Core\Database\Connection;
57
use TYPO3\CMS\Core\Database\ConnectionPool;
58
use TYPO3\CMS\Core\Database\Query\QueryBuilder;
59
use TYPO3\CMS\Core\Database\Query\Restriction\DeletedRestriction;
60
use TYPO3\CMS\Core\Database\QueryGenerator;
61
use TYPO3\CMS\Core\Domain\Repository\PageRepository;
62
use TYPO3\CMS\Core\Exception\SiteNotFoundException;
63
use TYPO3\CMS\Core\Imaging\Icon;
64
use TYPO3\CMS\Core\Imaging\IconFactory;
65
use TYPO3\CMS\Core\Routing\InvalidRouteArgumentsException;
66
use TYPO3\CMS\Core\Site\Entity\Site;
67
use TYPO3\CMS\Core\Type\Bitmask\Permission;
68
use TYPO3\CMS\Core\TypoScript\Parser\TypoScriptParser;
69
use TYPO3\CMS\Core\Utility\DebugUtility;
70
use TYPO3\CMS\Core\Utility\GeneralUtility;
71
use TYPO3\CMS\Core\Utility\MathUtility;
72
use TYPO3\CMS\Extbase\Object\ObjectManager;
73
74
/**
75
 * Class CrawlerController
76
 *
77
 * @package AOE\Crawler\Controller
78
 */
79
class CrawlerController implements LoggerAwareInterface
80
{
81
    use LoggerAwareTrait;
82
    use PublicMethodDeprecationTrait;
83
    use PublicPropertyDeprecationTrait;
84
85
    public const CLI_STATUS_NOTHING_PROCCESSED = 0;
86
87
    //queue not empty
88
    public const CLI_STATUS_REMAIN = 1;
89
90
    //(some) queue items were processed
91
    public const CLI_STATUS_PROCESSED = 2;
92
93
    //instance didn't finish
94
    public const CLI_STATUS_ABORTED = 4;
95
96
    public const CLI_STATUS_POLLABLE_PROCESSED = 8;
97
98
    /**
99
     * @var integer
100
     */
101
    public $setID = 0;
102
103
    /**
104
     * @var string
105
     */
106
    public $processID = '';
107
108
    /**
109
     * @var array
110
     */
111
    public $duplicateTrack = [];
112
113
    /**
114
     * @var array
115
     */
116
    public $downloadUrls = [];
117
118
    /**
119
     * @var array
120
     */
121
    public $incomingProcInstructions = [];
122
123
    /**
124
     * @var array
125
     */
126
    public $incomingConfigurationSelection = [];
127
128
    /**
129
     * @var bool
130
     */
131
    public $registerQueueEntriesInternallyOnly = false;
132
133
    /**
134
     * @var array
135
     */
136
    public $queueEntries = [];
137
138
    /**
139
     * @var array
140
     */
141
    public $urlList = [];
142
143
    /**
144
     * @var array
145
     */
146
    public $extensionSettings = [];
147
148
    /**
149
     * Mount Point
150
     *
151
     * @var bool
152
     * Todo: Check what this is used for and adjust the type hint or code, as bool doesn't match the current code.
153
     */
154
    public $MP = false;
155
156
    /**
157
     * @var string
158
     * @deprecated
159
     */
160
    protected $processFilename;
161
162
    /**
163
     * Holds the internal access mode; can be 'gui', 'cli' or 'cli_im'
164
     *
165
     * @var string
166
     * @deprecated
167
     */
168
    protected $accessMode;
169
170
    /**
171
     * @var QueueRepository
172
     */
173
    protected $queueRepository;
174
175
    /**
176
     * @var ProcessRepository
177
     */
178
    protected $processRepository;
179
180
    /**
181
     * @var ConfigurationRepository
182
     */
183
    protected $configurationRepository;
184
185
    /**
186
     * @var string
187
     * @deprecated Since v9.2.5 - This will be removed in v10
188
     */
189
    protected $tableName = 'tx_crawler_queue';
190
191
    /**
192
     * @var QueueExecutor
193
     */
194
    protected $queueExecutor;
195
196
    /**
197
     * @var int
198
     */
199
    protected $maximumUrlsToCompile = 10000;
200
201
    /**
202
     * @var IconFactory
203
     */
204
    protected $iconFactory;
205
206
    /**
     * Deprecation messages keyed by the name of the deprecated public method.
     * Presumably consumed by PublicMethodDeprecationTrait (used by this class) - confirm against the trait.
     *
     * @var string[]
     */
    private $deprecatedPublicMethods = [
        'cleanUpOldQueueEntries' => 'Using CrawlerController::cleanUpOldQueueEntries() is deprecated since 9.0.1 and will be removed in v11.x, please use QueueRepository->cleanUpOldQueueEntries() instead.',
        'CLI_debug' => 'Using CrawlerController->CLI_debug() is deprecated since 9.1.3 and will be removed in v11.x',
        'CLI_releaseProcesses' => 'Using CrawlerController->CLI_releaseProcesses() is deprecated since 9.2.2 and will be removed in v11.x',
        'CLI_runHooks' => 'Using CrawlerController->CLI_runHooks() is deprecated since 9.1.5 and will be removed in v11.x',
        'getAccessMode' => 'Using CrawlerController->getAccessMode() is deprecated since 9.1.3 and will be removed in v11.x',
        'getLogEntriesForPageId' => 'Using CrawlerController->getLogEntriesForPageId() is deprecated since 9.1.5 and will be removed in v11.x',
        'getLogEntriesForSetId' => 'Using CrawlerController::getLogEntriesForSetId() is deprecated since 9.0.1 and will be removed in v11.x',
        // Message previously named getLogEntriesForPageId() by mistake.
        'hasGroupAccess' => 'Using CrawlerController->hasGroupAccess() is deprecated since 9.2.2 and will be removed in v11.x, please use UserService::hasGroupAccess() instead.',
        'flushQueue' => 'Using CrawlerController::flushQueue() is deprecated since 9.0.1 and will be removed in v11.x, please use QueueRepository->flushQueue() instead.',
        'setAccessMode' => 'Using CrawlerController->setAccessMode() is deprecated since 9.1.3 and will be removed in v11.x',
        'getDisabled' => 'Using CrawlerController->getDisabled() is deprecated since 9.1.3 and will be removed in v11.x, please use Crawler->isDisabled() instead',
        'setDisabled' => 'Using CrawlerController->setDisabled() is deprecated since 9.1.3 and will be removed in v11.x, please use Crawler->setDisabled() instead',
        'getProcessFilename' => 'Using CrawlerController->getProcessFilename() is deprecated since 9.1.3 and will be removed in v11.x',
        'setProcessFilename' => 'Using CrawlerController->setProcessFilename() is deprecated since 9.1.3 and will be removed in v11.x',
        'getDuplicateRowsIfExist' => 'Using CrawlerController->getDuplicateRowsIfExist() is deprecated since 9.1.4 and will be removed in v11.x, please use QueueRepository->getDuplicateQueueItemsIfExists() instead',
    ];
226
227
    /**
     * Deprecation messages keyed by the name of the deprecated (formerly public) property.
     * Presumably consumed by PublicPropertyDeprecationTrait (used by this class) - confirm against the trait.
     *
     * @var string[]
     */
    private $deprecatedPublicProperties = [
        'accessMode' => 'Using CrawlerController->accessMode is deprecated since 9.1.3 and will be removed in v11.x',
        // Message previously named ->accessMode by mistake.
        'processFilename' => 'Using CrawlerController->processFilename is deprecated since 9.1.3 and will be removed in v11.x',
    ];
234
235
    /**
236
     * @var BackendUserAuthentication|null
237
     */
238
    private $backendUser;
239
240
    /**
241
     * @var integer
242
     */
243
    private $scheduledTime = 0;
244
245
    /**
246
     * @var integer
247
     */
248
    private $reqMinute = 0;
249
250
    /**
251
     * @var bool
252
     */
253
    private $submitCrawlUrls = false;
254
255
    /**
256
     * @var bool
257
     */
258
    private $downloadCrawlUrls = false;
259
260
    /**
261
     * @var PageRepository
262
     */
263
    private $pageRepository;
264
265
    /**
266
     * @var Crawler
267
     */
268
    private $crawler;
269
270
    /************************************
271
     *
272
     * Getting URLs based on Page TSconfig
273
     *
274
     ************************************/
275
276
    /**
     * Instantiates the repositories and services used by the controller and
     * normalises the extension settings that are read here.
     *
     * Normalisation applied:
     * - "countInARun" falls back to 100 when it is missing or not a positive integer
     * - "processLimit" is forced into the range 1..99 (default 1)
     * - "maxCompileUrls" is forced into the range 1..1000000000 (default 10000)
     */
    public function __construct()
    {
        $objectManager = GeneralUtility::makeInstance(ObjectManager::class);
        $crawlStrategyFactory = GeneralUtility::makeInstance(CrawlStrategyFactory::class);
        $this->queueRepository = $objectManager->get(QueueRepository::class);
        $this->processRepository = $objectManager->get(ProcessRepository::class);
        $this->configurationRepository = $objectManager->get(ConfigurationRepository::class);
        $this->pageRepository = GeneralUtility::makeInstance(PageRepository::class);
        $this->queueExecutor = GeneralUtility::makeInstance(QueueExecutor::class, $crawlStrategyFactory);
        $this->iconFactory = GeneralUtility::makeInstance(IconFactory::class);
        $this->crawler = GeneralUtility::makeInstance(Crawler::class);

        $this->processFilename = Environment::getVarPath() . '/lock/tx_crawler.proc';

        /** @var ExtensionConfigurationProvider $configurationProvider */
        $configurationProvider = GeneralUtility::makeInstance(ExtensionConfigurationProvider::class);
        $settings = $configurationProvider->getExtensionConfiguration();
        $this->extensionSettings = is_array($settings) ? $settings : [];

        // The settings array may be empty or incomplete; read each key with a
        // fallback so a missing key does not raise an "undefined index" error.
        if (MathUtility::convertToPositiveInteger($this->extensionSettings['countInARun'] ?? 0) === 0) {
            $this->extensionSettings['countInARun'] = 100;
        }

        $this->extensionSettings['processLimit'] = MathUtility::forceIntegerInRange(
            $this->extensionSettings['processLimit'] ?? 1,
            1,
            99,
            1
        );
        $this->setMaximumUrlsToCompile(
            MathUtility::forceIntegerInRange(
                $this->extensionSettings['maxCompileUrls'] ?? 10000,
                1,
                1000000000,
                10000
            )
        );
    }
302
303
    /**
     * Overrides the stored upper bound for the number of URLs to compile.
     *
     * @param int $maximumUrlsToCompile New limit, stored as given
     */
    public function setMaximumUrlsToCompile(int $maximumUrlsToCompile): void
    {
        $this->maximumUrlsToCompile = $maximumUrlsToCompile;
    }
307
308
    /**
     * Returns the current access mode (documented on the property as 'gui', 'cli' or 'cli_im').
     *
     * @return string
     * @deprecated
     */
    public function getAccessMode()
    {
        return $this->accessMode;
    }
318
319
    /**
     * Stores the access mode; the value is not validated here.
     *
     * @param string $accessMode
     * @deprecated
     */
    public function setAccessMode($accessMode): void
    {
        $this->accessMode = $accessMode;
    }
327
328
    /**
     * Toggles the "disabled" marker file that prevents processes from being processed.
     *
     * A truthy value writes the marker file; a falsy value (false or null)
     * removes it when it exists.
     *
     * @deprecated
     */
    public function setDisabled(?bool $disabled = true): void
    {
        if ($disabled) {
            GeneralUtility::writeFile($this->processFilename, 'disabled');
            return;
        }

        if (is_file($this->processFilename)) {
            unlink($this->processFilename);
        }
    }
340
341
    /**
     * Reports the disabled status: true while the marker file exists.
     *
     * @deprecated
     */
    public function getDisabled(): bool
    {
        return is_file($this->processFilename);
    }
349
350
    /**
     * Replaces the path of the marker file used by setDisabled()/getDisabled().
     *
     * @param string $filenameWithPath
     * @deprecated
     */
    public function setProcessFilename($filenameWithPath): void
    {
        $this->processFilename = $filenameWithPath;
    }
358
359
    /**
     * Returns the path of the marker file used by setDisabled()/getDisabled().
     *
     * @return string
     * @deprecated
     */
    public function getProcessFilename()
    {
        return $this->processFilename;
    }
367
368
    /**
     * Replaces the extension settings wholesale
     * (unserialized pendant of $TYPO3_CONF_VARS['EXT']['extConf']['crawler']).
     *
     * The array is stored as given; the normalisation done in the constructor is not re-applied.
     */
    public function setExtensionSettings(array $extensionSettings): void
    {
        $this->extensionSettings = $extensionSettings;
    }
375
376
    /**
     * Check if the given page should be crawled.
     *
     * Checks run in this order and the first match wins:
     * 1. the page is hidden and the "crawlHiddenPages" setting is not enabled
     * 2. the doktype is one of the hard-coded non-crawlable types (3, 4, 199, 254, 255)
     * 3. the doktype is listed in one of the registered "excludeDoktype" lists
     * 4. one of the registered "pageVeto" hooks returns anything but false
     *
     * @param array $pageRow Page record; the "hidden" and "doktype" columns are read
     * @return false|string false if the page should be crawled (not excluded), otherwise the skip message
     */
    public function checkIfPageShouldBeSkipped(array $pageRow)
    {
        // Read with empty() so a missing setting or column counts as "not set"
        // instead of raising an undefined index error.
        if (empty($this->extensionSettings['crawlHiddenPages']) && ! empty($pageRow['hidden'])) {
            return 'Because page is hidden';
        }

        $doktype = (string) ($pageRow['doktype'] ?? '');

        if (GeneralUtility::inList('3,4,199,254,255', $doktype)) {
            return 'Because doktype is not allowed';
        }

        foreach ($GLOBALS['TYPO3_CONF_VARS']['EXTCONF']['crawler']['excludeDoktype'] ?? [] as $key => $doktypeList) {
            if (GeneralUtility::inList($doktypeList, $doktype)) {
                return 'Doktype was excluded by "' . $key . '"';
            }
        }

        // veto hook
        foreach ($GLOBALS['TYPO3_CONF_VARS']['EXTCONF']['crawler']['pageVeto'] ?? [] as $key => $func) {
            $params = [
                'pageRow' => $pageRow,
            ];
            // expects "false" if page is ok and "true" or a skipMessage if this page should _not_ be crawled
            $veto = GeneralUtility::callUserFunction($func, $params, $this);
            if ($veto === false) {
                continue;
            }
            if (is_string($veto)) {
                return $veto;
            }
            // Hook keys may be numeric; htmlspecialchars() rejects integers under strict_types.
            return 'Veto from hook "' . htmlspecialchars((string) $key) . '"';
        }

        return false;
    }
415
416
    /**
     * Wrapper method for getUrlsForPageId().
     * It returns an array of configurations and no urls!
     *
     * The page is first run through checkIfPageShouldBeSkipped(); when it is
     * skipped (or its uid is not an integer) an empty array is returned and the
     * reason is written to $skipMessage.
     *
     * @param array $pageRow Page record with at least dok-type and uid columns.
     * @param string $skipMessage Receives the skip reason, or '' when the page is crawled
     * @return array
     * @see getUrlsForPageId()
     */
    public function getUrlsForPageRow(array $pageRow, &$skipMessage = '')
    {
        if (! is_int($pageRow['uid'])) {
            $skipMessage = 'PageUid ' . $pageRow['uid'] . ' was not an integer';
            return [];
        }

        $skipReason = $this->checkIfPageShouldBeSkipped($pageRow);
        if ($skipReason !== false) {
            $skipMessage = $skipReason;
            return [];
        }

        // Resolve the configurations first, then clear the message.
        $configurations = $this->getUrlsForPageId($pageRow['uid']);
        $skipMessage = '';

        return $configurations;
    }
443
444
    /**
     * Creates a list of URLs from input array (and submits them to queue if asked for)
     * See Web > Info module script + "indexed_search"'s crawler hook-client using this!
     *
     * @param array $vv Information about URLs from pageRow to crawl ("URLs" and "subCfg" keys are read).
     * @param array $pageRow Page row
     * @param int $scheduledTime Unix time to schedule indexing to, typically time()
     * @param int $reqMinute Number of requests per minute (creates the interleave between requests); values <= 0 disable the interleave
     * @param bool $submitCrawlUrls If set, submits the URLs to queue
     * @param bool $downloadCrawlUrls If set (and submitcrawlUrls is false) will fill $downloadUrls with entries)
     * @param array $duplicateTrack Array which is passed by reference and contains an id per url to secure we will not crawl duplicates
     * @param array $downloadUrls Array which will be filled with URLS for download if flag is set.
     * @param array $incomingProcInstructions Array of processing instructions
     * @return string List of URLs (meant for display in backend module)
     */
    public function urlListFromUrlArray(
        array $vv,
        array $pageRow,
        $scheduledTime,
        $reqMinute,
        $submitCrawlUrls,
        $downloadCrawlUrls,
        array &$duplicateTrack,
        array &$downloadUrls,
        array $incomingProcInstructions
    ) {
        if (! isset($vv['URLs']) || ! is_array($vv['URLs'])) {
            return 'ERROR - no URL generated';
        }

        // Optional sub configuration keys are read with fallbacks to avoid undefined index errors.
        $subCfg = (array) ($vv['subCfg'] ?? []);
        $procInstrFilter = (string) ($subCfg['procInstrFilter'] ?? '');
        $userGroups = (string) ($subCfg['userGroups'] ?? '');

        $urlLog = [];
        $pageId = (int) $pageRow['uid'];
        $configurationHash = $this->getConfigurationHash($vv);
        $skipInnerCheck = $this->queueRepository->noUnprocessedQueueEntriesForPageWithConfigurationHashExist($pageId, $configurationHash);

        // The processing instruction filter does not depend on the individual URL,
        // so it is evaluated once instead of on every iteration.
        if (! $this->drawURLs_PIfilter($procInstrFilter, $incomingProcInstructions)) {
            return '';
        }

        // Seconds between two scheduled requests; guards against division by zero.
        $secondsPerRequest = $reqMinute > 0 ? 60 / $reqMinute : 0;

        $urlService = new UrlService();

        foreach ($vv['URLs'] as $urlQuery) {
            $url = (string) $urlService->getUrlFromPageAndQueryParameters(
                $pageId,
                $urlQuery,
                $subCfg['baseUrl'] ?? null,
                $subCfg['force_ssl'] ?? 0
            );

            // Create key by which to determine unique-ness:
            $uKey = $url . '|' . $userGroups . '|' . $procInstrFilter;

            if (isset($duplicateTrack[$uKey])) {
                // if the url key is registered just display it and do not resubmit it
                $urlLog[] = '<em><span class="text-muted">' . htmlspecialchars($url) . '</span></em>';
            } else {
                // Scheduled time, rounded down to the full minute:
                $schTime = $scheduledTime + round(count($duplicateTrack) * $secondsPerRequest);
                $schTime = intval($schTime / 60) * 60;
                $formattedDate = BackendUtility::datetime($schTime);
                $this->urlList[] = '[' . $formattedDate . '] ' . $url;
                $urlList = '[' . $formattedDate . '] ' . htmlspecialchars($url);

                // Submit for crawling!
                if ($submitCrawlUrls) {
                    // NOTE(review): the un-interleaved $scheduledTime is submitted, while the
                    // displayed time uses $schTime - confirm this is intended.
                    $added = $this->addUrl(
                        $pageId,
                        $url,
                        $subCfg,
                        $scheduledTime,
                        $configurationHash,
                        $skipInnerCheck
                    );
                    if ($added === false) {
                        $urlList .= ' (URL already existed)';
                    }
                } elseif ($downloadCrawlUrls) {
                    $downloadUrls[$url] = $url;
                }
                $urlLog[] = $urlList;
            }
            $duplicateTrack[$uKey] = true;
        }

        return implode('<br>', $urlLog);
    }
528
529
    /**
     * Tells whether at least one incoming processing instruction is contained
     * in the given comma-separated list. An empty set of incoming instructions
     * always passes.
     *
     * @param string $piString PI to test (comma-separated list)
     * @param array $incomingProcInstructions Processing instructions
     * @return boolean
     */
    public function drawURLs_PIfilter($piString, array $incomingProcInstructions)
    {
        // Nothing requested explicitly: no filtering takes place.
        if (empty($incomingProcInstructions)) {
            return true;
        }

        foreach ($incomingProcInstructions as $instruction) {
            $isListed = GeneralUtility::inList($piString, $instruction);
            if ($isListed) {
                return true;
            }
        }

        return false;
    }
549
550
    /**
     * Loads the page TSconfig for a page and lets registered hooks alter it.
     *
     * When $this->MP is set, the TSconfig is loaded for the part of the MP value
     * after the first "-" instead of for $id.
     * NOTE(review): $this->MP is declared as bool but is split like a "x-y" string here - confirm the intended type.
     *
     * @param int $id Page uid
     * @return array Page TSconfig, possibly modified by the "getPageTSconfigForId" hooks
     */
    public function getPageTSconfigForId(int $id): array
    {
        if (! $this->MP) {
            $pageTSconfig = BackendUtility::getPagesTSconfig($id);
        } else {
            // A value without "-" yields no second part; fall back to 0 instead of an undefined offset.
            $mountPointParts = explode('-', (string) $this->MP);
            $mountPointId = (int) ($mountPointParts[1] ?? 0);
            $pageTSconfig = BackendUtility::getPagesTSconfig($mountPointId);
        }

        // Call hooks to alter the configuration; $pageTSconfig is handed over by reference.
        $hooks = $GLOBALS['TYPO3_CONF_VARS']['EXTCONF']['crawler']['getPageTSconfigForId'] ?? [];
        if (is_array($hooks)) {
            $params = [
                'pageId' => $id,
                'pageTSConfig' => &$pageTSconfig,
            ];
            foreach ($hooks as $userFunc) {
                GeneralUtility::callUserFunction($userFunc, $params, $this);
            }
        }

        return $pageTSconfig;
    }
572
573
    /**
     * This method returns an array of configurations, keyed by configuration name.
     * Adds no urls to the queue!
     *
     * Configurations come from two sources, in this order:
     * 1. "tx_crawler.crawlerCfg.paramSets" in the page TSconfig (origin "pagets")
     * 2. the configuration records returned for the rootline of the page
     *    (origin "tx_crawler_configuration_<uid>"); a record never overwrites
     *    a previously defined configuration of the same name
     *
     * Finally the registered "processUrls" hooks may alter the result by reference.
     *
     * @param int $pageId Page uid
     * @return array
     */
    public function getUrlsForPageId(int $pageId): array
    {
        // Get page TSconfig for page ID
        $pageTSconfig = $this->getPageTSconfigForId($pageId);
        $pageIdAsString = (string) $pageId;

        $res = [];

        // Fetch Crawler Configuration from pageTSconfig
        $crawlerCfg = $pageTSconfig['tx_crawler.']['crawlerCfg.']['paramSets.'] ?? [];
        foreach ($crawlerCfg as $key => $values) {
            if (! is_array($values)) {
                continue;
            }
            $key = str_replace('.', '', (string) $key);

            // Sub configuration for a single configuration string:
            $subCfg = (array) ($crawlerCfg[$key . '.'] ?? []);
            $subCfg['key'] = $key;

            // Normalise the filter list (strip whitespace around the items).
            $procInstrFilter = (string) ($subCfg['procInstrFilter'] ?? '');
            if ($procInstrFilter !== '') {
                $subCfg['procInstrFilter'] = implode(',', GeneralUtility::trimExplode(',', $procInstrFilter));
            }

            // process configuration if it is not page-specific or if the specific page is the current page:
            // TODO: Check if $pidOnlyList can be kept as Array instead of imploded
            $pidsOnly = (string) ($subCfg['pidsOnly'] ?? '');
            $pidOnlyList = implode(',', GeneralUtility::trimExplode(',', $pidsOnly, true));
            if ($pidsOnly !== '' && ! GeneralUtility::inList($pidOnlyList, $pageIdAsString)) {
                continue;
            }

            // Explode, process etc.:
            $paramParsed = GeneralUtility::explodeUrl2Array($crawlerCfg[$key] ?? '');
            $paramExpanded = $this->expandParameters($paramParsed, $pageId);

            // recognize MP value
            $baseQuery = '?id=' . $pageId;
            if ($this->MP) {
                $baseQuery .= '&MP=' . $this->MP;
            }

            $res[$key] = [
                'subCfg' => $subCfg,
                'paramParsed' => $paramParsed,
                'paramExpanded' => $paramExpanded,
                'origin' => 'pagets',
                'URLs' => $this->compileUrls($paramExpanded, [$baseQuery]),
            ];
        }

        // Get configuration from tx_crawler_configuration records up the rootline
        $crawlerConfigurations = $this->configurationRepository->getCrawlerConfigurationRecordsFromRootLine($pageId);
        foreach ($crawlerConfigurations as $configurationRecord) {
            // check access to the configuration record
            $hasAccess = empty($configurationRecord['begroups'])
                || $this->getBackendUser()->isAdmin()
                || UserService::hasGroupAccess(
                    $this->getBackendUser()->user['usergroup_cached_list'],
                    $configurationRecord['begroups']
                );
            if (! $hasAccess) {
                continue;
            }

            // process configuration if it is not page-specific or if the specific page is the current page:
            // TODO: Check if $pidOnlyList can be kept as Array instead of imploded
            // Cast first: a null column value would make strcmp() throw under strict_types.
            $pidsOnly = (string) ($configurationRecord['pidsonly'] ?? '');
            $pidOnlyList = implode(',', GeneralUtility::trimExplode(',', $pidsOnly, true));
            if ($pidsOnly !== '' && ! GeneralUtility::inList($pidOnlyList, $pageIdAsString)) {
                continue;
            }

            $key = $configurationRecord['name'];

            // don't overwrite previously defined paramSets
            if (isset($res[$key])) {
                continue;
            }

            /* @var $TSparserObject TypoScriptParser */
            $TSparserObject = GeneralUtility::makeInstance(TypoScriptParser::class);
            $TSparserObject->parse($configurationRecord['processing_instruction_parameters_ts']);

            $subCfg = [
                'procInstrFilter' => $configurationRecord['processing_instruction_filter'],
                'procInstrParams.' => $TSparserObject->setup,
                'baseUrl' => $configurationRecord['base_url'],
                'force_ssl' => (int) $configurationRecord['force_ssl'],
                'userGroups' => $configurationRecord['fegroups'],
                'exclude' => $configurationRecord['exclude'],
                'key' => $key,
            ];

            // Skip the record when the current page is listed in its exclude string.
            if (in_array($pageId, $this->expandExcludeString($subCfg['exclude']), true)) {
                continue;
            }

            $paramParsed = GeneralUtility::explodeUrl2Array($configurationRecord['configuration']);
            $paramExpanded = $this->expandParameters($paramParsed, $pageId);
            $res[$key] = [
                'subCfg' => $subCfg,
                'paramParsed' => $paramParsed,
                'paramExpanded' => $paramExpanded,
                'URLs' => $this->compileUrls($paramExpanded, ['?id=' . $pageId]),
                'origin' => 'tx_crawler_configuration_' . $configurationRecord['uid'],
            ];
        }

        // Let hooks post-process the collected configurations (passed by reference).
        foreach ($GLOBALS['TYPO3_CONF_VARS']['EXTCONF']['crawler']['processUrls'] ?? [] as $func) {
            $params = [
                'res' => &$res,
            ];
            GeneralUtility::callUserFunction($func, $params, $this);
        }

        return $res;
    }
671
672
    /**
     * Find all configurations of subpages of a page.
     *
     * Collects the names of the "paramSets" defined in the page TSconfig of
     * $rootid, followed by the names of all configuration records stored on a
     * page of the rootline of $rootid or of the page tree below it (down to
     * $depth levels, limited to pages the backend user may see).
     *
     * TODO: Write Functional Tests
     *
     * @param int $rootid Uid of the page the branch starts at
     * @param int $depth Number of tree levels passed to the page tree
     * @return array List of configuration names (may contain duplicates)
     */
    public function getConfigurationsForBranch(int $rootid, int $depth): array
    {
        $configurationsForBranch = [];
        $pageTSconfig = $this->getPageTSconfigForId($rootid);
        $sets = $pageTSconfig['tx_crawler.']['crawlerCfg.']['paramSets.'] ?? [];
        foreach ($sets as $key => $value) {
            if (! is_array($value)) {
                continue;
            }
            // Strip the trailing dot of the TypoScript sub-array key.
            $configurationsForBranch[] = substr($key, -1) === '.' ? substr($key, 0, -1) : $key;
        }

        // Pages whose configuration records are relevant: the rootline ...
        $pids = [];
        $rootLine = BackendUtility::BEgetRootLine($rootid);
        foreach ($rootLine as $node) {
            $pids[] = $node['uid'];
        }

        // ... plus the subtree below $rootid.
        /* @var PageTreeView $tree */
        $tree = GeneralUtility::makeInstance(PageTreeView::class);
        $perms_clause = $this->getBackendUser()->getPagePermsClause(Permission::PAGE_SHOW);
        $tree->init(empty($perms_clause) ? '' : ('AND ' . $perms_clause));
        $tree->getTree($rootid, $depth, '');
        foreach ($tree->tree as $node) {
            $pids[] = $node['row']['uid'];
        }

        $queryBuilder = $this->getQueryBuilder(ConfigurationRepository::TABLE_NAME);
        $statement = $queryBuilder
            ->select('name')
            ->from(ConfigurationRepository::TABLE_NAME)
            ->where(
                $queryBuilder->expr()->in('pid', $queryBuilder->createNamedParameter($pids, Connection::PARAM_INT_ARRAY))
            )
            ->execute();

        // fetch() is deprecated on the DBAL result; fetchAssociative() is the documented replacement.
        while (($row = $statement->fetchAssociative()) !== false) {
            $configurationsForBranch[] = $row['name'];
        }

        return $configurationsForBranch;
    }
715
716
    /**
     * Check if a user has access to an item
     * (e.g. get the group list of the current logged in user from $GLOBALS['TSFE']->gr_list)
     *
     * @param string $groupList Comma-separated list of (fe_)group UIDs from a user
     * @param string $accessList Comma-separated list of (fe_)group UIDs of the item to access
     * @return bool TRUE if at least one of the users group UIDs is in the access list or the access list is empty
     * @see \TYPO3\CMS\Frontend\Page\PageRepository::getMultipleGroupsWhereClause()
     * @deprecated
     * @codeCoverageIgnore
     */
    public function hasGroupAccess($groupList, $accessList)
    {
        // An item without an access list is open to everyone.
        if (empty($accessList)) {
            return true;
        }

        $userGroupUids = GeneralUtility::intExplode(',', $groupList);
        foreach ($userGroupUids as $candidateUid) {
            $isGranted = GeneralUtility::inList($accessList, $candidateUid);
            if ($isGranted) {
                return true;
            }
        }

        return false;
    }
739
740
    /**
741
     * Will expand the parameters configuration to individual values. This follows a certain syntax of the value of each parameter.
742
     * Syntax of values:
743
     * - Basically: If the value is wrapped in [...] it will be expanded according to the following syntax, otherwise the value is taken literally
744
     * - Configuration is split by "|" and the parts are processed individually and finally added together
745
     * - For each configuration part:
746
     *         - "[int]-[int]" = Integer range, will be expanded to all values in between, values included, starting from low to high (max. 1000). Example "1-34" or "-40--30"
747
     *         - "_TABLE:[TCA table name];[_PID:[optional page id, default is current page]];[_ENABLELANG:1]" = Look up of table records from PID, filtering out deleted records. Example "_TABLE:tt_content; _PID:123"
748
     *        _ENABLELANG:1 picks only original records without their language overlays
749
     *         - Default: Literal value
750
     *
751
     * @param array $paramArray Array with key (GET var name) and values (value of GET var which is configuration for expansion)
752
     * @param integer $pid Current page ID
753
     * @return array
754
     *
755
     * TODO: Write Functional Tests
756
     */
757
    public function expandParameters($paramArray, $pid)
758
    {
759
        // Traverse parameter names:
760
        foreach ($paramArray as $p => $v) {
761
            $v = trim($v);
762
763
            // If value is encapsulated in square brackets it means there are some ranges of values to find, otherwise the value is literal
764
            if (strpos($v, '[') === 0 && substr($v, -1) === ']') {
765
                // So, find the value inside brackets and reset the paramArray value as an array.
766
                $v = substr($v, 1, -1);
767
                $paramArray[$p] = [];
768
769
                // Explode parts and traverse them:
770
                $parts = explode('|', $v);
771
                foreach ($parts as $pV) {
772
773
                    // Look for integer range: (fx. 1-34 or -40--30 // reads minus 40 to minus 30)
774
                    if (preg_match('/^(-?[0-9]+)\s*-\s*(-?[0-9]+)$/', trim($pV), $reg)) {
775
                        $reg = $this->swapIfFirstIsLargerThanSecond($reg);
776
777
                        // Traverse range, add values:
778
                        // Limit to size of range!
779
                        $runAwayBrake = 1000;
780
                        for ($a = $reg[1]; $a <= $reg[2]; $a++) {
781
                            $paramArray[$p][] = $a;
782
                            $runAwayBrake--;
783
                            if ($runAwayBrake <= 0) {
784
                                break;
785
                            }
786
                        }
787
                    } elseif (strpos(trim($pV), '_TABLE:') === 0) {
788
789
                        // Parse parameters:
790
                        $subparts = GeneralUtility::trimExplode(';', $pV);
791
                        $subpartParams = [];
792
                        foreach ($subparts as $spV) {
793
                            [$pKey, $pVal] = GeneralUtility::trimExplode(':', $spV);
794
                            $subpartParams[$pKey] = $pVal;
795
                        }
796
797
                        // Table exists:
798
                        if (isset($GLOBALS['TCA'][$subpartParams['_TABLE']])) {
799
                            $lookUpPid = isset($subpartParams['_PID']) ? intval($subpartParams['_PID']) : intval($pid);
800
                            $recursiveDepth = isset($subpartParams['_RECURSIVE']) ? intval($subpartParams['_RECURSIVE']) : 0;
801
                            $pidField = isset($subpartParams['_PIDFIELD']) ? trim($subpartParams['_PIDFIELD']) : 'pid';
802
                            $where = $subpartParams['_WHERE'] ?? '';
803
                            $addTable = $subpartParams['_ADDTABLE'] ?? '';
804
805
                            $fieldName = $subpartParams['_FIELD'] ? $subpartParams['_FIELD'] : 'uid';
806
                            if ($fieldName === 'uid' || $GLOBALS['TCA'][$subpartParams['_TABLE']]['columns'][$fieldName]) {
807
                                $queryBuilder = $this->getQueryBuilder($subpartParams['_TABLE']);
808
809
                                if ($recursiveDepth > 0) {
810
                                    /** @var QueryGenerator $queryGenerator */
811
                                    $queryGenerator = GeneralUtility::makeInstance(QueryGenerator::class);
812
                                    $pidList = $queryGenerator->getTreeList($lookUpPid, $recursiveDepth, 0, 1);
813
                                    $pidArray = GeneralUtility::intExplode(',', $pidList);
814
                                } else {
815
                                    $pidArray = [(string) $lookUpPid];
816
                                }
817
818
                                $queryBuilder->getRestrictions()
819
                                    ->removeAll()
820
                                    ->add(GeneralUtility::makeInstance(DeletedRestriction::class));
821
822
                                $queryBuilder
823
                                    ->select($fieldName)
824
                                    ->from($subpartParams['_TABLE'])
825
                                    ->where(
826
                                        $queryBuilder->expr()->in($pidField, $queryBuilder->createNamedParameter($pidArray, Connection::PARAM_INT_ARRAY)),
827
                                        $where
828
                                    );
829
830
                                if (! empty($addTable)) {
831
                                    // TODO: Check if this works as intended!
832
                                    $queryBuilder->add('from', $addTable);
833
                                }
834
                                $transOrigPointerField = $GLOBALS['TCA'][$subpartParams['_TABLE']]['ctrl']['transOrigPointerField'];
835
836
                                if ($subpartParams['_ENABLELANG'] && $transOrigPointerField) {
837
                                    $queryBuilder->andWhere(
838
                                        $queryBuilder->expr()->lte(
839
                                            $transOrigPointerField,
840
                                            0
841
                                        )
842
                                    );
843
                                }
844
845
                                $statement = $queryBuilder->execute();
846
847
                                $rows = [];
848
                                while ($row = $statement->fetch()) {
0 ignored issues
show
Deprecated Code introduced by
The function Doctrine\DBAL\ForwardCompatibility\Result::fetch() has been deprecated: Use fetchNumeric(), fetchAssociative() or fetchOne() instead. ( Ignorable by Annotation )

If this is a false-positive, you can also ignore this issue in your code via the ignore-deprecated  annotation

848
                                while ($row = /** @scrutinizer ignore-deprecated */ $statement->fetch()) {

This function has been deprecated. The supplier of the function has supplied an explanatory message.

The explanatory message should give you some clue as to whether and when the function will be removed and what other function to use instead.

Loading history...
849
                                    $rows[$row[$fieldName]] = $row;
850
                                }
851
852
                                if (is_array($rows)) {
853
                                    $paramArray[$p] = array_merge($paramArray[$p], array_keys($rows));
854
                                }
855
                            }
856
                        }
857
                    } else {
858
                        // Just add value:
859
                        $paramArray[$p][] = $pV;
860
                    }
861
                    // Hook for processing own expandParameters place holder
862
                    if (is_array($GLOBALS['TYPO3_CONF_VARS']['SC_OPTIONS']['crawler/class.tx_crawler_lib.php']['expandParameters'])) {
863
                        $_params = [
864
                            'pObj' => &$this,
865
                            'paramArray' => &$paramArray,
866
                            'currentKey' => $p,
867
                            'currentValue' => $pV,
868
                            'pid' => $pid,
869
                        ];
870
                        foreach ($GLOBALS['TYPO3_CONF_VARS']['SC_OPTIONS']['crawler/class.tx_crawler_lib.php']['expandParameters'] as $_funcRef) {
871
                            GeneralUtility::callUserFunction($_funcRef, $_params, $this);
872
                        }
873
                    }
874
                }
875
876
                // Make unique set of values and sort array by key:
877
                $paramArray[$p] = array_unique($paramArray[$p]);
878
                ksort($paramArray);
879
            } else {
880
                // Set the literal value as only value in array:
881
                $paramArray[$p] = [$v];
882
            }
883
        }
884
885
        return $paramArray;
886
    }
887
888
    /**
     * Compiling URLs from parameter array (output of expandParameters())
     * The number of URLs will be the multiplication of the number of parameter values for each key,
     * limited on every level to the value returned by getMaximumUrlsToCompile().
     *
     * @param array $paramArray Output of expandParameters(): Array with keys (GET var names) and for each an array of values
     * @param array $urls URLs accumulated in this array (for recursion)
     * @return array
     */
    public function compileUrls($paramArray, array $urls)
    {
        if (empty($paramArray)) {
            return $urls;
        }

        // Take the first parameter off the array. reset() makes key() independent of the internal
        // pointer the caller left behind, and unset() - unlike array_shift() - keeps integer
        // parameter names of the remaining entries intact.
        reset($paramArray);
        $varName = key($paramArray);
        $valueSet = $paramArray[$varName];
        unset($paramArray[$varName]);

        // Loop invariants; array keys may be integers, rawurlencode() needs a string.
        $maximumUrls = $this->getMaximumUrlsToCompile();
        $encodedName = rawurlencode((string) $varName);

        // Traverse value set:
        $newUrls = [];
        foreach ($urls as $url) {
            foreach ($valueSet as $val) {
                if (count($newUrls) >= $maximumUrls) {
                    // Limit reached, nothing more can be added on this level.
                    break 2;
                }
                $value = (string) $val;
                // An empty value leaves the URL untouched instead of adding "&name=".
                $newUrls[] = $value !== '' ? $url . '&' . $encodedName . '=' . rawurlencode($value) : $url;
            }
        }

        return $this->compileUrls($paramArray, $newUrls);
    }
915
916
    /************************************
917
     *
918
     * Crawler log
919
     *
920
     ************************************/
921
922
    /**
     * Return array of records from crawler queue for input page ID
     *
     * @param integer $id Page ID for which to look up log entries.
     * @param QueueFilter $queueFilter Compared against "pending" (only entries with exec_time = 0) and "finished" (only entries with exec_time > 0); any other value returns all entries
     * @param boolean $doFlush If TRUE, the queue repository is asked to flush the queue for the given filter before the entries are read
     * @param boolean $doFullFlush Not evaluated by this method
     * @param integer $itemsPerPage Limit the amount of entries per page, default is 10; a value of 0 or less disables the limit
     * @return array
     *
     * @deprecated
     */
    public function getLogEntriesForPageId($id, QueueFilter $queueFilter, $doFlush = false, $doFullFlush = false, $itemsPerPage = 10)
    {
        $queryBuilder = GeneralUtility::makeInstance(ConnectionPool::class)->getQueryBuilderForTable(QueueRepository::TABLE_NAME);
        $queryBuilder
            ->select('*')
            ->from(QueueRepository::TABLE_NAME)
            ->where(
                $queryBuilder->expr()->eq('page_id', $queryBuilder->createNamedParameter($id, PDO::PARAM_INT))
            )
            ->orderBy('scheduled', 'DESC');

        // Narrow the result down by execution state, every other filter value selects all entries.
        switch ($queueFilter) {
            case 'pending':
                $queryBuilder->andWhere($queryBuilder->expr()->eq('exec_time', 0));
                break;
            case 'finished':
                $queryBuilder->andWhere($queryBuilder->expr()->gt('exec_time', 0));
                break;
        }

        if ($doFlush) {
            $this->queueRepository->flushQueue($queueFilter);
        }
        if ($itemsPerPage > 0) {
            $queryBuilder
                ->setMaxResults((int) $itemsPerPage);
        }

        return $queryBuilder->execute()->fetchAll();
    }
970
971
    /**
     * Return array of records from crawler queue for input set ID
     *
     * @param int $set_id Set ID for which to look up log entries.
     * @param string $filter Filter: "pending" => all that is not yet run, "finished" => all complete ones, any other value => all entries
     * @param bool $doFlush If TRUE, then entries selected at DELETED(!) instead of selected!
     * @param bool $doFullFlush Only evaluated together with $doFlush: if TRUE, flushQueue() is called without any restriction
     * @param int $itemsPerPage Limit the amount of entries per page, default is 10; a value of 0 or less disables the limit
     * @return array The matching queue records, or an empty array if the entries were flushed
     *
     * @deprecated
     */
    public function getLogEntriesForSetId(int $set_id, string $filter = '', bool $doFlush = false, bool $doFullFlush = false, int $itemsPerPage = 10)
    {
        $queryBuilder = GeneralUtility::makeInstance(ConnectionPool::class)->getQueryBuilderForTable(QueueRepository::TABLE_NAME);
        $queryBuilder
            ->select('*')
            ->from(QueueRepository::TABLE_NAME)
            ->where(
                $queryBuilder->expr()->eq('set_id', $queryBuilder->createNamedParameter($set_id, PDO::PARAM_INT))
            )
            ->orderBy('scheduled', 'DESC');

        $expressionBuilder = GeneralUtility::makeInstance(ConnectionPool::class)
            ->getConnectionForTable(QueueRepository::TABLE_NAME)
            ->getExpressionBuilder();
        // $query collects the restrictions for flushQueue() as one AND-combined expression.
        // $addWhere is overwritten on purpose: every add() hands back the expression
        // including all restrictions added so far.
        $query = $expressionBuilder->andX();
        $addWhere = '';
        switch ($filter) {
            case 'pending':
                $queryBuilder->andWhere($queryBuilder->expr()->eq('exec_time', 0));
                $addWhere = $query->add($expressionBuilder->eq('exec_time', 0));
                break;
            case 'finished':
                $queryBuilder->andWhere($queryBuilder->expr()->gt('exec_time', 0));
                $addWhere = $query->add($expressionBuilder->gt('exec_time', 0));
                break;
        }

        if ($doFlush) {
            // Flushing replaces the lookup: the entries are removed and nothing is returned.
            $addWhere = $query->add($expressionBuilder->eq('set_id', $set_id));
            $this->flushQueue($doFullFlush ? '' : $addWhere);
            return [];
        }
        if ($itemsPerPage > 0) {
            $queryBuilder
                ->setMaxResults($itemsPerPage);
        }

        return $queryBuilder->execute()->fetchAll();
    }
1023
1024
    /**
     * Stores a call back entry in the crawler queue (typically called from hooks, see indexed search class "class.crawler.php")
     *
     * @param integer $setId Set ID
     * @param array $params Parameters handed to the call back function; anything that is not an array is replaced by an empty array
     * @param string $callBack Call back object reference, eg. 'EXT:indexed_search/class.crawler.php:&tx_indexedsearch_crawler'
     * @param integer $page_id Page ID the entry is attached to
     * @param integer $schedule Time at which to activate; 0 schedules the entry for the current time
     */
    public function addQueueEntry_callBack($setId, $params, $callBack, $page_id = 0, $schedule = 0): void
    {
        $callBackParameters = is_array($params) ? $params : [];
        $callBackParameters['_CALLBACKOBJ'] = $callBack;

        // Without an explicit schedule the entry is due right away.
        $scheduledTime = (int) $schedule;
        if (! $scheduledTime) {
            $scheduledTime = $this->getCurrentTime();
        }

        $queueRecord = [
            'page_id' => (int) $page_id,
            'parameters' => json_encode($callBackParameters),
            'scheduled' => $scheduledTime,
            'exec_time' => 0,
            'set_id' => (int) $setId,
            'result_data' => '',
        ];

        $queueConnection = GeneralUtility::makeInstance(ConnectionPool::class)
            ->getConnectionForTable(QueueRepository::TABLE_NAME);
        $queueConnection->insert(QueueRepository::TABLE_NAME, $queueRecord);
    }
1053
1054
    /************************************
1055
     *
1056
     * URL setting
1057
     *
1058
     ************************************/
1059
1060
    /**
     * Setting a URL for crawling:
     *
     * @param integer $id Page ID
     * @param string $url Complete URL
     * @param array $subCfg Sub configuration array (from TS config); the keys "userGroups", "procInstrFilter", "procInstrParams." and "key" are evaluated and may be missing
     * @param integer $tstamp Scheduled-time
     * @param string $configurationHash (optional) configuration hash
     * @param bool $skipInnerDuplicationCheck (optional) skip inner duplication check
     * @return bool TRUE if the URL was inserted into the queue table, FALSE if it was only registered internally or an equal entry already exists
     */
    public function addUrl(
        $id,
        $url,
        array $subCfg,
        $tstamp,
        $configurationHash = '',
        $skipInnerDuplicationCheck = false
    ) {
        $urlAdded = false;
        $rows = [];

        // Creating parameters:
        $parameters = [
            'url' => $url,
        ];

        // fe user group simulation:
        $uGs = implode(',', array_unique(GeneralUtility::intExplode(',', (string) ($subCfg['userGroups'] ?? ''), true)));
        if ($uGs) {
            $parameters['feUserGroupList'] = $uGs;
        }

        // Setting processing instructions
        $parameters['procInstructions'] = GeneralUtility::trimExplode(',', (string) ($subCfg['procInstrFilter'] ?? ''));
        if (is_array($subCfg['procInstrParams.'] ?? null)) {
            $parameters['procInstrParams'] = $subCfg['procInstrParams.'];
        }

        // Compile value array:
        $parameters_serialized = json_encode($parameters);
        $fieldArray = [
            'page_id' => (int) $id,
            'parameters' => $parameters_serialized,
            'parameters_hash' => GeneralUtility::shortMD5($parameters_serialized),
            'configuration_hash' => $configurationHash,
            'scheduled' => $tstamp,
            'exec_time' => 0,
            'set_id' => (int) $this->setID,
            'result_data' => '',
            'configuration' => $subCfg['key'] ?? null,
        ];

        if ($this->registerQueueEntriesInternallyOnly) {
            //the entries will only be registered and not stored to the database
            $this->queueEntries[] = $fieldArray;
        } else {
            if (! $skipInnerDuplicationCheck) {
                // check if there is already an equal entry
                $rows = $this->queueRepository->getDuplicateQueueItemsIfExists(
                    (bool) ($this->extensionSettings['enableTimeslot'] ?? false),
                    $tstamp,
                    $this->getCurrentTime(),
                    $fieldArray['page_id'],
                    $fieldArray['parameters_hash']
                );
            }

            if (empty($rows)) {
                $connectionForCrawlerQueue = GeneralUtility::makeInstance(ConnectionPool::class)->getConnectionForTable(QueueRepository::TABLE_NAME);
                $connectionForCrawlerQueue->insert(
                    QueueRepository::TABLE_NAME,
                    $fieldArray
                );
                $uid = $connectionForCrawlerQueue->lastInsertId(QueueRepository::TABLE_NAME, 'qid');
                $rows[] = $uid;
                $urlAdded = true;

                $signalPayload = ['uid' => $uid, 'fieldArray' => $fieldArray];
                SignalSlotUtility::emitSignal(
                    self::class,
                    SignalSlotUtility::SIGNAL_URL_ADDED_TO_QUEUE,
                    $signalPayload
                );
            } else {
                // An equal entry is already queued: only tell the listeners about the duplicate.
                $signalPayload = ['rows' => $rows, 'fieldArray' => $fieldArray];
                SignalSlotUtility::emitSignal(
                    self::class,
                    SignalSlotUtility::SIGNAL_DUPLICATE_URL_IN_QUEUE,
                    $signalPayload
                );
            }
        }

        return $urlAdded;
    }
1156
1157
    /**
     * Provides the current system time as Unix timestamp
     *
     * @return int
     */
    public function getCurrentTime()
    {
        $now = time();

        return $now;
    }
1166
1167
    /************************************
1168
     *
1169
     * URL reading
1170
     *
1171
     ************************************/
1172
1173
    /**
     * Read URL for single queue entry
     *
     * @param integer $queueId
     * @param boolean $force If set, will process even if exec_time has been set!
     *
     * @return int|null NULL if no matching queue entry was found, otherwise 0 with the
     *                  self::CLI_STATUS_POLLABLE_PROCESSED bit set when a registered pollable reported success
     */
    public function readUrl($queueId, $force = false)
    {
        $queryBuilder = GeneralUtility::makeInstance(ConnectionPool::class)->getQueryBuilderForTable(QueueRepository::TABLE_NAME);
        $ret = 0;
        // The logger is injected from outside and may be missing, e.g. when the object was created with "new".
        if ($this->logger !== null) {
            $this->logger->debug('crawler-readurl start ' . microtime(true));
        }

        $queryBuilder
            ->select('*')
            ->from(QueueRepository::TABLE_NAME)
            ->where(
                $queryBuilder->expr()->eq('qid', $queryBuilder->createNamedParameter($queueId, PDO::PARAM_INT))
            );
        if (! $force) {
            // Only entries which are not executed yet and are assigned to a process.
            $queryBuilder
                ->andWhere('exec_time = 0')
                ->andWhere('process_scheduled > 0');
        }
        $queueRec = $queryBuilder->execute()->fetch();

        if (! is_array($queueRec)) {
            return null;
        }

        SignalSlotUtility::emitSignal(
            self::class,
            SignalSlotUtility::SIGNAL_QUEUEITEM_PREPROCESS,
            [$queueId, &$queueRec]
        );

        // Set exec_time to lock record:
        $field_array = ['exec_time' => $this->getCurrentTime()];

        if (isset($this->processID)) {
            //if multiprocessing is used we need to store the id of the process which has handled this entry
            $field_array['process_id_completed'] = $this->processID;
        }

        GeneralUtility::makeInstance(ConnectionPool::class)->getConnectionForTable(QueueRepository::TABLE_NAME)
            ->update(
                QueueRepository::TABLE_NAME,
                $field_array,
                ['qid' => (int) $queueId]
            );

        $result = $this->queueExecutor->executeQueueItem($queueRec, $this);
        // Without content there is nothing the pollables could have reported.
        if (($result['content'] ?? null) !== null) {
            /** @var JsonCompatibilityConverter $jsonCompatibilityConverter */
            $jsonCompatibilityConverter = GeneralUtility::makeInstance(JsonCompatibilityConverter::class);
            $resultData = $jsonCompatibilityConverter->convert($result['content']);

            //atm there's no need to point to specific pollable extensions
            $pollSuccess = $GLOBALS['TYPO3_CONF_VARS']['EXTCONF']['crawler']['pollSuccess'] ?? null;
            if (is_array($resultData) && is_array($pollSuccess)) {
                $procInstructions = $resultData['parameters']['procInstructions'] ?? null;
                foreach ($pollSuccess as $pollable) {
                    // only check the success value if the instruction is running
                    // it is important to name the pollSuccess key same as the procInstructions key
                    if (is_array($procInstructions)
                        && in_array($pollable, $procInstructions, true)
                        && ! empty($resultData['success'][$pollable])
                    ) {
                        $ret |= self::CLI_STATUS_POLLABLE_PROCESSED;
                    }
                }
            }
        }

        // Set result in log which also denotes the end of the processing of this entry.
        $field_array = ['result_data' => json_encode($result)];

        SignalSlotUtility::emitSignal(
            self::class,
            SignalSlotUtility::SIGNAL_QUEUEITEM_POSTPROCESS,
            [$queueId, &$field_array]
        );

        GeneralUtility::makeInstance(ConnectionPool::class)->getConnectionForTable(QueueRepository::TABLE_NAME)
            ->update(
                QueueRepository::TABLE_NAME,
                $field_array,
                ['qid' => (int) $queueId]
            );

        if ($this->logger !== null) {
            $this->logger->debug('crawler-readurl stop ' . microtime(true));
        }

        return $ret;
    }
1270
1271
    /**
     * Inserts a not-yet-stored queue entry, executes it right away and saves the result with it
     *
     * @param array $field_array Queue field array,
     *
     * @return array|bool|mixed|string Result handed back by the queue executor
     */
    public function readUrlFromArray($field_array)
    {
        // A set exec_time marks the record as locked from the moment it is inserted.
        $field_array['exec_time'] = $this->getCurrentTime();

        $queueConnection = GeneralUtility::makeInstance(ConnectionPool::class)
            ->getConnectionForTable(QueueRepository::TABLE_NAME);
        $queueConnection->insert(QueueRepository::TABLE_NAME, $field_array);

        // The executor gets the record including its freshly created queue ID.
        $queueId = $queueConnection->lastInsertId(QueueRepository::TABLE_NAME, 'qid');
        $field_array['qid'] = $queueId;
        $result = $this->queueExecutor->executeQueueItem($field_array, $this);

        // Storing the result also denotes the end of the processing of this entry.
        $field_array = ['result_data' => json_encode($result)];

        SignalSlotUtility::emitSignal(
            self::class,
            SignalSlotUtility::SIGNAL_QUEUEITEM_POSTPROCESS,
            [$queueId, &$field_array]
        );

        $queueConnection->update(
            QueueRepository::TABLE_NAME,
            $field_array,
            ['qid' => $queueId]
        );

        return $result;
    }
1307
1308
    /*****************************
1309
     *
1310
     * Compiling URLs to crawl - tools
1311
     *
1312
     *****************************/
1313
1314
    /**
     * Walks the page tree starting at $id and collects the rows rendered for every page found.
     *
     * The request settings are stored on the controller first, because
     * drawURLs_addRowsForPage() reads them from there while rendering.
     *
     * @param integer $id Root page id to start from.
     * @param integer $depth Depth of tree, 0=only id-page, 1= on sublevel, 99 = infinite
     * @param integer $scheduledTime Unix Time when the URL is timed to be visited when put in queue
     * @param integer $reqMinute Number of requests per minute (creates the interleave between requests)
     * @param boolean $submitCrawlUrls If set, submits the URLs to queue in database (real crawling)
     * @param boolean $downloadCrawlUrls If set (and submitcrawlUrls is false) will fill $downloadUrls with entries)
     * @param array $incomingProcInstructions Array of processing instructions
     * @param array $configurationSelection Array of configuration keys
     * @return string Concatenated output of drawURLs_addRowsForPage() for all visited pages
     */
    public function getPageTreeAndUrls(
        $id,
        $depth,
        $scheduledTime,
        $reqMinute,
        $submitCrawlUrls,
        $downloadCrawlUrls,
        array $incomingProcInstructions,
        array $configurationSelection
    ) {
        // Remember the request settings for the row rendering.
        $this->scheduledTime = $scheduledTime;
        $this->reqMinute = $reqMinute;
        $this->submitCrawlUrls = $submitCrawlUrls;
        $this->downloadCrawlUrls = $downloadCrawlUrls;
        $this->incomingProcInstructions = $incomingProcInstructions;
        $this->incomingConfigurationSelection = $configurationSelection;

        // Every run starts with empty tracking lists.
        $this->duplicateTrack = [];
        $this->downloadUrls = [];

        // Drawing tree:
        /* @var PageTreeView $tree */
        $tree = GeneralUtility::makeInstance(PageTreeView::class);
        $permsClause = $this->getBackendUser()->getPagePermsClause(Permission::PAGE_SHOW);
        $tree->init('AND ' . $permsClause);

        // The root page itself becomes the first tree entry, if it is accessible.
        $rootPage = BackendUtility::readPageAccess($id, $permsClause);
        if (is_array($rootPage)) {
            $tree->tree[] = [
                'row' => $rootPage,
                'HTML' => $this->iconFactory->getIconForRecord('pages', $rootPage, Icon::SIZE_SMALL),
            ];
        }

        // Get branch beneath:
        if ($depth) {
            $tree->getTree($id, $depth, '');
        }

        // Traverse page tree:
        $renderedRows = '';

        foreach ($tree->tree as $treeEntry) {
            $this->MP = false;

            // recognize mount points
            // NOTE(review): strict comparison - only matches when doktype has the same type as the constant; confirm.
            if ($treeEntry['row']['doktype'] === PageRepository::DOKTYPE_MOUNTPOINT) {
                // NOTE(review): the result is accessed as a list of rows ([0]); confirm against getPage().
                $mountpage = $this->pageRepository->getPage($treeEntry['row']['uid']);

                // fetch mounted pages
                $this->MP = $mountpage[0]['mount_pid'] . '-' . $treeEntry['row']['uid'];

                $mountTree = GeneralUtility::makeInstance(PageTreeView::class);
                $mountTree->init('AND ' . $permsClause);
                $mountTree->getTree($mountpage[0]['mount_pid'], $depth);

                foreach ($mountTree->tree as $mountEntry) {
                    $mountTitle = $mountEntry['HTML'] . BackendUtility::getRecordTitle('pages', $mountEntry['row'], true);
                    $renderedRows .= $this->drawURLs_addRowsForPage($mountEntry['row'], $mountTitle);
                }

                if ($mountpage[0]['mount_pid_ol']) {
                    // replace page when mount_pid_ol is enabled
                    $treeEntry['row']['uid'] = $mountpage[0]['mount_pid'];
                } else {
                    // if the mount_pid_ol is not set the MP must not be used for the mountpoint page
                    $this->MP = false;
                }
            }

            $entryTitle = $treeEntry['HTML'] . BackendUtility::getRecordTitle('pages', $treeEntry['row'], true);
            $renderedRows .= $this->drawURLs_addRowsForPage($treeEntry['row'], $entryTitle);
        }

        return $renderedRows;
    }
1406
1407
    /**
     * Expands an exclude string into the list of page ids it covers.
     *
     * The string is a comma separated list of entries of the form "<pid>" or "<pid>+<depth>".
     * An entry without "+" covers only the page itself (depth 0). An entry with "+" but an
     * empty depth ("" or "0") is expanded with depth 99.
     *
     * Results are memoised in static caches for the lifetime of the PHP process.
     *
     * @param string $excludeString Exclude string
     * @return array Unique page ids as integers (keys are not re-indexed)
     */
    public function expandExcludeString($excludeString)
    {
        // internal static caches;
        static $expandedExcludeStringCache = [];
        static $treeCache = [];

        // isset() instead of empty(): an empty result is a valid cache entry as well.
        $cacheKey = (string) $excludeString;
        if (isset($expandedExcludeStringCache[$cacheKey])) {
            return $expandedExcludeStringCache[$cacheKey];
        }

        $pidList = [];

        if (! empty($excludeString)) {
            /** @var PageTreeView $tree */
            $tree = GeneralUtility::makeInstance(PageTreeView::class);
            $tree->init('AND ' . $this->getBackendUser()->getPagePermsClause(Permission::PAGE_SHOW));

            foreach (GeneralUtility::trimExplode(',', $excludeString) as $excludePart) {
                // A bare "<pid>" has no second segment; default it instead of reading a missing index.
                $segments = GeneralUtility::trimExplode('+', $excludePart);
                $pid = $segments[0];
                $depth = $segments[1] ?? '';

                // default is "page only" = "depth=0"
                if (empty($depth)) {
                    $depth = strpos($excludePart, '+') !== false ? 99 : 0;
                }

                $pidList[] = (int) $pid;

                if ($depth > 0) {
                    // Subtrees are cached per pid and depth; an empty subtree is cached too.
                    if (! isset($treeCache[$pid][$depth])) {
                        $tree->reset();
                        $tree->getTree($pid, $depth);
                        $treeCache[$pid][$depth] = $tree->tree;
                    }

                    foreach ($treeCache[$pid][$depth] as $data) {
                        $pidList[] = (int) $data['row']['uid'];
                    }
                }
            }
        }

        $expandedExcludeStringCache[$cacheKey] = array_unique($pidList);

        return $expandedExcludeStringCache[$cacheKey];
    }
1458
1459
    /**
     * Create the rows for display of the page tree
     * For each page a number of rows are shown displaying GET variable configuration
     *
     * @param array $pageRow Page record; its "uid" is matched against the exclude list of each configuration
     * @param string $pageTitle HTML for the title column, inserted into the output without further escaping
     * @return string HTML table rows for this page
     */
    public function drawURLs_addRowsForPage(array $pageRow, string $pageTitle): string
    {
        $skipMessage = '';

        // Get list of configurations
        $configurations = $this->getUrlsForPageRow($pageRow, $skipMessage);
        $configurations = ConfigurationService::removeDisallowedConfigurations($this->incomingConfigurationSelection, $configurations);

        // expandExcludeString() returns integers and the lookup below is strict,
        // so the uid has to be an integer as well to be found.
        $pageId = (int) $pageRow['uid'];

        // Traverse parameter combinations:
        $c = 0;
        $content = '';
        if (! empty($configurations)) {
            $configurationCount = count($configurations);

            foreach ($configurations as $confKey => $confArray) {
                $subCfg = $confArray['subCfg'] ?? [];
                // Array keys may be integers; htmlspecialchars() only accepts strings under strict_types.
                $confKeyHtml = htmlspecialchars((string) $confKey);

                // Title column: only the first row carries the page title, spanning all rows of the page.
                if ($c === 0) {
                    $titleClm = '<td rowspan="' . $configurationCount . '">' . $pageTitle . '</td>';
                } else {
                    $titleClm = '';
                }

                if (! in_array($pageId, $this->expandExcludeString($subCfg['exclude'] ?? ''), true)) {

                    // URL list:
                    $urlList = $this->urlListFromUrlArray(
                        $confArray,
                        $pageRow,
                        $this->scheduledTime,
                        $this->reqMinute,
                        $this->submitCrawlUrls,
                        $this->downloadCrawlUrls,
                        $this->duplicateTrack,
                        $this->downloadUrls,
                        // if empty the urls won't be filtered by processing instructions
                        $this->incomingProcInstructions
                    );

                    // Expanded parameters:
                    $paramExpanded = '';
                    $calcAccu = [];
                    $calcRes = 1;
                    foreach ($confArray['paramExpanded'] as $gVar => $gVal) {
                        $valueCount = count($gVal);
                        $paramExpanded .= '
                            <tr>
                                <td>' . htmlspecialchars('&' . $gVar . '=') . '<br/>' .
                            '(' . $valueCount . ')' .
                            '</td>
                                <td nowrap="nowrap">' . nl2br(htmlspecialchars(implode(chr(10), $gVal))) . '</td>
                            </tr>
                        ';
                        $calcRes *= $valueCount;
                        $calcAccu[] = $valueCount;
                    }
                    $paramExpanded = '<table>' . $paramExpanded . '</table>';
                    $paramExpanded .= 'Comb: ' . implode('*', $calcAccu) . '=' . $calcRes;

                    // Options: both values are plain text and therefore escaped for the HTML output.
                    $optionValues = '';
                    if (! empty($subCfg['userGroups'])) {
                        $optionValues .= 'User Groups: ' . htmlspecialchars((string) $subCfg['userGroups']) . '<br/>';
                    }
                    if (! empty($subCfg['procInstrFilter'])) {
                        $optionValues .= 'ProcInstr: ' . htmlspecialchars((string) $subCfg['procInstrFilter']) . '<br/>';
                    }

                    // Compile row:
                    $content .= '
                        <tr>
                            ' . $titleClm . '
                            <td>' . $confKeyHtml . '</td>
                            <td>' . nl2br(htmlspecialchars(rawurldecode(trim(str_replace('&', chr(10) . '&', GeneralUtility::implodeArrayForUrl('', $confArray['paramParsed'])))))) . '</td>
                            <td>' . $paramExpanded . '</td>
                            <td nowrap="nowrap">' . $urlList . '</td>
                            <td nowrap="nowrap">' . $optionValues . '</td>
                            <td nowrap="nowrap">' . DebugUtility::viewArray($subCfg['procInstrParams.'] ?? null) . '</td>
                        </tr>';
                } else {
                    $content .= '<tr>
                            ' . $titleClm . '
                            <td>' . $confKeyHtml . '</td>
                            <td colspan="5"><em>No entries</em> (Page is excluded in this configuration)</td>
                        </tr>';
                }

                $c++;
            }
        } else {
            // No configuration applies to this page; show the reason when one was given.
            $message = ! empty($skipMessage) ? ' (' . $skipMessage . ')' : '';

            // Compile row:
            $content .= '
                <tr>
                    <td>' . $pageTitle . '</td>
                    <td colspan="6"><em>No entries</em>' . $message . '</td>
                </tr>';
        }

        return $content;
    }
1562
1563
    /*****************************
1564
     *
1565
     * CLI functions
1566
     *
1567
     *****************************/
1568
1569
    /**
     * Running the functionality of the CLI (crawling URLs from queue)
     *
     * Fetches up to $countInARun queue entries, assigns them to the current process id
     * and reads them one after the other.
     *
     * @param int $countInARun Number of queue entries to fetch for this run
     * @param int $sleepTime Value passed to usleep() after each processed entry
     * @param int $sleepAfterFinish Value passed to sleep() once the batch has been handled
     * @return int Bit mask made of the readUrl() results and the CLI_STATUS_* flags set here
     */
    public function CLI_run(int $countInARun, int $sleepTime, int $sleepAfterFinish): int
    {
        $result = 0;
        $counter = 0;

        // First, run hooks:
        $hookClasses = $GLOBALS['TYPO3_CONF_VARS']['EXTCONF']['crawler']['cli_hooks'] ?? [];
        foreach ($hookClasses as $objRef) {
            trigger_error(
                'This hook (crawler/cli_hooks) is deprecated since 9.1.5 and will be removed when dropping support for TYPO3 9LTS and 10LTS',
                E_USER_DEPRECATED
            );
            $hookObj = GeneralUtility::makeInstance($objRef);
            if (is_object($hookObj)) {
                $hookObj->crawler_init($this);
            }
        }

        // Clean up the queue
        $this->queueRepository->cleanupQueue();

        // Select entries:
        $rows = $this->queueRepository->fetchRecordsToBeCrawled($countInARun);

        if (! empty($rows)) {
            $quidList = [];
            foreach ($rows as $queueRow) {
                $quidList[] = $queueRow['qid'];
            }

            $processId = $this->CLI_buildProcessId();

            //save the number of assigned queue entries to determine how many have been processed later
            $numberOfAffectedRows = $this->queueRepository->updateProcessIdAndSchedulerForQueueIds($quidList, $processId);
            $this->processRepository->updateProcessAssignItemsCount($numberOfAffectedRows, $processId);

            // Not every selected entry could be assigned to this process: give up on the batch.
            if ($numberOfAffectedRows !== count($quidList)) {
                return $result | self::CLI_STATUS_ABORTED;
            }

            foreach ($rows as $queueRow) {
                $result |= $this->readUrl($queueRow['qid']);
                $counter++;

                // Just to relax the system
                usleep($sleepTime);

                // if during the start and the current read url the cli has been disable we need to return from the function
                // mark the process NOT as ended.
                if ($this->crawler->isDisabled()) {
                    return $result | self::CLI_STATUS_ABORTED;
                }

                if (! $this->processRepository->isProcessActive($this->CLI_buildProcessId())) {
                    $this->CLI_debug('conflict / timeout (' . $this->CLI_buildProcessId() . ')');
                    $result |= self::CLI_STATUS_ABORTED;
                    //possible timeout
                    break;
                }
            }

            sleep($sleepAfterFinish);
        }

        if ($counter > 0) {
            $result |= self::CLI_STATUS_PROCESSED;
        }

        return $result;
    }
1642
1643
    /**
     * Activate hooks
     *
     * Instantiates every class registered under EXTCONF/crawler/cli_hooks and calls
     * crawler_init() on it with this controller.
     *
     * @deprecated
     */
    public function CLI_runHooks(): void
    {
        $hookClasses = $GLOBALS['TYPO3_CONF_VARS']['EXTCONF']['crawler']['cli_hooks'] ?? [];

        foreach ($hookClasses as $hookClass) {
            $hook = GeneralUtility::makeInstance($hookClass);
            if (! is_object($hook)) {
                continue;
            }

            $hook->crawler_init($this);
        }
    }
1656
1657
    /**
     * Try to acquire a new process with the given id
     * also performs some auto-cleanup for orphan processes
     *
     * Active processes whose ttl lies in the past count as orphans: they do not count
     * towards the process limit and are released at the end of this call.
     *
     * @param string $id identification string for the process
     * @return boolean true when the process was added, false when the limit is reached
     *                 or the system process id could not be determined
     * @todo preemption might not be the most elegant way to clean up
     */
    public function CLI_checkAndAcquireNewProcess($id)
    {
        $systemProcessId = getmypid();
        if (! $systemProcessId) {
            return false;
        }

        $activeProcesses = $this->processRepository->findAllActive();
        $currentTime = $this->getCurrentTime();

        $runningProcesses = 0;
        $orphanProcesses = [];

        /** @var Process $process */
        foreach ($activeProcesses as $process) {
            if ($process->getTtl() < $currentTime) {
                $orphanProcesses[] = $process->getProcessId();
                continue;
            }

            $runningProcesses++;
        }

        // if there are less than allowed active processes then add a new one
        $acquired = $runningProcesses < (int) $this->extensionSettings['processLimit'];
        if ($acquired) {
            $this->processRepository->addProcess($id, $systemProcessId);
        }

        // The clean-up runs regardless of whether a process was acquired.
        $this->processRepository->deleteProcessesMarkedAsDeleted();
        $this->processRepository->markRequestedProcessesAsNotActive($orphanProcesses);
        $this->queueRepository->unsetProcessScheduledAndProcessIdForQueueEntries($orphanProcesses);

        return $acquired;
    }
1701
1702
    /**
     * Release a process and the required resources
     *
     * Besides the requested ids, queue entries that belong to inactive processes are released
     * and the system process id of inactive processes with unprocessed queue entries is reset.
     *
     * @param mixed $releaseIds string with a single process-id or array with multiple process-ids
     * @return boolean false when an empty array was passed (nothing to release), true otherwise
     * @deprecated
     */
    public function CLI_releaseProcesses($releaseIds)
    {
        if (! is_array($releaseIds)) {
            $releaseIds = [$releaseIds];
        }

        if (empty($releaseIds)) {
            //nothing to release
            return false;
        }

        // some kind of 2nd chance algo - this way you need at least 2 processes to have a real cleanup
        // this ensures that a single process can't mess up the entire process table

        // mark all processes as deleted which have no "waiting" queue-entries and which are not active

        // Each statement gets its own query builder, so that no query part or parameter
        // of the first UPDATE can leak into the second one.
        $connectionPool = GeneralUtility::makeInstance(ConnectionPool::class);

        // ReleaseQueueEntries
        $connectionPool->getQueryBuilderForTable(QueueRepository::TABLE_NAME)
            ->update(QueueRepository::TABLE_NAME, 'q')
            ->where(
                'q.process_id IN(SELECT p.process_id FROM tx_crawler_process as p WHERE p.active = 0)'
            )
            ->set('q.process_scheduled', 0)
            ->set('q.process_id', '')
            ->execute();

        // ReleaseProcessEntries
        // Built for the queue table on purpose: the original statement ran on that connection, too.
        $processQueryBuilder = $connectionPool->getQueryBuilderForTable(QueueRepository::TABLE_NAME);
        $processQueryBuilder
            ->update(ProcessRepository::TABLE_NAME)
            ->where(
                $processQueryBuilder->expr()->eq('active', 0),
                'process_id IN(SELECT q.process_id FROM tx_crawler_queue as q WHERE q.exec_time = 0)'
            )
            ->set('system_process_id', 0)
            ->execute();

        $this->processRepository->markRequestedProcessesAsNotActive($releaseIds);
        $this->queueRepository->unsetProcessScheduledAndProcessIdForQueueEntries($releaseIds);

        return true;
    }
1755
1756
    /**
     * Create a unique Id for the current process
     *
     * The id is generated on the first call and stored in $this->processID;
     * later calls return the stored value.
     *
     * @return string the ID
     */
    public function CLI_buildProcessId()
    {
        if ($this->processID) {
            return $this->processID;
        }

        $this->processID = GeneralUtility::shortMD5(microtime(true));

        return $this->processID;
    }
1768
1769
    /**
     * Prints a message to the stdout (only if debug-mode is enabled)
     *
     * Debug mode is read from the "processDebug" extension setting.
     *
     * @param string $msg the message
     * @deprecated
     * @codeCoverageIgnore
     */
    public function CLI_debug($msg): void
    {
        $debugEnabled = (int) $this->extensionSettings['processDebug'] !== 0;
        if (! $debugEnabled) {
            return;
        }

        echo $msg . "\n";
        flush();
    }
1783
1784
    /**
     * Cleans up entries that stayed for too long in the queue. These are:
     * - processed entries older than the "cleanUpProcessedAge" setting (in days)
     * - scheduled entries older than the "cleanUpScheduledAge" setting (in days)
     *
     * @deprecated
     */
    public function cleanUpOldQueueEntries(): void
    {
        // 24*60*60 Seconds in 24 hours
        $secondsPerDay = 86400;

        $processedAgeInSeconds = $this->extensionSettings['cleanUpProcessedAge'] * $secondsPerDay;
        $scheduledAgeInSeconds = $this->extensionSettings['cleanUpScheduledAge'] * $secondsPerDay;

        $now = time();
        $processedBefore = $now - $processedAgeInSeconds;
        $scheduledBefore = $now - $scheduledAgeInSeconds;

        $this->flushQueue(
            '(exec_time<>0 AND exec_time<' . $processedBefore . ') OR scheduled<=' . $scheduledBefore
        );
    }
1801
1802
    /**
     * Removes queue entries
     *
     * Before deleting, SIGNAL_QUEUE_ENTRY_FLUSH is emitted once per distinct set_id with the
     * matching entries (qid and set_id) as "subSet".
     *
     * @param string $where SQL related filter for the entries which should be removed. It is put
     *                      into the statements as raw SQL; an empty filter removes all entries.
     *
     * @deprecated
     */
    protected function flushQueue($where = ''): void
    {
        $realWhere = (string) $where !== '' ? $where : '1=1';

        // Every statement uses a fresh query builder, so that no query part or parameter
        // is carried over from one statement to the next.
        $groups = $this->getQueryBuilder(QueueRepository::TABLE_NAME)
            ->selectLiteral('DISTINCT set_id')
            ->from(QueueRepository::TABLE_NAME)
            ->where($realWhere)
            ->execute()
            ->fetchAll();

        if (is_array($groups)) {
            foreach ($groups as $group) {
                $subSetQueryBuilder = $this->getQueryBuilder(QueueRepository::TABLE_NAME);
                $subSet = $subSetQueryBuilder
                    ->select('qid', 'set_id')
                    ->from(QueueRepository::TABLE_NAME)
                    ->where(
                        $realWhere,
                        // Bound as a parameter instead of being written into the SQL string.
                        $subSetQueryBuilder->expr()->eq(
                            'set_id',
                            $subSetQueryBuilder->createNamedParameter($group['set_id'], PDO::PARAM_INT)
                        )
                    )
                    ->execute()
                    ->fetchAll();

                $payLoad = ['subSet' => $subSet];
                SignalSlotUtility::emitSignal(
                    self::class,
                    SignalSlotUtility::SIGNAL_QUEUE_ENTRY_FLUSH,
                    $payLoad
                );
            }
        }

        $this->getQueryBuilder(QueueRepository::TABLE_NAME)
            ->delete(QueueRepository::TABLE_NAME)
            ->where($realWhere)
            ->execute();
    }
1847
1848
    /**
     * This method determines duplicates for a queue entry with the same parameters and this timestamp.
     * If the timestamp is in the past, it will check if there is any unprocessed queue entry in the past.
     * If the timestamp is in the future it will check, if the queued entry has exactly the same timestamp
     *
     * Only entries without exec_time and without process_id are considered.
     *
     * @param int $tstamp Scheduled time of the entry to check
     * @param array $fieldArray Queue fields; "page_id" and "parameters_hash" are used for the comparison
     *
     * @return array qids of the duplicate queue entries (empty if there are none)
     * @deprecated
     */
    protected function getDuplicateRowsIfExist($tstamp, $fieldArray)
    {
        $rows = [];

        $currentTime = $this->getCurrentTime();

        // Builder and FROM clause refer to the same table constant.
        $queryBuilder = GeneralUtility::makeInstance(ConnectionPool::class)
            ->getQueryBuilderForTable(QueueRepository::TABLE_NAME);
        $queryBuilder
            ->select('qid')
            ->from(QueueRepository::TABLE_NAME);

        //if this entry is scheduled with "now"
        if ($tstamp <= $currentTime) {
            if ($this->extensionSettings['enableTimeslot']) {
                // With time slots enabled, entries scheduled within 100 seconds around now match as well.
                $timeBegin = $currentTime - 100;
                $timeEnd = $currentTime + 100;
                $queryBuilder
                    ->where(
                        'scheduled BETWEEN ' . $timeBegin . ' AND ' . $timeEnd
                    )
                    ->orWhere(
                        $queryBuilder->expr()->lte('scheduled', $currentTime)
                    );
            } else {
                $queryBuilder
                    ->where(
                        $queryBuilder->expr()->lte('scheduled', $currentTime)
                    );
            }
        } elseif ($tstamp > $currentTime) {
            //entry with a timestamp in the future need to have the same schedule time
            $queryBuilder
                ->where(
                    $queryBuilder->expr()->eq('scheduled', $tstamp)
                );
        }

        $queryBuilder
            ->andWhere('NOT exec_time')
            ->andWhere('NOT process_id')
            ->andWhere($queryBuilder->expr()->eq('page_id', $queryBuilder->createNamedParameter($fieldArray['page_id'], PDO::PARAM_INT)))
            ->andWhere($queryBuilder->expr()->eq('parameters_hash', $queryBuilder->createNamedParameter($fieldArray['parameters_hash'], PDO::PARAM_STR)));

        $statement = $queryBuilder->execute();

        while ($row = $statement->fetch()) {
            $rows[] = $row['qid'];
        }

        return $rows;
    }
1909
1910
    /**
     * Returns a md5 hash generated from a serialized configuration array.
     *
     * The keys "paramExpanded" and "URLs" are removed before hashing and
     * therefore have no influence on the result.
     *
     * @return string
     */
    protected function getConfigurationHash(array $configuration)
    {
        unset($configuration['paramExpanded'], $configuration['URLs']);

        $serializedConfiguration = serialize($configuration);

        return md5($serializedConfiguration);
    }
1921
1922
    /**
     * Build a URL from a Page and the Query String. If the page has a Site configuration, it can be built by using
     * the Site instance.
     *
     * All arguments are passed on unchanged to a new UrlService instance.
     *
     * @param int $httpsOrHttp see tx_crawler_configuration.force_ssl
     * @throws SiteNotFoundException
     * @throws InvalidRouteArgumentsException
     *
     * @deprecated Using CrawlerController::getUrlFromPageAndQueryParameters() is deprecated since 9.1.1 and will be removed in v11.x, please use UrlService->getUrlFromPageAndQueryParameters() instead.
     * @codeCoverageIgnore
     */
    protected function getUrlFromPageAndQueryParameters(int $pageId, string $queryString, ?string $alternativeBaseUrl, int $httpsOrHttp): UriInterface
    {
        return (new UrlService())->getUrlFromPageAndQueryParameters(
            $pageId,
            $queryString,
            $alternativeBaseUrl,
            $httpsOrHttp
        );
    }
1938
1939
    /**
     * Makes sure that index 1 does not hold a larger value than index 2.
     *
     * @param array $reg Array with the two values to compare at index 1 and 2
     * @return array The array with the values at index 1 and 2 swapped if the first was larger
     */
    protected function swapIfFirstIsLargerThanSecond(array $reg): array
    {
        if ($reg[1] <= $reg[2]) {
            return $reg;
        }

        // First is larger than last: exchange both values.
        [$reg[1], $reg[2]] = [$reg[2], $reg[1]];

        return $reg;
    }
1950
1951
    /**
     * Returns the value of the maximumUrlsToCompile property.
     */
    private function getMaximumUrlsToCompile(): int
    {
        $maximumUrls = $this->maximumUrlsToCompile;

        return $maximumUrls;
    }
1955
1956
    /**
     * Returns the backend user, taking it from $GLOBALS['BE_USER'] on first use.
     *
     * @return BackendUserAuthentication
     */
    private function getBackendUser()
    {
        // Make sure the _cli_ user is loaded
        Bootstrap::initializeBackendAuthentication();

        if ($this->backendUser !== null) {
            return $this->backendUser;
        }

        $this->backendUser = $GLOBALS['BE_USER'];

        return $this->backendUser;
    }
1968
1969
    /**
     * Get querybuilder for given table
     *
     * @param string $table Name of the table the query builder is requested for
     * @return QueryBuilder
     */
    private function getQueryBuilder(string $table)
    {
        $connectionPool = GeneralUtility::makeInstance(ConnectionPool::class);

        return $connectionPool->getQueryBuilderForTable($table);
    }
1978
}
1979