Passed
Push — Cleanup/CrawlerController ( e10c7c...cb7b19 )
by Tomas Norre
16:53
created

CrawlerController::expandExcludeString()   A

Complexity

Conditions 1
Paths 1

Size

Total Lines 3
Code Lines 1

Duplication

Lines 0
Ratio 0 %

Code Coverage

Tests 0
CRAP Score 2

Importance

Changes 1
Bugs 0 Features 0
Metric Value
cc 1
eloc 1
nc 1
nop 1
dl 0
loc 3
rs 10
c 1
b 0
f 0
ccs 0
cts 2
cp 0
crap 2
1
<?php
2
3
declare(strict_types=1);
4
5
namespace AOE\Crawler\Controller;
6
7
/***************************************************************
8
 *  Copyright notice
9
 *
10
 *  (c) 2020 AOE GmbH <[email protected]>
11
 *
12
 *  All rights reserved
13
 *
14
 *  This script is part of the TYPO3 project. The TYPO3 project is
15
 *  free software; you can redistribute it and/or modify
16
 *  it under the terms of the GNU General Public License as published by
17
 *  the Free Software Foundation; either version 3 of the License, or
18
 *  (at your option) any later version.
19
 *
20
 *  The GNU General Public License can be found at
21
 *  http://www.gnu.org/copyleft/gpl.html.
22
 *
23
 *  This script is distributed in the hope that it will be useful,
24
 *  but WITHOUT ANY WARRANTY; without even the implied warranty of
25
 *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
26
 *  GNU General Public License for more details.
27
 *
28
 *  This copyright notice MUST APPEAR in all copies of the script!
29
 ***************************************************************/
30
31
use AOE\Crawler\Configuration\ExtensionConfigurationProvider;
32
use AOE\Crawler\Converter\JsonCompatibilityConverter;
33
use AOE\Crawler\Crawler;
34
use AOE\Crawler\CrawlStrategy\CrawlStrategyFactory;
35
use AOE\Crawler\Domain\Model\Process;
36
use AOE\Crawler\Domain\Repository\ConfigurationRepository;
37
use AOE\Crawler\Domain\Repository\ProcessRepository;
38
use AOE\Crawler\Domain\Repository\QueueRepository;
39
use AOE\Crawler\QueueExecutor;
40
use AOE\Crawler\Service\ConfigurationService;
41
use AOE\Crawler\Service\PageService;
42
use AOE\Crawler\Service\UrlService;
43
use AOE\Crawler\Utility\SignalSlotUtility;
44
use AOE\Crawler\Value\QueueFilter;
45
use PDO;
46
use Psr\Http\Message\UriInterface;
47
use Psr\Log\LoggerAwareInterface;
48
use Psr\Log\LoggerAwareTrait;
49
use TYPO3\CMS\Backend\Tree\View\PageTreeView;
50
use TYPO3\CMS\Backend\Utility\BackendUtility;
51
use TYPO3\CMS\Core\Authentication\BackendUserAuthentication;
52
use TYPO3\CMS\Core\Compatibility\PublicMethodDeprecationTrait;
53
use TYPO3\CMS\Core\Compatibility\PublicPropertyDeprecationTrait;
54
use TYPO3\CMS\Core\Core\Bootstrap;
55
use TYPO3\CMS\Core\Core\Environment;
56
use TYPO3\CMS\Core\Database\Connection;
57
use TYPO3\CMS\Core\Database\ConnectionPool;
58
use TYPO3\CMS\Core\Database\Query\QueryBuilder;
59
use TYPO3\CMS\Core\Database\Query\Restriction\DeletedRestriction;
60
use TYPO3\CMS\Core\Database\QueryGenerator;
61
use TYPO3\CMS\Core\Domain\Repository\PageRepository;
62
use TYPO3\CMS\Core\Exception\SiteNotFoundException;
63
use TYPO3\CMS\Core\Imaging\Icon;
64
use TYPO3\CMS\Core\Imaging\IconFactory;
65
use TYPO3\CMS\Core\Routing\InvalidRouteArgumentsException;
66
use TYPO3\CMS\Core\Site\Entity\Site;
67
use TYPO3\CMS\Core\Type\Bitmask\Permission;
68
use TYPO3\CMS\Core\Utility\DebugUtility;
69
use TYPO3\CMS\Core\Utility\GeneralUtility;
70
use TYPO3\CMS\Core\Utility\MathUtility;
71
use TYPO3\CMS\Extbase\Object\ObjectManager;
72
73
/**
74
 * Class CrawlerController
75
 *
76
 * @package AOE\Crawler\Controller
77
 */
78
class CrawlerController implements LoggerAwareInterface
79
{
80
    use LoggerAwareTrait;
81
    use PublicMethodDeprecationTrait;
82
    use PublicPropertyDeprecationTrait;
83
84
    public const CLI_STATUS_NOTHING_PROCCESSED = 0;
85
86
    //queue not empty
87
    public const CLI_STATUS_REMAIN = 1;
88
89
    //(some) queue items were processed
90
    public const CLI_STATUS_PROCESSED = 2;
91
92
    //instance didn't finish
93
    public const CLI_STATUS_ABORTED = 4;
94
95
    public const CLI_STATUS_POLLABLE_PROCESSED = 8;
96
97
    /**
98
     * @var integer
99
     */
100
    public $setID = 0;
101
102
    /**
103
     * @var string
104
     */
105
    public $processID = '';
106
107
    /**
108
     * @var array
109
     */
110
    public $duplicateTrack = [];
111
112
    /**
113
     * @var array
114
     */
115
    public $downloadUrls = [];
116
117
    /**
118
     * @var array
119
     */
120
    public $incomingProcInstructions = [];
121
122
    /**
123
     * @var array
124
     */
125
    public $incomingConfigurationSelection = [];
126
127
    /**
128
     * @var bool
129
     */
130
    public $registerQueueEntriesInternallyOnly = false;
131
132
    /**
133
     * @var array
134
     */
135
    public $queueEntries = [];
136
137
    /**
138
     * @var array
139
     */
140
    public $urlList = [];
141
142
    /**
143
     * @var array
144
     */
145
    public $extensionSettings = [];
146
147
    /**
148
     * Mount Point
149
     *
150
     * @var bool
151
     * Todo: Check what this is used for and adjust the type hint or code, as bool doesn't match the current code.
152
     */
153
    public $MP = false;
154
155
    /**
156
     * @var string
157
     * @deprecated
158
     */
159
    protected $processFilename;
160
161
    /**
162
     * Holds the internal access mode can be 'gui','cli' or 'cli_im'
163
     *
164
     * @var string
165
     * @deprecated
166
     */
167
    protected $accessMode;
168
169
    /**
170
     * @var QueueRepository
171
     */
172
    protected $queueRepository;
173
174
    /**
175
     * @var ProcessRepository
176
     */
177
    protected $processRepository;
178
179
    /**
180
     * @var ConfigurationRepository
181
     */
182
    protected $configurationRepository;
183
184
    /**
185
     * @var string
186
     * @deprecated Since v9.2.5 - This will be removed in v10
187
     */
188
    protected $tableName = 'tx_crawler_queue';
189
190
    /**
191
     * @var QueueExecutor
192
     */
193
    protected $queueExecutor;
194
195
    /**
196
     * @var int
197
     */
198
    protected $maximumUrlsToCompile = 10000;
199
200
    /**
201
     * @var IconFactory
202
     */
203
    protected $iconFactory;
204
205
    /**
     * Deprecation messages keyed by the name of the deprecated public method.
     *
     * NOTE(review): presumably read by PublicMethodDeprecationTrait (used by this class)
     * to emit the message when the method is called from outside - confirm against the trait.
     *
     * @var string[]
     */
    private $deprecatedPublicMethods = [
        'compileUrls' => 'Using CrawlerController->compileUrls() is deprecated since 9.2.5, and will be removed in v11.x',
        'cleanUpOldQueueEntries' => 'Using CrawlerController::cleanUpOldQueueEntries() is deprecated since 9.0.1 and will be removed in v11.x, please use QueueRepository->cleanUpOldQueueEntries() instead.',
        'CLI_debug' => 'Using CrawlerController->CLI_debug() is deprecated since 9.1.3 and will be removed in v11.x',
        'CLI_releaseProcesses' => 'Using CrawlerController->CLI_releaseProcesses() is deprecated since 9.2.2 and will be removed in v11.x',
        'CLI_runHooks' => 'Using CrawlerController->CLI_runHooks() is deprecated since 9.1.5 and will be removed in v11.x',
        'expandExcludeString' => 'Using CrawlerController->expandExcludeString() is deprecated since 9.2.5 and will be removed in v11.x',
        'getAccessMode' => 'Using CrawlerController->getAccessMode() is deprecated since 9.1.3 and will be removed in v11.x',
        'getLogEntriesForPageId' => 'Using CrawlerController->getLogEntriesForPageId() is deprecated since 9.1.5 and will be remove in v11.x',
        'getLogEntriesForSetId' => 'Using crawlerController::getLogEntriesForSetId() is deprecated since 9.0.1 and will be removed in v11.x',
        // The message must name the method it is registered for (it previously named getLogEntriesForPageId()).
        'hasGroupAccess' => 'Using CrawlerController->hasGroupAccess() is deprecated since 9.2.2 and will be remove in v11.x, please use UserService::hasGroupAccess() instead.',
        'flushQueue' => 'Using CrawlerController::flushQueue() is deprecated since 9.0.1 and will be removed in v11.x, please use QueueRepository->flushQueue() instead.',
        'setAccessMode' => 'Using CrawlerController->setAccessMode() is deprecated since 9.1.3 and will be removed in v11.x',
        'getDisabled' => 'Using CrawlerController->getDisabled() is deprecated since 9.1.3 and will be removed in v11.x, please use Crawler->isDisabled() instead',
        'setDisabled' => 'Using CrawlerController->setDisabled() is deprecated since 9.1.3 and will be removed in v11.x, please use Crawler->setDisabled() instead',
        'getProcessFilename' => 'Using CrawlerController->getProcessFilename() is deprecated since 9.1.3 and will be removed in v11.x',
        'setProcessFilename' => 'Using CrawlerController->setProcessFilename() is deprecated since 9.1.3 and will be removed in v11.x',
        'getDuplicateRowsIfExist' => 'Using CrawlerController->getDuplicateRowsIfExist() is deprecated since 9.1.4 and will be remove in v11.x, please use QueueRepository->getDuplicateQueueItemsIfExists() instead',
        'checkIfPageShouldBeSkipped' => 'Using CrawlerController->checkIfPageShouldBeSkipped() is deprecated since 9.2.5 and will be removed in v11.x',
        'swapIfFirstIsLargerThanSecond' => 'Using CrawlerController->swapIfFirstIsLargerThanSecond() is deprecated since 9.2.5, and will be removed in v11.x',
        'expandParameters' => 'Using CrawlerController->expandParameters() is deprecated since 9.2.5, and will be removed in v11.x',
    ];
230
231 41
    /**
     * Deprecation messages keyed by the name of the deprecated (formerly public) property.
     *
     * NOTE(review): presumably read by PublicPropertyDeprecationTrait (used by this class) - confirm.
     *
     * @var string[]
     */
    private $deprecatedPublicProperties = [
        'accessMode' => 'Using CrawlerController->accessMode is deprecated since 9.1.3 and will be removed in v11.x',
        // The message must name the property it is registered for (it previously named accessMode).
        'processFilename' => 'Using CrawlerController->processFilename is deprecated since 9.1.3 and will be removed in v11.x',
    ];
238 41
239
    /**
240
     * @var BackendUserAuthentication|null
241 41
     */
242 41
    private $backendUser;
243 41
244
    /**
245
     * @var integer
246 41
     */
247
    private $scheduledTime = 0;
248
249
    /**
250 41
     * @var integer
251 41
     */
252 41
    private $reqMinute = 0;
253
254
    /**
255
     * @var bool
256
     */
257
    private $submitCrawlUrls = false;
258
259 1
    /**
260
     * @var bool
261 1
     */
262
    private $downloadCrawlUrls = false;
263
264
    /**
265
     * @var PageRepository
266
     */
267 1
    private $pageRepository;
268
269 1
    /**
270 1
     * @var Crawler
271
     */
272
    private $crawler;
273
274
    /**
275
     * @var ConfigurationService
276
     */
277 2
    private $configurationService;
278
279 2
    /**
280 1
     * @var UrlService
281
     */
282 1
    private $urlService;
283 1
284
    /************************************
285
     *
286 2
     * Getting URLs based on Page TSconfig
287
     *
288
     ************************************/
289
290
    /**
     * Instantiates the repositories and services this controller works with and
     * normalises the extension settings.
     *
     * Setting keys that are absent from the extension configuration are read as 0,
     * so an empty configuration no longer raises "undefined index" notices:
     * - countInARun is set to 100 when it does not convert to a positive integer
     * - processLimit is forced into 1..99 (default argument 1)
     * - maxCompileUrls is forced into 1..1000000000 (default argument 10000)
     */
    public function __construct()
    {
        $objectManager = GeneralUtility::makeInstance(ObjectManager::class);
        $crawlStrategyFactory = GeneralUtility::makeInstance(CrawlStrategyFactory::class);
        $this->queueRepository = $objectManager->get(QueueRepository::class);
        $this->processRepository = $objectManager->get(ProcessRepository::class);
        $this->configurationRepository = $objectManager->get(ConfigurationRepository::class);
        $this->pageRepository = GeneralUtility::makeInstance(PageRepository::class);
        $this->queueExecutor = GeneralUtility::makeInstance(QueueExecutor::class, $crawlStrategyFactory);
        $this->iconFactory = GeneralUtility::makeInstance(IconFactory::class);
        $this->crawler = GeneralUtility::makeInstance(Crawler::class);
        $this->configurationService = GeneralUtility::makeInstance(ConfigurationService::class);
        $this->urlService = GeneralUtility::makeInstance(UrlService::class);

        $this->processFilename = Environment::getVarPath() . '/lock/tx_crawler.proc';

        /** @var ExtensionConfigurationProvider $configurationProvider */
        $configurationProvider = GeneralUtility::makeInstance(ExtensionConfigurationProvider::class);
        $settings = $configurationProvider->getExtensionConfiguration();
        $this->extensionSettings = is_array($settings) ? $settings : [];

        // "?? 0" keeps the outcome of a missing key identical to the previous null value, minus the notice.
        if (MathUtility::convertToPositiveInteger($this->extensionSettings['countInARun'] ?? 0) === 0) {
            $this->extensionSettings['countInARun'] = 100;
        }

        $this->extensionSettings['processLimit'] = MathUtility::forceIntegerInRange(
            $this->extensionSettings['processLimit'] ?? 0,
            1,
            99,
            1
        );
        $this->setMaximumUrlsToCompile(
            MathUtility::forceIntegerInRange(
                $this->extensionSettings['maxCompileUrls'] ?? 0,
                1,
                1000000000,
                10000
            )
        );
    }
318
319 12
    /**
     * Overrides the stored upper limit of URLs to compile.
     *
     * @param int $maximumUrlsToCompile New value for the maximumUrlsToCompile property
     */
    public function setMaximumUrlsToCompile(int $maximumUrlsToCompile): void
    {
        $this->maximumUrlsToCompile = $maximumUrlsToCompile;
    }
323
324
    /**
     * Returns the internal access mode (documented on the property as 'gui', 'cli' or 'cli_im').
     *
     * @return string
     * @deprecated
     */
    public function getAccessMode()
    {
        $currentMode = $this->accessMode;

        return $currentMode;
    }
334 8
335 1
    /**
     * Stores the internal access mode.
     *
     * @param string $accessMode Mode to store (documented on the property as 'gui', 'cli' or 'cli_im')
     * @deprecated
     */
    public function setAccessMode($accessMode): void
    {
        $this->accessMode = $accessMode;
    }
343 3
344
    /**
     * Toggles the "disabled" marker file that prevents processes from being processed.
     *
     * A truthy value writes the marker file; a falsy value (false or null) removes
     * the file if it currently exists.
     *
     * @deprecated
     */
    public function setDisabled(?bool $disabled = true): void
    {
        if ($disabled) {
            GeneralUtility::writeFile($this->processFilename, 'disabled');

            return;
        }

        if (is_file($this->processFilename)) {
            unlink($this->processFilename);
        }
    }
356
357 8
    /**
     * Reports the disabled status, i.e. whether the marker file exists.
     *
     * @return bool True when the file at $this->processFilename exists
     * @deprecated
     */
    public function getDisabled(): bool
    {
        $markerFile = $this->processFilename;

        return is_file($markerFile);
    }
365
366
    /**
     * Replaces the path of the file used as "disabled" marker.
     *
     * @param string $filenameWithPath File name including its path
     * @deprecated
     */
    public function setProcessFilename($filenameWithPath): void
    {
        $this->processFilename = $filenameWithPath;
    }
374
375
    /**
     * Returns the path of the file used as "disabled" marker.
     *
     * @return string
     * @deprecated
     */
    public function getProcessFilename()
    {
        $markerFile = $this->processFilename;

        return $markerFile;
    }
383
384
    /**
     * Replaces the extension settings wholesale
     * (unserialized counterpart of $TYPO3_CONF_VARS['EXT']['extConf']['crawler']).
     *
     * Unlike the constructor, no normalisation of individual values is applied here.
     *
     * @param array $extensionSettings Settings to store as-is
     */
    public function setExtensionSettings(array $extensionSettings): void
    {
        $this->extensionSettings = $extensionSettings;
    }
391
392 4
    /**
     * Decides whether the given page should be crawled by delegating to PageService.
     *
     * @param array $pageRow Page record handed to PageService unchanged
     * @return false|string false if the page should be crawled (not excluded), true / skipMessage if it should be skipped
     * @deprecated
     */
    public function checkIfPageShouldBeSkipped(array $pageRow)
    {
        return GeneralUtility::makeInstance(PageService::class)->checkIfPageShouldBeSkipped($pageRow);
    }
403
404
    /**
     * Wrapper around getUrlsForPageId() that first checks whether the page may be crawled.
     * It returns an array of configurations and no urls!
     *
     * @param array $pageRow Page record with at least dok-type and uid columns; uid must be an integer.
     * @param string $skipMessage Receives the reason when the page is skipped, otherwise an empty string
     * @return array Configurations for the page, or an empty array when the page is skipped
     * @see getUrlsForPageId()
     */
    public function getUrlsForPageRow(array $pageRow, &$skipMessage = '')
    {
        if (! is_int($pageRow['uid'])) {
            $skipMessage = 'PageUid ' . $pageRow['uid'] . ' was not an integer';

            return [];
        }

        $skipReason = $this->getPageService()->checkIfPageShouldBeSkipped($pageRow);
        if ($skipReason !== false) {
            // The page service vetoed this page; hand its message to the caller.
            $skipMessage = $skipReason;

            return [];
        }

        $configurations = $this->getUrlsForPageId($pageRow['uid']);
        $skipMessage = '';

        return $configurations;
    }
431 2
432
    /**
     * Creates a list of URLs from input array (and submits them to queue if asked for)
     * See Web > Info module script + "indexed_search"'s crawler hook-client using this!
     *
     * Compared to earlier revisions this method
     * - tolerates $reqMinute <= 0 (no interleave is applied instead of dividing by zero),
     * - tolerates missing 'URLs', 'userGroups' and 'procInstrFilter' keys in $vv,
     * - evaluates the processing instruction filter once instead of once per URL.
     *
     * @param array $vv Information about URLs from pageRow to crawl.
     * @param array $pageRow Page row
     * @param int $scheduledTime Unix time to schedule indexing to, typically time()
     * @param int $reqMinute Number of requests per minute (creates the interleave between requests)
     * @param bool $submitCrawlUrls If set, submits the URLs to queue
     * @param bool $downloadCrawlUrls If set (and submitcrawlUrls is false) will fill $downloadUrls with entries)
     * @param array $duplicateTrack Array which is passed by reference and contains an id per url to ensure we will not crawl duplicates
     * @param array $downloadUrls Array which will be filled with URLS for download if flag is set.
     * @param array $incomingProcInstructions Array of processing instructions
     * @return string List of URLs (meant for display in backend module)
     */
    public function urlListFromUrlArray(
        array $vv,
        array $pageRow,
        $scheduledTime,
        $reqMinute,
        $submitCrawlUrls,
        $downloadCrawlUrls,
        array &$duplicateTrack,
        array &$downloadUrls,
        array $incomingProcInstructions
    ) {
        if (! is_array($vv['URLs'] ?? null)) {
            return 'ERROR - no URL generated';
        }
        $urlLog = [];
        $pageId = (int) $pageRow['uid'];
        $configurationHash = $this->getConfigurationHash($vv);
        $skipInnerCheck = $this->queueRepository->noUnprocessedQueueEntriesForPageWithConfigurationHashExist($pageId, $configurationHash);

        $urlService = new UrlService();

        $userGroups = $vv['subCfg']['userGroups'] ?? '';
        $procInstrFilter = $vv['subCfg']['procInstrFilter'] ?? '';

        // The filter only depends on the configuration, not on the individual URL:
        // when it rejects the configuration, none of the URLs would be listed.
        if (! $this->drawURLs_PIfilter($procInstrFilter, $incomingProcInstructions)) {
            return '';
        }

        // Seconds between two requests; 0 when no valid rate is given (avoids a division by zero).
        $secondsPerRequest = $reqMinute > 0 ? 60 / $reqMinute : 0;

        foreach ($vv['URLs'] as $urlQuery) {
            $url = (string) $urlService->getUrlFromPageAndQueryParameters(
                $pageId,
                $urlQuery,
                $vv['subCfg']['baseUrl'] ?? null,
                $vv['subCfg']['force_ssl'] ?? 0
            );

            // Create key by which to determine unique-ness:
            $uKey = $url . '|' . $userGroups . '|' . $procInstrFilter;

            if (isset($duplicateTrack[$uKey])) {
                // If the url key is registered just display it and do not resubmit it
                $urlLog[] = '<em><span class="text-muted">' . htmlspecialchars($url) . '</span></em>';
            } else {
                // Scheduled time, rounded down to the full minute:
                $schTime = $scheduledTime + round(count($duplicateTrack) * $secondsPerRequest);
                $schTime = intval($schTime / 60) * 60;
                $formattedDate = BackendUtility::datetime($schTime);
                $this->urlList[] = '[' . $formattedDate . '] ' . $url;
                $urlList = '[' . $formattedDate . '] ' . htmlspecialchars($url);

                // Submit for crawling!
                if ($submitCrawlUrls) {
                    // NOTE(review): the queue entry is created with $scheduledTime, not with the
                    // interleaved $schTime shown in the list - confirm this is intended.
                    $added = $this->addUrl(
                        $pageId,
                        $url,
                        $vv['subCfg'],
                        $scheduledTime,
                        $configurationHash,
                        $skipInnerCheck
                    );
                    if ($added === false) {
                        $urlList .= ' (URL already existed)';
                    }
                } elseif ($downloadCrawlUrls) {
                    $downloadUrls[$url] = $url;
                }
                $urlLog[] = $urlList;
            }
            $duplicateTrack[$uKey] = true;
        }

        return implode('<br>', $urlLog);
    }
516
517
    /**
     * Tells whether the given processing instruction list contains at least one
     * of the incoming processing instructions.
     *
     * An empty $incomingProcInstructions array means "no filtering" and always yields true.
     *
     * @param string $piString Comma-separated list the incoming instructions are looked up in
     * @param array $incomingProcInstructions Processing instructions
     * @return boolean
     */
    public function drawURLs_PIfilter($piString, array $incomingProcInstructions)
    {
        if ($incomingProcInstructions === []) {
            return true;
        }

        $isRegistered = false;
        foreach ($incomingProcInstructions as $instruction) {
            if (GeneralUtility::inList($piString, $instruction)) {
                $isRegistered = true;
                break;
            }
        }

        return $isRegistered;
    }
537
538
    /**
     * Loads the page TSconfig for a page and lets registered hooks alter it.
     *
     * When $this->MP is set, the TSconfig of the page id found after the first "-"
     * of $this->MP is loaded instead of the one of $id.
     * NOTE(review): $this->MP is declared as bool but is split like a "<id>-<id>" string
     * here - the expected format still needs to be confirmed.
     *
     * Hooks are read from $GLOBALS['TYPO3_CONF_VARS']['EXTCONF']['crawler']['getPageTSconfigForId'];
     * each one is called with 'pageId' and a reference to 'pageTSConfig'.
     *
     * @param int $id Page id
     * @return array Page TSconfig, possibly modified by hooks
     */
    public function getPageTSconfigForId(int $id): array
    {
        $tsConfigPageId = $id;
        if ($this->MP) {
            // Cast explicitly: with strict_types a non-string value would make explode() throw,
            // and a value without "-" yields no second part (treated as 0, as before).
            $mountPointParts = explode('-', (string) $this->MP);
            $tsConfigPageId = (int) ($mountPointParts[1] ?? 0);
        }
        $pageTSconfig = BackendUtility::getPagesTSconfig($tsConfigPageId);

        // Call a hook to alter configuration; an unregistered hook is simply skipped.
        $hooks = $GLOBALS['TYPO3_CONF_VARS']['EXTCONF']['crawler']['getPageTSconfigForId'] ?? null;
        if (is_array($hooks)) {
            $params = [
                'pageId' => $id,
                'pageTSConfig' => &$pageTSconfig,
            ];
            foreach ($hooks as $userFunc) {
                GeneralUtility::callUserFunction($userFunc, $params, $this);
            }
        }

        return $pageTSconfig;
    }
560
561 1
    /**
     * Collects the crawler configurations that apply to a page.
     * Adds no urls!
     *
     * Configurations are gathered from page TSconfig first, then from the database via
     * the configuration service, and finally passed by reference (key 'res') to every
     * function registered in $GLOBALS['TYPO3_CONF_VARS']['EXTCONF']['crawler']['processUrls'].
     *
     * @param int $pageId Page id
     * @return array Configurations for the page
     */
    public function getUrlsForPageId(int $pageId): array
    {
        $tsConfig = $this->getPageTSconfigForId($pageId);

        $mountPoint = '';
        if (is_string($this->MP)) {
            $mountPoint = $this->MP;
        }

        $res = [];
        // Configuration defined in page TSconfig
        $res = $this->configurationService->getConfigurationFromPageTS($tsConfig, $pageId, $res, $mountPoint);
        // Configuration from tx_crawler_configuration records up the rootline
        $res = $this->configurationService->getConfigurationFromDatabase($pageId, $res);

        $processUrlHooks = $GLOBALS['TYPO3_CONF_VARS']['EXTCONF']['crawler']['processUrls'] ?? [];
        foreach ($processUrlHooks as $hookFunction) {
            $hookParameters = [
                'res' => &$res,
            ];
            GeneralUtility::callUserFunction($hookFunction, $hookParameters, $this);
        }

        return $res;
    }
588
589 1
    /**
     * Find all configurations of subpages of a page
     * TODO: Write Functional Tests
     *
     * The result contains the paramSets keys found in the page TSconfig of $rootid
     * (a trailing dot is stripped) followed by the names of the configuration records
     * the repository returns for the rootline pages and the page tree below $rootid.
     *
     * @param int $rootid Page id the branch starts at
     * @param int $depth Depth passed to the page tree
     * @return array Configuration names
     */
    public function getConfigurationsForBranch(int $rootid, int $depth): array
    {
        $names = [];

        $paramSets = $this->getPageTSconfigForId($rootid)['tx_crawler.']['crawlerCfg.']['paramSets.'] ?? [];
        foreach ($paramSets as $setKey => $setConfiguration) {
            if (is_array($setConfiguration)) {
                $names[] = substr($setKey, -1) === '.' ? substr($setKey, 0, -1) : $setKey;
            }
        }

        // Page ids of the rootline ...
        $pageIds = [];
        foreach (BackendUtility::BEgetRootLine($rootid) as $rootLinePage) {
            $pageIds[] = $rootLinePage['uid'];
        }

        // ... plus the page ids of the tree below the root page.
        /* @var PageTreeView $pageTree */
        $pageTree = GeneralUtility::makeInstance(PageTreeView::class);
        $permissionClause = $this->getBackendUser()->getPagePermsClause(Permission::PAGE_SHOW);
        $treeWhereClause = '';
        if (! empty($permissionClause)) {
            $treeWhereClause = 'AND ' . $permissionClause;
        }
        $pageTree->init($treeWhereClause);
        $pageTree->getTree($rootid, $depth, '');
        foreach ($pageTree->tree as $treeNode) {
            $pageIds[] = $treeNode['row']['uid'];
        }

        $records = $this->configurationRepository->getCrawlerConfigurationRecordsFromRootLine($rootid, $pageIds);
        foreach ($records as $record) {
            $names[] = $record['name'];
        }

        return $names;
    }
625
626
    /**
     * Check if a user has access to an item
     * (e.g. get the group list of the current logged in user from $GLOBALS['TSFE']->gr_list)
     *
     * @param string $groupList Comma-separated list of (fe_)group UIDs from a user
     * @param string $accessList Comma-separated list of (fe_)group UIDs of the item to access
     * @return bool TRUE if at least one of the users group UIDs is in the access list or the access list is empty
     * @see \TYPO3\CMS\Frontend\Page\PageRepository::getMultipleGroupsWhereClause()
     * @deprecated
     * @codeCoverageIgnore
     */
    public function hasGroupAccess($groupList, $accessList)
    {
        // An empty access list does not restrict anything.
        if (empty($accessList)) {
            return true;
        }

        $userGroupUids = GeneralUtility::intExplode(',', $groupList);
        foreach ($userGroupUids as $userGroupUid) {
            if (GeneralUtility::inList($accessList, $userGroupUid)) {
                return true;
            }
        }

        return false;
    }
649 1
650
    /**
     * Expands the parameter configuration into the individual values each GET variable can take.
     *
     * Syntax of values:
     * - If the value is wrapped in [...] it is expanded according to the syntax below, otherwise the value is taken literally
     * - The configuration is split by "|"; the parts are processed individually and their values are added together
     * - For each configuration part:
     *         - "[int]-[int]" = Integer range, expanded to all values in between, bounds included (max. 1000 values). Example "1-34" or "-40--30"
     *         - "_TABLE:[TCA table name];[_PID:[optional page id, default is current page]];[_ENABLELANG:1]" = Look up of table records from PID, filtering out deleted records. Example "_TABLE:tt_content; _PID:123"
     *        _ENABLELANG:1 picks only original records without their language overlays
     *        Further optional sub parameters read by the lookup: _RECURSIVE (page tree depth), _PIDFIELD (default "pid"),
     *        _FIELD (default "uid"), _WHERE (additional where clause) and _ADDTABLE (additional "from" part)
     *         - Default: Literal value
     *
     * After every part the "expandParameters" hooks registered in
     * $GLOBALS['TYPO3_CONF_VARS']['SC_OPTIONS']['crawler/class.tx_crawler_lib.php'] are called; they receive
     * $paramArray by reference.
     *
     * @param array $paramArray Array with key (GET var name) and values (value of GET var which is configuration for expansion)
     * @param integer $pid Current page ID
     * @return array The input array where every value has been replaced by the list of its expanded values
     * @deprecated
     * @codeCoverageIgnore
     */
    public function expandParameters($paramArray, $pid)
    {
        // Traverse parameter names:
        foreach ($paramArray as $p => $v) {
            $v = trim($v);

            // A value that is not encapsulated in square brackets is literal: it becomes the only value.
            if (strpos($v, '[') !== 0 || substr($v, -1) !== ']') {
                $paramArray[$p] = [$v];
                continue;
            }

            // Strip the brackets and reset the paramArray value to an array that collects the expanded values.
            $v = substr($v, 1, -1);
            $paramArray[$p] = [];

            // Explode parts and traverse them:
            foreach (explode('|', $v) as $pV) {
                // Look for integer range: (fx. 1-34 or -40--30 // reads minus 40 to minus 30)
                if (preg_match('/^(-?[0-9]+)\s*-\s*(-?[0-9]+)$/', trim($pV), $reg)) {
                    $reg = $this->swapIfFirstIsLargerThanSecond($reg);

                    // Traverse range and add values; the brake limits a single range to 1000 values.
                    $runAwayBrake = 1000;
                    for ($a = $reg[1]; $a <= $reg[2]; $a++) {
                        $paramArray[$p][] = $a;
                        $runAwayBrake--;
                        if ($runAwayBrake <= 0) {
                            break;
                        }
                    }
                } elseif (strpos(trim($pV), '_TABLE:') === 0) {
                    // Parse the ";"-separated "_KEY:value" sub parameters:
                    $subpartParams = [];
                    foreach (GeneralUtility::trimExplode(';', $pV) as $spV) {
                        $keyAndValue = GeneralUtility::trimExplode(':', $spV);
                        // A sub part without ":" has no value: store null instead of reading a missing offset.
                        $subpartParams[$keyAndValue[0]] = $keyAndValue[1] ?? null;
                    }
                    $table = $subpartParams['_TABLE'];

                    // Table exists:
                    if (isset($GLOBALS['TCA'][$table])) {
                        $lookUpPid = isset($subpartParams['_PID']) ? intval($subpartParams['_PID']) : intval($pid);
                        $recursiveDepth = isset($subpartParams['_RECURSIVE']) ? intval($subpartParams['_RECURSIVE']) : 0;
                        $pidField = isset($subpartParams['_PIDFIELD']) ? trim($subpartParams['_PIDFIELD']) : 'pid';
                        // NOTE(review): _WHERE is handed to the query builder as raw SQL - confirm that the
                        // crawler configuration it originates from is a trusted source.
                        $where = $subpartParams['_WHERE'] ?? '';
                        $addTable = $subpartParams['_ADDTABLE'] ?? '';

                        // _FIELD is optional; a missing or empty value falls back to the uid column.
                        $fieldName = ! empty($subpartParams['_FIELD']) ? $subpartParams['_FIELD'] : 'uid';
                        if ($fieldName === 'uid' || ! empty($GLOBALS['TCA'][$table]['columns'][$fieldName])) {
                            $queryBuilder = $this->getQueryBuilder($table);

                            if ($recursiveDepth > 0) {
                                /** @var QueryGenerator $queryGenerator */
                                $queryGenerator = GeneralUtility::makeInstance(QueryGenerator::class);
                                $pidList = $queryGenerator->getTreeList($lookUpPid, $recursiveDepth, 0, 1);
                                $pidArray = GeneralUtility::intExplode(',', $pidList);
                            } else {
                                $pidArray = [(string) $lookUpPid];
                            }

                            // Only the "deleted" restriction is applied to the lookup.
                            $queryBuilder->getRestrictions()
                                ->removeAll()
                                ->add(GeneralUtility::makeInstance(DeletedRestriction::class));

                            $queryBuilder
                                ->select($fieldName)
                                ->from($table)
                                ->where(
                                    $queryBuilder->expr()->in($pidField, $queryBuilder->createNamedParameter($pidArray, Connection::PARAM_INT_ARRAY)),
                                    $where
                                );

                            if (! empty($addTable)) {
                                // TODO: Check if this works as intended!
                                $queryBuilder->add('from', $addTable);
                            }

                            // Not every TCA table declares a transOrigPointerField, and _ENABLELANG is optional.
                            $transOrigPointerField = $GLOBALS['TCA'][$table]['ctrl']['transOrigPointerField'] ?? null;
                            if (! empty($subpartParams['_ENABLELANG']) && ! empty($transOrigPointerField)) {
                                $queryBuilder->andWhere(
                                    $queryBuilder->expr()->lte(
                                        $transOrigPointerField,
                                        0
                                    )
                                );
                            }

                            $statement = $queryBuilder->execute();

                            // Index the rows by the selected field so every value is collected only once.
                            $rows = [];
                            while ($row = $statement->fetch()) {
                                $rows[$row[$fieldName]] = $row;
                            }

                            $paramArray[$p] = array_merge($paramArray[$p], array_keys($rows));
                        }
                    }
                } else {
                    // Just add value:
                    $paramArray[$p][] = $pV;
                }

                // Hook for processing own expandParameters place holder
                $expandParametersHooks = $GLOBALS['TYPO3_CONF_VARS']['SC_OPTIONS']['crawler/class.tx_crawler_lib.php']['expandParameters'] ?? null;
                if (is_array($expandParametersHooks)) {
                    $_params = [
                        'pObj' => &$this,
                        'paramArray' => &$paramArray,
                        'currentKey' => $p,
                        'currentValue' => $pV,
                        'pid' => $pid,
                    ];
                    foreach ($expandParametersHooks as $_funcRef) {
                        GeneralUtility::callUserFunction($_funcRef, $_params, $this);
                    }
                }
            }

            // Make unique set of values and sort array by key:
            $paramArray[$p] = array_unique($paramArray[$p]);
            ksort($paramArray);
        }

        return $paramArray;
    }
798
799
    /**
     * Compiles URLs from a parameter array (the output of expandParameters()).
     * The number of URLs will be the multiplication of the number of parameter values for each key.
     *
     * The work is delegated to the URL service, which additionally receives the value of
     * getMaximumUrlsToCompile() as its third argument.
     *
     * @param array $paramArray Output of expandParameters(): Array with keys (GET var names) and for each an array of values
     * @param array $urls URLs accumulated in this array (for recursion)
     * @return array The result of the URL service
     * @deprecated
     * @codeCoverageIgnore
     */
    public function compileUrls(array $paramArray, array $urls): array
    {
        $maximumUrlsToCompile = $this->getMaximumUrlsToCompile();

        return $this->urlService->compileUrls($paramArray, $urls, $maximumUrlsToCompile);
    }
812
813
    /************************************
814 2
     *
815
     * Crawler log
816
     *
817 9
     ************************************/
818
819
    /**
     * Returns the records of the crawler queue for the given page ID, ordered by "scheduled" descending.
     *
     * @param integer $id Page ID for which to look up log entries.
     * @param QueueFilter $queueFilter Limits the result to "pending" (exec_time = 0) or "finished" (exec_time > 0) entries, any other value applies no limitation
     * @param boolean $doFlush If TRUE, the queue is flushed through the queue repository (using $queueFilter) before the entries are selected
     * @param boolean $doFullFlush Not used by this method, only kept so the signature stays compatible
     * @param integer $itemsPerPage Limit the amount of entries per page default is 10, values of 0 or less disable the limit
     * @return array
     *
     * @deprecated
     */
    public function getLogEntriesForPageId($id, QueueFilter $queueFilter, $doFlush = false, $doFullFlush = false, $itemsPerPage = 10)
    {
        $queryBuilder = GeneralUtility::makeInstance(ConnectionPool::class)->getQueryBuilderForTable($this->tableName);
        $queryBuilder
            ->select('*')
            ->from($this->tableName)
            ->where(
                $queryBuilder->expr()->eq('page_id', $queryBuilder->createNamedParameter($id, PDO::PARAM_INT))
            )
            ->orderBy('scheduled', 'DESC');

        // The filter object is compared loosely with the plain filter names.
        // NOTE(review): this presumably relies on QueueFilter being castable to string - confirm.
        switch ($queueFilter) {
            case 'pending':
                $queryBuilder->andWhere($queryBuilder->expr()->eq('exec_time', 0));
                break;
            case 'finished':
                $queryBuilder->andWhere($queryBuilder->expr()->gt('exec_time', 0));
                break;
        }

        // The flush runs before the select below is executed.
        if ($doFlush) {
            $this->queueRepository->flushQueue($queueFilter);
        }
        if ($itemsPerPage > 0) {
            $queryBuilder
                ->setMaxResults((int) $itemsPerPage);
        }

        return $queryBuilder->execute()->fetchAll();
    }
867 3
868
    /**
     * Returns the records of the crawler queue for the given set ID, ordered by "scheduled" descending.
     *
     * @param int $set_id Set ID for which to look up log entries.
     * @param string $filter Filter: "pending" => all that is not yet run, "finished" => all complete ones, anything else => all entries
     * @param bool $doFlush If TRUE, the entries are flushed through flushQueue() instead of selected and an empty array is returned
     * @param bool $doFullFlush If TRUE while flushing, flushQueue() receives an empty condition instead of the set/filter condition
     * @param int $itemsPerPage Limit the amount of entries per page default is 10, values of 0 or less disable the limit
     * @return array
     *
     * @deprecated
     */
    public function getLogEntriesForSetId(int $set_id, string $filter = '', bool $doFlush = false, bool $doFullFlush = false, int $itemsPerPage = 10)
    {
        $selectQuery = GeneralUtility::makeInstance(ConnectionPool::class)->getQueryBuilderForTable($this->tableName);
        $selectQuery->select('*');
        $selectQuery->from($this->tableName);
        $selectQuery->where(
            $selectQuery->expr()->eq('set_id', $selectQuery->createNamedParameter($set_id, PDO::PARAM_INT))
        );
        $selectQuery->orderBy('scheduled', 'DESC');

        // A second, AND-combined condition is maintained in parallel; it is only consumed when flushing.
        $expressionBuilder = GeneralUtility::makeInstance(ConnectionPool::class)
            ->getConnectionForTable($this->tableName)
            ->getExpressionBuilder();
        $flushConstraints = $expressionBuilder->andX();

        if ($filter === 'pending') {
            $selectQuery->andWhere($selectQuery->expr()->eq('exec_time', 0));
            $flushConstraints->add($expressionBuilder->eq('exec_time', 0));
        } elseif ($filter === 'finished') {
            $selectQuery->andWhere($selectQuery->expr()->gt('exec_time', 0));
            $flushConstraints->add($expressionBuilder->gt('exec_time', 0));
        }

        if ($doFlush) {
            // Flushing replaces the lookup: nothing is selected afterwards.
            $flushCondition = $flushConstraints->add($expressionBuilder->eq('set_id', (int) $set_id));
            $this->flushQueue($doFullFlush ? '' : $flushCondition);
            return [];
        }

        if ($itemsPerPage > 0) {
            $selectQuery->setMaxResults((int) $itemsPerPage);
        }

        return $selectQuery->execute()->fetchAll();
    }
920
921
    /**
     * Inserts a call back entry into the crawler queue (called from hooks typically, see indexed search class "class.crawler.php").
     *
     * The call back reference is stored inside the JSON encoded parameters under the key "_CALLBACKOBJ".
     *
     * @param integer $setId Set ID
     * @param array $params Parameters to pass to call back function, anything that is not an array is replaced by an empty array
     * @param string $callBack Call back object reference, eg. 'EXT:indexed_search/class.crawler.php:&tx_indexedsearch_crawler'
     * @param integer $page_id Page ID to attach it to
     * @param integer $schedule Time at which to activate, a value of 0 schedules the entry for the current time
     */
    public function addQueueEntry_callBack($setId, $params, $callBack, $page_id = 0, $schedule = 0): void
    {
        $parameters = is_array($params) ? $params : [];
        $parameters['_CALLBACKOBJ'] = $callBack;

        // Without an explicit schedule time the entry is due right away.
        $scheduledAt = (int) $schedule;
        if (! $scheduledAt) {
            $scheduledAt = $this->getCurrentTime();
        }

        $queueEntry = [
            'page_id' => (int) $page_id,
            'parameters' => json_encode($parameters),
            'scheduled' => $scheduledAt,
            'exec_time' => 0,
            'set_id' => (int) $setId,
            'result_data' => '',
        ];

        $connection = GeneralUtility::makeInstance(ConnectionPool::class)->getConnectionForTable(QueueRepository::TABLE_NAME);
        $connection->insert(QueueRepository::TABLE_NAME, $queueEntry);
    }
950 1
951 1
    /************************************
952 1
     *
953 5
     * URL setting
954 1
     *
955 1
     ************************************/
956 1
957
    /**
     * Setting a URL for crawling:
     *
     * Builds the queue record for the URL and either registers it internally only
     * (when registerQueueEntriesInternallyOnly is set) or inserts it into the queue table,
     * unless the duplication check already finds an equal entry. A signal is emitted for
     * both the "added" and the "duplicate" case.
     *
     * @param integer $id Page ID
     * @param string $url Complete URL
     * @param array $subCfg Sub configuration array (from TS config); the keys "userGroups", "procInstrFilter", "procInstrParams." and "key" are read and may be absent
     * @param integer $tstamp Scheduled-time
     * @param string $configurationHash (optional) configuration hash
     * @param bool $skipInnerDuplicationCheck (optional) skip inner duplication check
     * @return bool TRUE only if a new record was inserted into the queue table; FALSE for duplicates and for internally registered entries
     */
    public function addUrl(
        $id,
        $url,
        array $subCfg,
        $tstamp,
        $configurationHash = '',
        $skipInnerDuplicationCheck = false
    ) {
        // Creating parameters:
        $parameters = [
            'url' => $url,
        ];

        // fe user group simulation (the sub configuration key is optional):
        $uGs = implode(',', array_unique(GeneralUtility::intExplode(',', $subCfg['userGroups'] ?? '', true)));
        if ($uGs) {
            $parameters['feUserGroupList'] = $uGs;
        }

        // Setting processing instructions
        $parameters['procInstructions'] = GeneralUtility::trimExplode(',', $subCfg['procInstrFilter'] ?? '');
        if (is_array($subCfg['procInstrParams.'] ?? null)) {
            $parameters['procInstrParams'] = $subCfg['procInstrParams.'];
        }

        // Compile value array:
        $parameters_serialized = json_encode($parameters);
        $fieldArray = [
            'page_id' => (int) $id,
            'parameters' => $parameters_serialized,
            'parameters_hash' => GeneralUtility::shortMD5($parameters_serialized),
            'configuration_hash' => $configurationHash,
            'scheduled' => $tstamp,
            'exec_time' => 0,
            'set_id' => (int) $this->setID,
            'result_data' => '',
            'configuration' => $subCfg['key'] ?? null,
        ];

        if ($this->registerQueueEntriesInternallyOnly) {
            //the entries will only be registered and not stored to the database
            $this->queueEntries[] = $fieldArray;

            return false;
        }

        $rows = [];
        if (! $skipInnerDuplicationCheck) {
            // check if there is already an equal entry
            $rows = $this->queueRepository->getDuplicateQueueItemsIfExists(
                (bool) ($this->extensionSettings['enableTimeslot'] ?? false),
                $tstamp,
                $this->getCurrentTime(),
                $fieldArray['page_id'],
                $fieldArray['parameters_hash']
            );
        }

        if (! empty($rows)) {
            // An equal entry exists: only report the duplicate, nothing is inserted.
            $signalPayload = ['rows' => $rows, 'fieldArray' => $fieldArray];
            SignalSlotUtility::emitSignal(
                self::class,
                SignalSlotUtility::SIGNAL_DUPLICATE_URL_IN_QUEUE,
                $signalPayload
            );

            return false;
        }

        $connectionForCrawlerQueue = GeneralUtility::makeInstance(ConnectionPool::class)->getConnectionForTable(QueueRepository::TABLE_NAME);
        $connectionForCrawlerQueue->insert(
            QueueRepository::TABLE_NAME,
            $fieldArray
        );
        $uid = $connectionForCrawlerQueue->lastInsertId(QueueRepository::TABLE_NAME, 'qid');

        $signalPayload = ['uid' => $uid, 'fieldArray' => $fieldArray];
        SignalSlotUtility::emitSignal(
            self::class,
            SignalSlotUtility::SIGNAL_URL_ADDED_TO_QUEUE,
            $signalPayload
        );

        return true;
    }
1053 6
1054 6
    /**
     * Provides the current system time as the Unix timestamp reported by time().
     *
     * @return int
     */
    public function getCurrentTime()
    {
        $now = time();

        return $now;
    }
1063
1064 5
    /************************************
1065
     *
1066 4
     * URL reading
1067
     *
1068
     ************************************/
1069 5
1070 4
    /**
     * Read URL for single queue entry
     *
     * Loads the queue record, marks it as being processed by writing exec_time (and the
     * process ID, if one is set), executes it through the queue executor and finally stores
     * the JSON encoded result in result_data. Signals are emitted before the execution and
     * before the result is written.
     *
     * @param integer $queueId
     * @param boolean $force If set, will process even if exec_time has been set!
     *
     * @return int|null NULL if no matching queue record was found, otherwise 0 or CLI_STATUS_POLLABLE_PROCESSED
     */
    public function readUrl($queueId, $force = false)
    {
        $queryBuilder = GeneralUtility::makeInstance(ConnectionPool::class)->getQueryBuilderForTable(QueueRepository::TABLE_NAME);
        $ret = 0;
        // The logger may not have been injected, debug output must not break the processing then.
        if ($this->logger !== null) {
            $this->logger->debug('crawler-readurl start ' . microtime(true));
        }

        $queryBuilder
            ->select('*')
            ->from(QueueRepository::TABLE_NAME)
            ->where(
                $queryBuilder->expr()->eq('qid', $queryBuilder->createNamedParameter($queueId, PDO::PARAM_INT))
            );
        if (! $force) {
            // Without $force only entries that are not executed yet but assigned to a process are read.
            $queryBuilder
                ->andWhere('exec_time = 0')
                ->andWhere('process_scheduled > 0');
        }
        $queueRec = $queryBuilder->execute()->fetch();

        if (! is_array($queueRec)) {
            return null;
        }

        SignalSlotUtility::emitSignal(
            self::class,
            SignalSlotUtility::SIGNAL_QUEUEITEM_PREPROCESS,
            [$queueId, &$queueRec]
        );

        // Set exec_time to lock record:
        $field_array = ['exec_time' => $this->getCurrentTime()];

        if (isset($this->processID)) {
            //if multiprocessing is used we need to store the id of the process which has handled this entry
            $field_array['process_id_completed'] = $this->processID;
        }

        GeneralUtility::makeInstance(ConnectionPool::class)->getConnectionForTable(QueueRepository::TABLE_NAME)
            ->update(
                QueueRepository::TABLE_NAME,
                $field_array,
                ['qid' => (int) $queueId]
            );

        $result = $this->queueExecutor->executeQueueItem($queueRec, $this);

        // Only an array result with content can carry poll information.
        // NOTE(review): the executor result is documented elsewhere in this class as array|bool|mixed|string,
        // hence the array check - confirm against QueueExecutor.
        $content = is_array($result) ? ($result['content'] ?? null) : null;
        if ($content !== null) {
            /** @var JsonCompatibilityConverter $jsonCompatibilityConverter */
            $jsonCompatibilityConverter = GeneralUtility::makeInstance(JsonCompatibilityConverter::class);
            $resultData = $jsonCompatibilityConverter->convert($content);

            //atm there's no need to point to specific pollable extensions
            $pollables = $GLOBALS['TYPO3_CONF_VARS']['EXTCONF']['crawler']['pollSuccess'] ?? null;
            if (is_array($resultData) && is_array($pollables)) {
                $procInstructions = $resultData['parameters']['procInstructions'] ?? null;
                foreach ($pollables as $pollable) {
                    // only check the success value if the instruction is running
                    // it is important to name the pollSuccess key same as the procInstructions key
                    if (is_array($procInstructions)
                        && in_array($pollable, $procInstructions, true)
                        && ! empty($resultData['success'][$pollable])
                    ) {
                        $ret |= self::CLI_STATUS_POLLABLE_PROCESSED;
                    }
                }
            }
        }

        // Set result in log which also denotes the end of the processing of this entry.
        $field_array = ['result_data' => json_encode($result)];

        SignalSlotUtility::emitSignal(
            self::class,
            SignalSlotUtility::SIGNAL_QUEUEITEM_POSTPROCESS,
            [$queueId, &$field_array]
        );

        GeneralUtility::makeInstance(ConnectionPool::class)->getConnectionForTable(QueueRepository::TABLE_NAME)
            ->update(
                QueueRepository::TABLE_NAME,
                $field_array,
                ['qid' => (int) $queueId]
            );

        if ($this->logger !== null) {
            $this->logger->debug('crawler-readurl stop ' . microtime(true));
        }

        return $ret;
    }
1167
1168
    /**
     * Read URL for not-yet-inserted log-entry
     *
     * The entry is inserted into the queue table with exec_time already set, executed through
     * the queue executor and afterwards updated with the JSON encoded result.
     *
     * @param array $field_array Queue field array,
     *
     * @return array|bool|mixed|string The result of the queue executor
     */
    public function readUrlFromArray($field_array)
    {
        $queueTable = QueueRepository::TABLE_NAME;

        // Set exec_time to lock record:
        $field_array['exec_time'] = $this->getCurrentTime();
        $queueConnection = GeneralUtility::makeInstance(ConnectionPool::class)->getConnectionForTable($queueTable);
        $queueConnection->insert($queueTable, $field_array);

        // The executor receives the record including the qid it just got assigned.
        $queueId = $queueConnection->lastInsertId($queueTable, 'qid');
        $field_array['qid'] = $queueId;
        $result = $this->queueExecutor->executeQueueItem($field_array, $this);

        // Set result in log which also denotes the end of the processing of this entry.
        $resultFields = ['result_data' => json_encode($result)];

        SignalSlotUtility::emitSignal(
            self::class,
            SignalSlotUtility::SIGNAL_QUEUEITEM_POSTPROCESS,
            [$queueId, &$resultFields]
        );

        $queueConnection->update($queueTable, $resultFields, ['qid' => $queueId]);

        return $result;
    }
1204
1205
    /*****************************
1206
     *
1207
     * Compiling URLs to crawl - tools
1208
     *
1209
     *****************************/
1210
1211
    /**
     * Collects the pages below a root page and renders the URL rows for each of them.
     *
     * The arguments are stored on the controller first and the duplicate/download
     * trackers are reset. Mount point pages additionally get the rows of their mounted
     * tree rendered, with $this->MP set while those rows are compiled.
     *
     * @param int $id Root page id to start from.
     * @param int $depth Depth of tree, 0=only id-page, 1= on sublevel, 99 = infinite
     * @param int $scheduledTime Unix Time when the URL is timed to be visited when put in queue
     * @param int $reqMinute Number of requests per minute (creates the interleave between requests)
     * @param bool $submitCrawlUrls If set, submits the URLs to queue in database (real crawling)
     * @param bool $downloadCrawlUrls If set (and submitcrawlUrls is false) will fill $downloadUrls with entries)
     * @param array $incomingProcInstructions Array of processing instructions
     * @param array $configurationSelection Array of configuration keys
     * @return string Concatenated HTML rows of all visited pages
     */
    public function getPageTreeAndUrls(
        $id,
        $depth,
        $scheduledTime,
        $reqMinute,
        $submitCrawlUrls,
        $downloadCrawlUrls,
        array $incomingProcInstructions,
        array $configurationSelection
    ) {
        // Remember the request settings on the controller for the row rendering.
        $this->scheduledTime = $scheduledTime;
        $this->reqMinute = $reqMinute;
        $this->submitCrawlUrls = $submitCrawlUrls;
        $this->downloadCrawlUrls = $downloadCrawlUrls;
        $this->incomingProcInstructions = $incomingProcInstructions;
        $this->incomingConfigurationSelection = $configurationSelection;

        $this->duplicateTrack = [];
        $this->downloadUrls = [];

        /* @var PageTreeView $pageTree */
        $pageTree = GeneralUtility::makeInstance(PageTreeView::class);
        $permsClause = $this->getBackendUser()->getPagePermsClause(Permission::PAGE_SHOW);
        $pageTree->init('AND ' . $permsClause);

        // The root page itself is only added when the backend user may access it.
        $rootPage = BackendUtility::readPageAccess($id, $permsClause);
        if (is_array($rootPage)) {
            $pageTree->tree[] = [
                'row' => $rootPage,
                'HTML' => $this->iconFactory->getIconForRecord('pages', $rootPage, Icon::SIZE_SMALL),
            ];
        }

        // Branch beneath the root page
        if ($depth) {
            $pageTree->getTree($id, $depth, '');
        }

        $renderedRows = [];

        foreach ($pageTree->tree as $treeItem) {
            $this->MP = false;

            if ($treeItem['row']['doktype'] === PageRepository::DOKTYPE_MOUNTPOINT) {
                $mountPage = $this->pageRepository->getPage($treeItem['row']['uid']);
                $mountPid = $mountPage[0]['mount_pid'];

                // MP parameter used while the mounted pages are rendered
                $this->MP = $mountPid . '-' . $treeItem['row']['uid'];

                $mountedTree = GeneralUtility::makeInstance(PageTreeView::class);
                $mountedTree->init('AND ' . $permsClause);
                $mountedTree->getTree($mountPid, $depth);

                foreach ($mountedTree->tree as $mountedItem) {
                    $renderedRows[] = $this->drawURLs_addRowsForPage(
                        $mountedItem['row'],
                        $mountedItem['HTML'] . BackendUtility::getRecordTitle('pages', $mountedItem['row'], true)
                    );
                }

                if ($mountPage[0]['mount_pid_ol']) {
                    // With mount_pid_ol enabled the mounted page replaces the mount point page.
                    $treeItem['row']['uid'] = $mountPage[0]['mount_pid'];
                } else {
                    // Without mount_pid_ol the MP must not be used for the mount point page itself.
                    $this->MP = false;
                }
            }

            $renderedRows[] = $this->drawURLs_addRowsForPage(
                $treeItem['row'],
                $treeItem['HTML'] . BackendUtility::getRecordTitle('pages', $treeItem['row'], true)
            );
        }

        return implode('', $renderedRows);
    }
1303
1304
    /**
     * Expands an exclude string by delegating to the configuration service.
     *
     * @param string $excludeString Exclude string
     * @return array Result of ConfigurationService::expandExcludeString()
     * @deprecated
     */
    public function expandExcludeString($excludeString)
    {
        $expanded = $this->configurationService->expandExcludeString($excludeString);

        return $expanded;
    }
1315
1316
    /**
     * Create the rows for display of the page tree
     * For each page a number of rows are shown displaying GET variable configuration
     *
     * Every configuration found for the page yields one table row; the page title cell
     * is only rendered in the first row and spans all of them. A configuration that
     * excludes the page, and a page without any configuration, yield a "No entries" row.
     *
     * @param array $pageRow Page record the rows are compiled for
     * @param string $pageTitle Ready-to-output HTML for the title cell (inserted unescaped)
     * @return string HTML table rows
     */
    public function drawURLs_addRowsForPage(array $pageRow, string $pageTitle): string
    {
        $skipMessage = '';

        // Get list of configurations
        $configurations = $this->getUrlsForPageRow($pageRow, $skipMessage);
        $configurations = ConfigurationService::removeDisallowedConfigurations($this->incomingConfigurationSelection, $configurations);

        // Traverse parameter combinations:
        $isFirstRow = true;
        $content = '';
        if (! empty($configurations)) {
            foreach ($configurations as $confKey => $confArray) {
                // Numeric configuration keys arrive as int; htmlspecialchars() needs a string under strict_types.
                $confKeyHtml = htmlspecialchars((string) $confKey);

                // Sub configuration entries are optional, so they are read defensively below.
                $subCfg = $confArray['subCfg'] ?? [];

                // Title column: only the first row carries the (row spanning) title cell.
                $titleClm = $isFirstRow
                    ? '<td rowspan="' . count($configurations) . '">' . $pageTitle . '</td>'
                    : '';

                $excludedPageIds = $this->configurationService->expandExcludeString((string) ($subCfg['exclude'] ?? ''));

                if (! in_array($pageRow['uid'], $excludedPageIds, true)) {

                    // URL list:
                    $urlList = $this->urlListFromUrlArray(
                        $confArray,
                        $pageRow,
                        $this->scheduledTime,
                        $this->reqMinute,
                        $this->submitCrawlUrls,
                        $this->downloadCrawlUrls,
                        $this->duplicateTrack,
                        $this->downloadUrls,
                        // if empty the urls won't be filtered by processing instructions
                        $this->incomingProcInstructions
                    );

                    // Expanded parameters:
                    $paramExpanded = '';
                    $calcAccu = [];
                    $calcRes = 1;
                    foreach ($confArray['paramExpanded'] as $gVar => $gVal) {
                        $valueCount = count($gVal);
                        $paramExpanded .= '
                            <tr>
                                <td>' . htmlspecialchars('&' . $gVar . '=') . '<br/>' .
                            '(' . $valueCount . ')' .
                            '</td>
                                <td nowrap="nowrap">' . nl2br(htmlspecialchars(implode(chr(10), $gVal))) . '</td>
                            </tr>
                        ';
                        $calcRes *= $valueCount;
                        $calcAccu[] = $valueCount;
                    }
                    $paramExpanded = '<table>' . $paramExpanded . '</table>';
                    $paramExpanded .= 'Comb: ' . implode('*', $calcAccu) . '=' . $calcRes;

                    // Options: configuration values are escaped before they are written into the HTML.
                    $optionValues = '';
                    if (! empty($subCfg['userGroups'])) {
                        $optionValues .= 'User Groups: ' . htmlspecialchars((string) $subCfg['userGroups']) . '<br/>';
                    }
                    if (! empty($subCfg['procInstrFilter'])) {
                        $optionValues .= 'ProcInstr: ' . htmlspecialchars((string) $subCfg['procInstrFilter']) . '<br/>';
                    }

                    // Compile row:
                    $content .= '
                        <tr>
                            ' . $titleClm . '
                            <td>' . $confKeyHtml . '</td>
                            <td>' . nl2br(htmlspecialchars(rawurldecode(trim(str_replace('&', chr(10) . '&', GeneralUtility::implodeArrayForUrl('', $confArray['paramParsed'])))))) . '</td>
                            <td>' . $paramExpanded . '</td>
                            <td nowrap="nowrap">' . $urlList . '</td>
                            <td nowrap="nowrap">' . $optionValues . '</td>
                            <td nowrap="nowrap">' . DebugUtility::viewArray($subCfg['procInstrParams.'] ?? null) . '</td>
                        </tr>';
                } else {
                    $content .= '<tr>
                            ' . $titleClm . '
                            <td>' . $confKeyHtml . '</td>
                            <td colspan="5"><em>No entries</em> (Page is excluded in this configuration)</td>
                        </tr>';
                }

                $isFirstRow = false;
            }
        } else {
            $message = ! empty($skipMessage) ? ' (' . $skipMessage . ')' : '';

            // Compile row:
            $content .= '
                <tr>
                    <td>' . $pageTitle . '</td>
                    <td colspan="6"><em>No entries</em>' . $message . '</td>
                </tr>';
        }

        return $content;
    }
1419
1420
    /*****************************
1421
     *
1422
     * CLI functions
1423
     *
1424
     *****************************/
1425
1426
    /**
     * Running the functionality of the CLI (crawling URLs from queue)
     *
     * Runs the (deprecated) cli_hooks, cleans up the queue, assigns up to $countInARun
     * queue entries to the current process id and reads them one after another.
     *
     * @param int $countInARun Number of queue entries to fetch for this run
     * @param int $sleepTime Value passed to usleep() after each processed entry
     * @param int $sleepAfterFinish Value passed to sleep() once the entries are done
     * @return int Bitmask built from the CLI_STATUS_* constants
     */
    public function CLI_run(int $countInARun, int $sleepTime, int $sleepAfterFinish): int
    {
        $status = 0;
        $processedEntries = 0;

        // First, run hooks:
        foreach ($GLOBALS['TYPO3_CONF_VARS']['EXTCONF']['crawler']['cli_hooks'] ?? [] as $hookClassName) {
            trigger_error(
                'This hook (crawler/cli_hooks) is deprecated since 9.1.5 and will be removed when dropping support for TYPO3 9LTS and 10LTS',
                E_USER_DEPRECATED
            );
            $hook = GeneralUtility::makeInstance($hookClassName);
            if (is_object($hook)) {
                $hook->crawler_init($this);
            }
        }

        // Clean up the queue
        $this->queueRepository->cleanupQueue();

        // Select entries:
        $queueRows = $this->queueRepository->fetchRecordsToBeCrawled($countInARun);

        if (empty($queueRows)) {
            // Nothing was fetched, so nothing was processed either.
            return $status;
        }

        $queueIds = [];
        foreach ($queueRows as $queueRow) {
            $queueIds[] = $queueRow['qid'];
        }

        $processId = $this->CLI_buildProcessId();

        //save the number of assigned queue entries to determine how many have been processed later
        $assignedCount = $this->queueRepository->updateProcessIdAndSchedulerForQueueIds($queueIds, $processId);
        $this->processRepository->updateProcessAssignItemsCount($assignedCount, $processId);

        // Abort when not every fetched entry could be assigned to this process.
        if ($assignedCount !== count($queueIds)) {
            return $status | self::CLI_STATUS_ABORTED;
        }

        foreach ($queueRows as $queueRow) {
            $status |= $this->readUrl($queueRow['qid']);

            $processedEntries++;
            // Just to relax the system
            usleep($sleepTime);

            // if during the start and the current read url the cli has been disable we need to return from the function
            // mark the process NOT as ended.
            if ($this->crawler->isDisabled()) {
                return $status | self::CLI_STATUS_ABORTED;
            }

            if (! $this->processRepository->isProcessActive($this->CLI_buildProcessId())) {
                $this->CLI_debug('conflict / timeout (' . $this->CLI_buildProcessId() . ')');
                $status |= self::CLI_STATUS_ABORTED;
                //possible timeout
                break;
            }
        }

        sleep($sleepAfterFinish);

        if ($processedEntries > 0) {
            $status |= self::CLI_STATUS_PROCESSED;
        }

        return $status;
    }
1499
1500
    /**
     * Instantiates every class registered under crawler/cli_hooks and calls its
     * crawler_init() method with this controller.
     *
     * @deprecated
     */
    public function CLI_runHooks(): void
    {
        $hookClassNames = $GLOBALS['TYPO3_CONF_VARS']['EXTCONF']['crawler']['cli_hooks'] ?? [];

        foreach ($hookClassNames as $hookClassName) {
            $hook = GeneralUtility::makeInstance($hookClassName);
            if (! is_object($hook)) {
                continue;
            }
            $hook->crawler_init($this);
        }
    }
1513
1514
    /**
     * Try to acquire a new process with the given id
     * also performs some auto-cleanup for orphan processes
     *
     * Active processes whose TTL lies before the current time count as orphans; they are
     * marked as not active and their queue entries are released, regardless of whether
     * the new process could be registered.
     *
     * @param string $id identification string for the process
     * @return bool true when the process was registered, false when the system process id
     *              is unavailable or the configured processLimit is already reached
     * @todo preemption might not be the most elegant way to clean up
     */
    public function CLI_checkAndAcquireNewProcess($id)
    {
        $systemProcessId = getmypid();
        if (! $systemProcessId) {
            return false;
        }

        $activeProcesses = $this->processRepository->findAllActive();
        $now = $this->getCurrentTime();

        $runningCount = 0;
        $orphanProcessIds = [];

        /** @var Process $activeProcess */
        foreach ($activeProcesses as $activeProcess) {
            if ($activeProcess->getTtl() < $now) {
                $orphanProcessIds[] = $activeProcess->getProcessId();
                continue;
            }
            $runningCount++;
        }

        // if there are less than allowed active processes then add a new one
        $slotAvailable = $runningCount < (int) $this->extensionSettings['processLimit'];
        if ($slotAvailable) {
            $this->processRepository->addProcess($id, $systemProcessId);
        }

        $this->processRepository->deleteProcessesMarkedAsDeleted();
        $this->processRepository->markRequestedProcessesAsNotActive($orphanProcessIds);
        $this->queueRepository->unsetProcessScheduledAndProcessIdForQueueEntries($orphanProcessIds);

        return $slotAvailable;
    }
1558
1559
    /**
     * Release a process and the required resources
     *
     * Besides releasing the requested ids, two clean-up statements are run: queue entries
     * that belong to inactive processes are detached from them, and inactive processes
     * that still own unexecuted queue entries get their system process id reset.
     *
     * @param mixed $releaseIds string with a single process-id or array with multiple process-ids
     * @return bool false when an empty array was passed (nothing to release), true otherwise
     * @deprecated
     */
    public function CLI_releaseProcesses($releaseIds)
    {
        if (! is_array($releaseIds)) {
            $releaseIds = [$releaseIds];
        }

        if (empty($releaseIds)) {
            //nothing to release
            return false;
        }

        $connectionPool = GeneralUtility::makeInstance(ConnectionPool::class);

        // some kind of 2nd chance algo - this way you need at least 2 processes to have a real cleanup
        // this ensures that a single process can't mess up the entire process table

        // mark all processes as deleted which have no "waiting" queue-entires and which are not active

        // ReleaseQueueEntries
        // Each statement gets its own query builder so that no query parts or bound
        // parameters of one statement can leak into the next one.
        $queueQueryBuilder = $connectionPool->getQueryBuilderForTable(QueueRepository::TABLE_NAME);
        $queueQueryBuilder
            ->update(QueueRepository::TABLE_NAME, 'q')
            ->where(
                'q.process_id IN(SELECT p.process_id FROM tx_crawler_process as p WHERE p.active = 0)'
            )
            ->set('q.process_scheduled', 0)
            ->set('q.process_id', '')
            ->execute();

        // ReleaseProcessEntries
        $processQueryBuilder = $connectionPool->getQueryBuilderForTable(ProcessRepository::TABLE_NAME);
        $processQueryBuilder
            ->update(ProcessRepository::TABLE_NAME)
            ->where(
                $processQueryBuilder->expr()->eq('active', 0),
                'process_id IN(SELECT q.process_id FROM tx_crawler_queue as q WHERE q.exec_time = 0)'
            )
            ->set('system_process_id', 0)
            ->execute();

        $this->processRepository->markRequestedProcessesAsNotActive($releaseIds);
        $this->queueRepository->unsetProcessScheduledAndProcessIdForQueueEntries($releaseIds);

        return true;
    }
1612
1613
    /**
     * Returns the id of the current process, creating it on first use.
     *
     * The id is a short MD5 of the current microtime and is kept in $this->processID,
     * so later calls return the same value.
     *
     * @return string the ID
     */
    public function CLI_buildProcessId()
    {
        if ($this->processID) {
            return $this->processID;
        }

        $this->processID = GeneralUtility::shortMD5(microtime(true));

        return $this->processID;
    }
1625
1626
    /**
     * Echoes a message followed by a line break and flushes the output, but only when
     * the extension setting "processDebug" evaluates to a non-zero integer.
     *
     * @param string $msg the message
     * @deprecated
     * @codeCoverageIgnore
     */
    public function CLI_debug($msg): void
    {
        $debugEnabled = (int) $this->extensionSettings['processDebug'] !== 0;
        if (! $debugEnabled) {
            return;
        }

        echo $msg . "\n";
        flush();
    }
1640
1641
    /**
     * Cleans up entries that stayed for too long in the queue. These are:
     * - processed entries (exec_time set) older than "cleanUpProcessedAge" days
     * - entries scheduled at least "cleanUpScheduledAge" days ago
     *
     * Both ages are read from the extension settings and converted from days to seconds.
     *
     * @deprecated
     */
    public function cleanUpOldQueueEntries(): void
    {
        // 24*60*60 Seconds in 24 hours
        $secondsPerDay = 86400;
        $maxProcessedAge = $this->extensionSettings['cleanUpProcessedAge'] * $secondsPerDay;
        $maxScheduledAge = $this->extensionSettings['cleanUpScheduledAge'] * $secondsPerDay;

        $now = time();
        $processedBefore = $now - $maxProcessedAge;
        $scheduledBefore = $now - $maxScheduledAge;

        $this->flushQueue(
            '(exec_time<>0 AND exec_time<' . $processedBefore . ') OR scheduled<=' . $scheduledBefore
        );
    }
1658
1659
    /**
     * Removes queue entries
     *
     * Before the entries are deleted, SIGNAL_QUEUE_ENTRY_FLUSH is emitted once per
     * affected set_id with the matching qid/set_id rows as payload.
     *
     * @param string $where SQL related filter for the entries which should be removed;
     *                      it is used as raw SQL, so it must never contain untrusted input.
     *                      An empty value removes all entries.
     *
     * @deprecated
     */
    protected function flushQueue($where = ''): void
    {
        $realWhere = strlen((string) $where) > 0 ? $where : '1=1';
        $tableName = $this->tableName;

        // Every statement uses its own query builder; reusing one instance would carry
        // query parts and parameters over from the previous statement.
        $groups = $this->getQueryBuilder($tableName)
            ->selectLiteral('DISTINCT set_id')
            ->from($tableName)
            ->where($realWhere)
            ->execute()
            ->fetchAll();

        foreach ($groups as $group) {
            $subSetQueryBuilder = $this->getQueryBuilder($tableName);
            $subSet = $subSetQueryBuilder
                ->select('qid', 'set_id')
                ->from($tableName)
                ->where(
                    $realWhere,
                    $subSetQueryBuilder->expr()->eq(
                        'set_id',
                        // bound as parameter instead of being written into the SQL string
                        $subSetQueryBuilder->createNamedParameter((int) $group['set_id'], PDO::PARAM_INT)
                    )
                )
                ->execute()
                ->fetchAll();

            $payLoad = ['subSet' => $subSet];
            SignalSlotUtility::emitSignal(
                self::class,
                SignalSlotUtility::SIGNAL_QUEUE_ENTRY_FLUSH,
                $payLoad
            );
        }

        $this->getQueryBuilder($tableName)
            ->delete($tableName)
            ->where($realWhere)
            ->execute();
    }
1704
1705
    /**
     * This method determines duplicates for a queue entry with the same parameters and this timestamp.
     * If the timestamp is in the past, it will check if there is any unprocessed queue entry in the past.
     * If the timestamp is in the future it will check, if the queued entry has exactly the same timestamp
     *
     * Only entries without exec_time and process_id that share the page_id and
     * parameters_hash of $fieldArray are taken into account.
     *
     * @param int $tstamp Scheduled time of the entry to check
     * @param array $fieldArray Queue fields; "page_id" and "parameters_hash" are read
     *
     * @return array List of qid values of the duplicate entries
     * @deprecated
     */
    protected function getDuplicateRowsIfExist($tstamp, $fieldArray)
    {
        $rows = [];

        // $tstamp is untyped; normalise it once so it is compared and bound as an integer.
        $tstamp = (int) $tstamp;
        $currentTime = $this->getCurrentTime();

        $queryBuilder = GeneralUtility::makeInstance(ConnectionPool::class)->getQueryBuilderForTable(QueueRepository::TABLE_NAME);
        $queryBuilder
            ->select('qid')
            ->from(QueueRepository::TABLE_NAME);

        if ($tstamp <= $currentTime) {
            //if this entry is scheduled with "now"
            $scheduledUntilNow = $queryBuilder->expr()->lte(
                'scheduled',
                $queryBuilder->createNamedParameter($currentTime, PDO::PARAM_INT)
            );

            if ($this->extensionSettings['enableTimeslot']) {
                $timeBegin = $queryBuilder->createNamedParameter($currentTime - 100, PDO::PARAM_INT);
                $timeEnd = $queryBuilder->createNamedParameter($currentTime + 100, PDO::PARAM_INT);
                $queryBuilder
                    ->where('scheduled BETWEEN ' . $timeBegin . ' AND ' . $timeEnd)
                    ->orWhere($scheduledUntilNow);
            } else {
                $queryBuilder->where($scheduledUntilNow);
            }
        } else {
            //entry with a timestamp in the future need to have the same schedule time
            $queryBuilder
                ->where(
                    $queryBuilder->expr()->eq(
                        'scheduled',
                        $queryBuilder->createNamedParameter($tstamp, PDO::PARAM_INT)
                    )
                );
        }

        $queryBuilder
            ->andWhere('NOT exec_time')
            ->andWhere('NOT process_id')
            ->andWhere($queryBuilder->expr()->eq('page_id', $queryBuilder->createNamedParameter($fieldArray['page_id'], PDO::PARAM_INT)))
            ->andWhere($queryBuilder->expr()->eq('parameters_hash', $queryBuilder->createNamedParameter($fieldArray['parameters_hash'], PDO::PARAM_STR)));

        $statement = $queryBuilder->execute();

        while ($row = $statement->fetch()) {
            $rows[] = $row['qid'];
        }

        return $rows;
    }
1766
1767 5
    /**
     * Builds an md5 hash of the serialized configuration array, ignoring its
     * "paramExpanded" and "URLs" entries.
     *
     * @return string
     */
    protected function getConfigurationHash(array $configuration)
    {
        // The array is passed by value, so the caller's configuration stays untouched.
        unset($configuration['paramExpanded'], $configuration['URLs']);

        $serialized = serialize($configuration);

        return md5($serialized);
    }
1778
1779
    /**
     * Builds the URL for a page and query string by delegating to a new UrlService instance.
     *
     * @param int $httpsOrHttp see tx_crawler_configuration.force_ssl
     * @throws SiteNotFoundException
     * @throws InvalidRouteArgumentsException
     *
     * @deprecated Using CrawlerController::getUrlFromPageAndQueryParameters() is deprecated since 9.1.1 and will be removed in v11.x, please use UrlService->getUrlFromPageAndQueryParameters() instead.
     * @codeCoverageIgnore
     */
    protected function getUrlFromPageAndQueryParameters(int $pageId, string $queryString, ?string $alternativeBaseUrl, int $httpsOrHttp): UriInterface
    {
        return (new UrlService())->getUrlFromPageAndQueryParameters(
            $pageId,
            $queryString,
            $alternativeBaseUrl,
            $httpsOrHttp
        );
    }
1795
1796
    /**
     * Returns $reg with the values at index 1 and 2 exchanged when the value at
     * index 1 is larger than the one at index 2; otherwise $reg is returned as is.
     *
     * @deprecated
     */
    protected function swapIfFirstIsLargerThanSecond(array $reg): array
    {
        if ($reg[1] <= $reg[2]) {
            // Already in ascending order
            return $reg;
        }

        [$reg[1], $reg[2]] = [$reg[2], $reg[1]];

        return $reg;
    }
1810 7
1811
    /**
     * Creates a new PageService instance on every call.
     */
    protected function getPageService(): PageService
    {
        $pageService = new PageService();

        return $pageService;
    }
1815 7
1816
    /**
     * Returns the value of the $maximumUrlsToCompile property.
     */
    private function getMaximumUrlsToCompile(): int
    {
        $limit = $this->maximumUrlsToCompile;

        return $limit;
    }
1820 1
1821
    /**
     * Returns the backend user, taking it from $GLOBALS['BE_USER'] on first use and
     * keeping it in $this->backendUser afterwards.
     *
     * @return BackendUserAuthentication
     */
    private function getBackendUser()
    {
        // Make sure the _cli_ user is loaded
        Bootstrap::initializeBackendAuthentication();

        if ($this->backendUser !== null) {
            return $this->backendUser;
        }

        $this->backendUser = $GLOBALS['BE_USER'];

        return $this->backendUser;
    }
1833
1834 5
    /**
     * Requests a query builder for the given table from the connection pool.
     *
     * @return QueryBuilder
     */
    private function getQueryBuilder(string $table)
    {
        $connectionPool = GeneralUtility::makeInstance(ConnectionPool::class);

        return $connectionPool->getQueryBuilderForTable($table);
    }
1843
}
1844