Passed
Push — main ( d75d73...89511e )
by Tomas Norre
29:12 queued 24:29
created

CrawlerController::expandExcludeString()   A

Complexity

Conditions 1
Paths 1

Size

Total Lines 3
Code Lines 1

Duplication

Lines 0
Ratio 0 %

Code Coverage

Tests 2
CRAP Score 1

Importance

Changes 1
Bugs 0 Features 0
Metric Value
cc 1
eloc 1
nc 1
nop 1
dl 0
loc 3
ccs 2
cts 2
cp 1
crap 1
rs 10
c 1
b 0
f 0
1
<?php
2
3
declare(strict_types=1);
4
5
namespace AOE\Crawler\Controller;
6
7
/***************************************************************
8
 *  Copyright notice
9
 *
10
 *  (c) 2020 AOE GmbH <[email protected]>
11
 *
12
 *  All rights reserved
13
 *
14
 *  This script is part of the TYPO3 project. The TYPO3 project is
15
 *  free software; you can redistribute it and/or modify
16
 *  it under the terms of the GNU General Public License as published by
17
 *  the Free Software Foundation; either version 3 of the License, or
18
 *  (at your option) any later version.
19
 *
20
 *  The GNU General Public License can be found at
21
 *  http://www.gnu.org/copyleft/gpl.html.
22
 *
23
 *  This script is distributed in the hope that it will be useful,
24
 *  but WITHOUT ANY WARRANTY; without even the implied warranty of
25
 *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
26
 *  GNU General Public License for more details.
27
 *
28
 *  This copyright notice MUST APPEAR in all copies of the script!
29
 ***************************************************************/
30
31
use AOE\Crawler\Configuration\ExtensionConfigurationProvider;
32
use AOE\Crawler\Converter\JsonCompatibilityConverter;
33
use AOE\Crawler\Crawler;
34
use AOE\Crawler\CrawlStrategy\CrawlStrategyFactory;
35
use AOE\Crawler\Domain\Model\Process;
36
use AOE\Crawler\Domain\Repository\ConfigurationRepository;
37
use AOE\Crawler\Domain\Repository\ProcessRepository;
38
use AOE\Crawler\Domain\Repository\QueueRepository;
39
use AOE\Crawler\QueueExecutor;
40
use AOE\Crawler\Service\ConfigurationService;
41
use AOE\Crawler\Service\PageService;
42
use AOE\Crawler\Service\UrlService;
43
use AOE\Crawler\Utility\SignalSlotUtility;
44
use AOE\Crawler\Value\QueueFilter;
45
use PDO;
46
use Psr\Http\Message\UriInterface;
47
use Psr\Log\LoggerAwareInterface;
48
use Psr\Log\LoggerAwareTrait;
49
use TYPO3\CMS\Backend\Tree\View\PageTreeView;
50
use TYPO3\CMS\Backend\Utility\BackendUtility;
51
use TYPO3\CMS\Core\Authentication\BackendUserAuthentication;
52
use TYPO3\CMS\Core\Compatibility\PublicMethodDeprecationTrait;
53
use TYPO3\CMS\Core\Compatibility\PublicPropertyDeprecationTrait;
54
use TYPO3\CMS\Core\Core\Bootstrap;
55
use TYPO3\CMS\Core\Core\Environment;
56
use TYPO3\CMS\Core\Database\Connection;
57
use TYPO3\CMS\Core\Database\ConnectionPool;
58
use TYPO3\CMS\Core\Database\Query\QueryBuilder;
59
use TYPO3\CMS\Core\Database\Query\Restriction\DeletedRestriction;
60
use TYPO3\CMS\Core\Database\QueryGenerator;
61
use TYPO3\CMS\Core\Domain\Repository\PageRepository;
62
use TYPO3\CMS\Core\Exception\SiteNotFoundException;
63
use TYPO3\CMS\Core\Imaging\Icon;
64
use TYPO3\CMS\Core\Imaging\IconFactory;
65
use TYPO3\CMS\Core\Routing\InvalidRouteArgumentsException;
66
use TYPO3\CMS\Core\Site\Entity\Site;
67
use TYPO3\CMS\Core\Type\Bitmask\Permission;
68
use TYPO3\CMS\Core\Utility\DebugUtility;
69
use TYPO3\CMS\Core\Utility\GeneralUtility;
70
use TYPO3\CMS\Core\Utility\MathUtility;
71
use TYPO3\CMS\Extbase\Object\ObjectManager;
72
73
/**
74
 * Class CrawlerController
75
 *
76
 * @package AOE\Crawler\Controller
77
 */
78
class CrawlerController implements LoggerAwareInterface
79
{
80
    use LoggerAwareTrait;
81
    use PublicMethodDeprecationTrait;
82
    use PublicPropertyDeprecationTrait;
83
84
    /**
85
     * @deprecated since 9.2.5 will be removed in v11.x
86
     */
87
    public const CLI_STATUS_NOTHING_PROCCESSED = 0;
88
89
    /**
90
     * queue not empty
91
     * @deprecated since 9.2.5 will be removed in v11.x
92
     */
93
    public const CLI_STATUS_REMAIN = 1;
94
95
    /**
96
     * (some) queue items were processed
97
     * @deprecated since 9.2.5 will be removed in v11.x
98
     */
99
    public const CLI_STATUS_PROCESSED = 2;
100
101
    /**
102
     * instance didn't finish
103
     * @deprecated since 9.2.5 will be removed in v11.x
104
     */
105
    public const CLI_STATUS_ABORTED = 4;
106
107
    /**
108
     * @deprecated since 9.2.5 will be removed in v11.x
109
     */
110
    public const CLI_STATUS_POLLABLE_PROCESSED = 8;
111
112
    /**
113
     * @var integer
114
     */
115
    public $setID = 0;
116
117
    /**
118
     * @var string
119
     */
120
    public $processID = '';
121
122
    /**
123
     * @var array
124
     */
125
    public $duplicateTrack = [];
126
127
    /**
128
     * @var array
129
     */
130
    public $downloadUrls = [];
131
132
    /**
133
     * @var array
134
     */
135
    public $incomingProcInstructions = [];
136
137
    /**
138
     * @var array
139
     */
140
    public $incomingConfigurationSelection = [];
141
142
    /**
143
     * @var bool
144
     */
145
    public $registerQueueEntriesInternallyOnly = false;
146
147
    /**
148
     * @var array
149
     */
150
    public $queueEntries = [];
151
152
    /**
153
     * @var array
154
     */
155
    public $urlList = [];
156
157
    /**
158
     * @var array
159
     */
160
    public $extensionSettings = [];
161
162
    /**
163
     * Mount Point
164
     *
165
     * @var bool
166
     * Todo: Check what this is used for and adjust the type hint or code, as bool doesn't match the current code.
167
     */
168
    public $MP = false;
169
170
    /**
171
     * @var string
172
     * @deprecated
173
     */
174
    protected $processFilename;
175
176
    /**
177
     * Holds the internal access mode can be 'gui','cli' or 'cli_im'
178
     *
179
     * @var string
180
     * @deprecated
181
     */
182
    protected $accessMode;
183
184
    /**
185
     * @var QueueRepository
186
     */
187
    protected $queueRepository;
188
189
    /**
190
     * @var ProcessRepository
191
     */
192
    protected $processRepository;
193
194
    /**
195
     * @var ConfigurationRepository
196
     */
197
    protected $configurationRepository;
198
199
    /**
200
     * @var string
201
     * @deprecated Since v9.2.5 - This will be removed in v10
202
     */
203
    protected $tableName = 'tx_crawler_queue';
204
205
    /**
206
     * @var QueueExecutor
207
     */
208
    protected $queueExecutor;
209
210
    /**
211
     * @var int
212
     */
213
    protected $maximumUrlsToCompile = 10000;
214
215
    /**
216
     * @var IconFactory
217
     */
218
    protected $iconFactory;
219
220
    /**
221
     * @var string[]
222
     */
223
    private $deprecatedPublicMethods = [
        // Method name => deprecation message.
        'compileUrls' => 'Using CrawlerController->compileUrls() is deprecated since 9.2.5, and will be removed in v11.x',
        'cleanUpOldQueueEntries' => 'Using CrawlerController::cleanUpOldQueueEntries() is deprecated since 9.0.1 and will be removed in v11.x, please use QueueRepository->cleanUpOldQueueEntries() instead.',
        'CLI_buildProcessId' => 'Using CrawlerController->CLI_buildProcessId() is deprecated since 9.2.5 and will be removed in v11.x',
        'CLI_checkAndAcquireNewProcess' => 'Using CrawlerController->CLI_checkAndAcquireNewProcess() is deprecated since 9.2.5 and will be removed in v11.x',
        'CLI_debug' => 'Using CrawlerController->CLI_debug() is deprecated since 9.1.3 and will be removed in v11.x',
        'CLI_releaseProcesses' => 'Using CrawlerController->CLI_releaseProcesses() is deprecated since 9.2.2 and will be removed in v11.x',
        'CLI_run' => 'Using CrawlerController->CLI_run() is deprecated since 9.2.2 and will be removed in v11.x',
        'CLI_runHooks' => 'Using CrawlerController->CLI_runHooks() is deprecated since 9.1.5 and will be removed in v11.x',
        'expandExcludeString' => 'Using CrawlerController->expandExcludeString() is deprecated since 9.2.5 and will be removed in v11.x',
        'getAccessMode' => 'Using CrawlerController->getAccessMode() is deprecated since 9.1.3 and will be removed in v11.x',
        'getLogEntriesForPageId' => 'Using CrawlerController->getLogEntriesForPageId() is deprecated since 9.1.5 and will be remove in v11.x',
        'getLogEntriesForSetId' => 'Using crawlerController::getLogEntriesForSetId() is deprecated since 9.0.1 and will be removed in v11.x',
        // The message previously named getLogEntriesForPageId() by mistake.
        'hasGroupAccess' => 'Using CrawlerController->hasGroupAccess() is deprecated since 9.2.2 and will be remove in v11.x, please use UserService::hasGroupAccess() instead.',
        'flushQueue' => 'Using CrawlerController::flushQueue() is deprecated since 9.0.1 and will be removed in v11.x, please use QueueRepository->flushQueue() instead.',
        'setAccessMode' => 'Using CrawlerController->setAccessMode() is deprecated since 9.1.3 and will be removed in v11.x',
        'getDisabled' => 'Using CrawlerController->getDisabled() is deprecated since 9.1.3 and will be removed in v11.x, please use Crawler->isDisabled() instead',
        'setDisabled' => 'Using CrawlerController->setDisabled() is deprecated since 9.1.3 and will be removed in v11.x, please use Crawler->setDisabled() instead',
        'getProcessFilename' => 'Using CrawlerController->getProcessFilename() is deprecated since 9.1.3 and will be removed in v11.x',
        'setProcessFilename' => 'Using CrawlerController->setProcessFilename() is deprecated since 9.1.3 and will be removed in v11.x',
        'getDuplicateRowsIfExist' => 'Using CrawlerController->getDuplicateRowsIfExist() is deprecated since 9.1.4 and will be remove in v11.x, please use QueueRepository->getDuplicateQueueItemsIfExists() instead',
        'checkIfPageShouldBeSkipped' => 'Using CrawlerController->checkIfPageShouldBeSkipped() is deprecated since 9.2.5 and will be removed in v11.x',
        'swapIfFirstIsLargerThanSecond' => 'Using CrawlerController->swapIfFirstIsLargerThanSecond() is deprecated since 9.2.5, and will be removed in v11.x',
        'expandParameters' => 'Using CrawlerController->expandParameters() is deprecated since 9.2.5, and will be removed in v11.x',
    ];
248
249
    /**
250
     * @var string[]
251
     */
252
    private $deprecatedPublicProperties = [
        // Property name => deprecation message.
        'accessMode' => 'Using CrawlerController->accessMode is deprecated since 9.1.3 and will be removed in v11.x',
        // The message previously named ->accessMode by mistake.
        'processFilename' => 'Using CrawlerController->processFilename is deprecated since 9.1.3 and will be removed in v11.x',
    ];
256
257
    /**
258
     * @var BackendUserAuthentication|null
259
     */
260
    private $backendUser;
261
262
    /**
263
     * @var integer
264
     */
265
    private $scheduledTime = 0;
266
267
    /**
268
     * @var integer
269
     */
270
    private $reqMinute = 0;
271
272
    /**
273
     * @var bool
274
     */
275
    private $submitCrawlUrls = false;
276
277
    /**
278
     * @var bool
279
     */
280
    private $downloadCrawlUrls = false;
281
282
    /**
283
     * @var PageRepository
284
     */
285
    private $pageRepository;
286
287
    /**
288
     * @var Crawler
289
     */
290
    private $crawler;
291
292
    /**
293
     * @var ConfigurationService
294
     */
295
    private $configurationService;
296
297
    /**
298
     * @var UrlService
299
     */
300
    private $urlService;
301
302
    /************************************
303
     *
304
     * Getting URLs based on Page TSconfig
305
     *
306
     ************************************/
307
308 41
    /**
     * Instantiates the repositories and services used by this controller,
     * sets the default process file path and loads the extension settings.
     *
     * Settings that are absent from the extension configuration are read as
     * null (without raising an "undefined index" notice) and then normalised:
     * - countInARun is set to 100 when it does not convert to a positive integer,
     * - processLimit is forced into the range 1..99 (default 1),
     * - maxCompileUrls is forced into the range 1..1000000000 (default 10000).
     */
    public function __construct()
    {
        $objectManager = GeneralUtility::makeInstance(ObjectManager::class);
        $crawlStrategyFactory = GeneralUtility::makeInstance(CrawlStrategyFactory::class);
        $this->queueRepository = $objectManager->get(QueueRepository::class);
        $this->processRepository = $objectManager->get(ProcessRepository::class);
        $this->configurationRepository = $objectManager->get(ConfigurationRepository::class);
        $this->pageRepository = GeneralUtility::makeInstance(PageRepository::class);
        $this->queueExecutor = GeneralUtility::makeInstance(QueueExecutor::class, $crawlStrategyFactory);
        $this->iconFactory = GeneralUtility::makeInstance(IconFactory::class);
        $this->crawler = GeneralUtility::makeInstance(Crawler::class);
        $this->configurationService = GeneralUtility::makeInstance(ConfigurationService::class);
        $this->urlService = GeneralUtility::makeInstance(UrlService::class);

        $this->processFilename = Environment::getVarPath() . '/lock/tx_crawler.proc';

        /** @var ExtensionConfigurationProvider $configurationProvider */
        $configurationProvider = GeneralUtility::makeInstance(ExtensionConfigurationProvider::class);
        $settings = $configurationProvider->getExtensionConfiguration();
        $this->extensionSettings = is_array($settings) ? $settings : [];

        // The settings array may be empty (see above), so every key is read with "?? null".
        if (MathUtility::convertToPositiveInteger($this->extensionSettings['countInARun'] ?? null) === 0) {
            $this->extensionSettings['countInARun'] = 100;
        }

        $this->extensionSettings['processLimit'] = MathUtility::forceIntegerInRange(
            $this->extensionSettings['processLimit'] ?? null,
            1,
            99,
            1
        );
        $this->setMaximumUrlsToCompile(
            MathUtility::forceIntegerInRange(
                $this->extensionSettings['maxCompileUrls'] ?? null,
                1,
                1000000000,
                10000
            )
        );
    }
336
337 41
    /**
     * Stores the value in $this->maximumUrlsToCompile.
     *
     * @param int $maximumUrlsToCompile New value for the property
     */
    public function setMaximumUrlsToCompile(int $maximumUrlsToCompile): void
    {
        $this->maximumUrlsToCompile = $maximumUrlsToCompile;
    }
341
342
    /**
     * Returns the internal access mode, which can be 'gui', 'cli' or 'cli_im'.
     *
     * @return string
     * @deprecated
     */
    public function getAccessMode()
    {
        $currentAccessMode = $this->accessMode;

        return $currentAccessMode;
    }
352
353
    /**
     * Stores the internal access mode, which can be 'gui', 'cli' or 'cli_im'.
     *
     * @param string $accessMode Access mode to store
     * @deprecated
     */
    public function setAccessMode($accessMode): void
    {
        $this->accessMode = $accessMode;
    }
361
362
    /**
     * Sets the disabled status to prevent processes from being processed.
     *
     * A truthy $disabled writes the string 'disabled' to the process file;
     * otherwise the process file is deleted if it exists.
     *
     * @param bool|null $disabled Whether the crawler should be marked as disabled
     * @deprecated
     */
    public function setDisabled(?bool $disabled = true): void
    {
        if (! $disabled) {
            // Enabling: remove the marker file, if there is one.
            if (is_file($this->processFilename)) {
                unlink($this->processFilename);
            }

            return;
        }

        GeneralUtility::writeFile($this->processFilename, 'disabled');
    }
374
375
    /**
     * Returns the disabled status: true when the process file exists.
     *
     * @deprecated
     */
    public function getDisabled(): bool
    {
        $markerFileExists = is_file($this->processFilename);

        return $markerFileExists;
    }
383
384
    /**
     * Stores the path of the process file checked by getDisabled() and
     * written or removed by setDisabled().
     *
     * @param string $filenameWithPath File name including its path
     * @deprecated
     */
    public function setProcessFilename($filenameWithPath): void
    {
        $this->processFilename = $filenameWithPath;
    }
392
393
    /**
     * Returns the currently configured path of the process file.
     *
     * @return string
     * @deprecated
     */
    public function getProcessFilename()
    {
        $currentProcessFilename = $this->processFilename;

        return $currentProcessFilename;
    }
401
402
    /**
     * Sets the extension settings (the unserialized counterpart of
     * $TYPO3_CONF_VARS['EXT']['extConf']['crawler']).
     *
     * The array is stored as given; it is not normalised the way the
     * constructor normalises the loaded configuration.
     *
     * @param array $extensionSettings Complete settings array
     */
    public function setExtensionSettings(array $extensionSettings): void
    {
        $this->extensionSettings = $extensionSettings;
    }
409
410
    /**
     * Checks whether the given page should be crawled by delegating to
     * PageService::checkIfPageShouldBeSkipped().
     *
     * @param array $pageRow Page record
     * @return false|string false if the page should be crawled (not excluded), otherwise the skip message
     * @deprecated
     */
    public function checkIfPageShouldBeSkipped(array $pageRow)
    {
        return GeneralUtility::makeInstance(PageService::class)
            ->checkIfPageShouldBeSkipped($pageRow);
    }
421
422
    /**
     * Wrapper around getUrlsForPageId() that first validates the page row.
     * It returns an array of configurations and no urls!
     *
     * $skipMessage is written on every call: it receives the reason when the
     * page is skipped and is reset to an empty string otherwise.
     *
     * @param array $pageRow Page record with at least dok-type and uid columns.
     * @param string $skipMessage Receives the skip reason (passed by reference)
     * @return array Empty when the uid is not an integer or the page is skipped
     * @see getUrlsForPageId()
     */
    public function getUrlsForPageRow(array $pageRow, &$skipMessage = '')
    {
        // Guard: the uid has to be a real integer, numeric strings are rejected.
        if (! is_int($pageRow['uid'])) {
            $skipMessage = 'PageUid ' . $pageRow['uid'] . ' was not an integer';
            return [];
        }

        $skipReason = $this->getPageService()->checkIfPageShouldBeSkipped($pageRow);
        if ($skipReason !== false) {
            $skipMessage = $skipReason;
            return [];
        }

        $configurations = $this->getUrlsForPageId($pageRow['uid']);
        $skipMessage = '';

        return $configurations;
    }
449
450
    /**
     * Creates a list of URLs from input array (and submits them to queue if asked for)
     * See Web > Info module script + "indexed_search"'s crawler hook-client using this!
     *
     * @param array $vv Information about URLs from pageRow to crawl. Read keys: 'URLs' and 'subCfg'
     *                  ('procInstrFilter', 'userGroups', 'baseUrl', 'force_ssl').
     * @param array $pageRow Page row
     * @param int $scheduledTime Unix time to schedule indexing to, typically time()
     * @param int $reqMinute Number of requests per minute (creates the interleave between requests).
     *                       A value of 0 or less disables the interleave instead of dividing by zero.
     * @param bool $submitCrawlUrls If set, submits the URLs to queue
     * @param bool $downloadCrawlUrls If set (and $submitCrawlUrls is false) will fill $downloadUrls with entries
     * @param array $duplicateTrack Array which is passed by reference and contains an id per url to ensure we will not crawl duplicates
     * @param array $downloadUrls Array which will be filled with URLs for download if flag is set.
     * @param array $incomingProcInstructions Array of processing instructions
     * @return string List of URLs (meant for display in backend module), joined by "<br>"
     */
    public function urlListFromUrlArray(
        array $vv,
        array $pageRow,
        $scheduledTime,
        $reqMinute,
        $submitCrawlUrls,
        $downloadCrawlUrls,
        array &$duplicateTrack,
        array &$downloadUrls,
        array $incomingProcInstructions
    ) {
        if (! is_array($vv['URLs'] ?? null)) {
            return 'ERROR - no URL generated';
        }
        $urlLog = [];
        $pageId = (int) $pageRow['uid'];
        $configurationHash = $this->getConfigurationHash($vv);
        $skipInnerCheck = $this->queueRepository->noUnprocessedQueueEntriesForPageWithConfigurationHashExist($pageId, $configurationHash);

        $urlService = new UrlService();

        // Optional sub configuration values; a missing key behaves like an empty string.
        $procInstrFilter = $vv['subCfg']['procInstrFilter'] ?? '';
        $userGroups = $vv['subCfg']['userGroups'] ?? '';

        // The filter does not depend on the individual URL, so it is evaluated once:
        // when it does not match, no URL of this configuration is listed or tracked.
        if (! $this->drawURLs_PIfilter($procInstrFilter, $incomingProcInstructions)) {
            return implode('<br>', $urlLog);
        }

        // Seconds between two consecutive requests. A non-positive rate means "no interleave".
        $secondsPerRequest = $reqMinute > 0 ? 60 / $reqMinute : 0;

        foreach ($vv['URLs'] as $urlQuery) {
            $url = (string) $urlService->getUrlFromPageAndQueryParameters(
                $pageId,
                $urlQuery,
                $vv['subCfg']['baseUrl'] ?? null,
                $vv['subCfg']['force_ssl'] ?? 0
            );

            // Create key by which to determine unique-ness:
            $uKey = $url . '|' . $userGroups . '|' . $procInstrFilter;

            if (isset($duplicateTrack[$uKey])) {
                // If the url key is already registered just display it and do not resubmit it
                $urlLog[] = '<em><span class="text-muted">' . htmlspecialchars($url) . '</span></em>';
            } else {
                // Scheduled time, offset by the number of URLs tracked so far and cut down to the full minute:
                $schTime = $scheduledTime + round(count($duplicateTrack) * $secondsPerRequest);
                $schTime = intval($schTime / 60) * 60;
                $formattedDate = BackendUtility::datetime($schTime);
                $this->urlList[] = '[' . $formattedDate . '] ' . $url;
                $urlList = '[' . $formattedDate . '] ' . htmlspecialchars($url);

                // Submit for crawling!
                if ($submitCrawlUrls) {
                    // NOTE(review): the queue entry is added with $scheduledTime, not with the
                    // interleaved $schTime that is displayed - confirm this is intended.
                    $added = $this->addUrl(
                        $pageId,
                        $url,
                        $vv['subCfg'],
                        $scheduledTime,
                        $configurationHash,
                        $skipInnerCheck
                    );
                    if ($added === false) {
                        $urlList .= ' (URL already existed)';
                    }
                } elseif ($downloadCrawlUrls) {
                    $downloadUrls[$url] = $url;
                }
                $urlLog[] = $urlList;
            }
            $duplicateTrack[$uKey] = true;
        }

        return implode('<br>', $urlLog);
    }
534
535
    /**
     * Returns true if input processing instruction is among registered ones.
     *
     * An empty list of incoming processing instructions matches everything.
     * Otherwise each incoming instruction is looked up in $piString with
     * GeneralUtility::inList() and the first hit wins.
     *
     * @param string $piString PI to test
     * @param array $incomingProcInstructions Processing instructions
     * @return boolean
     */
    public function drawURLs_PIfilter($piString, array $incomingProcInstructions)
    {
        // No restriction given: everything passes.
        $isAccepted = $incomingProcInstructions === [];

        foreach ($incomingProcInstructions as $procInstruction) {
            if (GeneralUtility::inList($piString, $procInstruction)) {
                $isAccepted = true;
                break;
            }
        }

        return $isAccepted;
    }
555
556 9
    /**
     * Returns the page TSconfig for the given page id, or for the page id
     * taken from $this->MP when a mount point is set.
     *
     * Afterwards every function registered in
     * $GLOBALS['TYPO3_CONF_VARS']['EXTCONF']['crawler']['getPageTSconfigForId']
     * is called with the page id and a reference to the configuration
     * ('pageTSConfig'), so hooks can alter the returned array.
     *
     * @param int $id Page uid
     * @return array Page TSconfig, possibly modified by the registered hooks
     */
    public function getPageTSconfigForId(int $id): array
    {
        $tsConfigPageId = $id;
        if ($this->MP) {
            // The value is split at "-" and the second part is used as page id.
            // NOTE(review): presumably the format is "<mounted page>-<mount point page>" - confirm.
            // The string cast keeps explode() working under strict_types when MP is not a string;
            // without a second part the given page id is used.
            $mountPointParts = explode('-', (string) $this->MP);
            if (isset($mountPointParts[1])) {
                $tsConfigPageId = (int) $mountPointParts[1];
            }
        }
        $pageTSconfig = BackendUtility::getPagesTSconfig($tsConfigPageId);

        // Call a hook to alter configuration. The hook list is optional, so a missing
        // registration must not raise an "undefined index" notice.
        $hooks = $GLOBALS['TYPO3_CONF_VARS']['EXTCONF']['crawler']['getPageTSconfigForId'] ?? null;
        if (is_array($hooks)) {
            $params = [
                'pageId' => $id,
                'pageTSConfig' => &$pageTSconfig,
            ];
            foreach ($hooks as $userFunc) {
                GeneralUtility::callUserFunction($userFunc, $params, $this);
            }
        }

        return $pageTSconfig;
    }
578
579
    /**
     * Returns the crawler configurations for a page.
     * Adds no urls!
     *
     * Configurations are collected from the page TSconfig first and from the
     * database second. Each function registered in
     * $GLOBALS['TYPO3_CONF_VARS']['EXTCONF']['crawler']['processUrls'] is then
     * called with a reference to the result under the key 'res'.
     *
     * @param int $pageId Page uid
     * @return array Collected configurations
     */
    public function getUrlsForPageId(int $pageId): array
    {
        $tsConfig = $this->getPageTSconfigForId($pageId);

        // Only a string mount point is handed on, anything else becomes ''.
        $mountPointParameter = '';
        if (is_string($this->MP)) {
            $mountPointParameter = $this->MP;
        }

        $configurations = [];

        // Step 1: crawler configuration from the page TSconfig
        $configurations = $this->configurationService->getConfigurationFromPageTS(
            $tsConfig,
            $pageId,
            $configurations,
            $mountPointParameter
        );

        // Step 2: configuration from tx_crawler_configuration records up the rootline
        $configurations = $this->configurationService->getConfigurationFromDatabase($pageId, $configurations);

        // Step 3: let registered hooks post-process the result by reference
        $hookFunctions = $GLOBALS['TYPO3_CONF_VARS']['EXTCONF']['crawler']['processUrls'] ?? [];
        foreach ($hookFunctions as $hookFunction) {
            $hookParameters = [
                'res' => &$configurations,
            ];
            GeneralUtility::callUserFunction($hookFunction, $hookParameters, $this);
        }

        return $configurations;
    }
606
607
    /**
     * Finds the names of all configurations that apply to a page and its subpages.
     * TODO: Write Functional Tests
     *
     * The result contains the keys of the array-valued entries in
     * tx_crawler.crawlerCfg.paramSets of the root page TSconfig (without the
     * trailing dot), followed by the names of the configuration records the
     * configuration repository returns for the rootline and page tree uids.
     *
     * @param int $rootid Uid of the page the branch starts at
     * @param int $depth Depth passed to the page tree
     * @return array List of configuration names
     */
    public function getConfigurationsForBranch(int $rootid, int $depth): array
    {
        $names = [];

        // Part 1: parameter sets defined in page TSconfig
        $tsConfig = $this->getPageTSconfigForId($rootid);
        $parameterSets = $tsConfig['tx_crawler.']['crawlerCfg.']['paramSets.'] ?? [];
        foreach ($parameterSets as $setName => $setConfiguration) {
            if (is_array($setConfiguration)) {
                // Strip exactly one trailing dot from the key
                $names[] = substr($setName, -1) === '.' ? substr($setName, 0, -1) : $setName;
            }
        }

        // Part 2: collect the page uids of the rootline ...
        $pageIds = [];
        foreach (BackendUtility::BEgetRootLine($rootid) as $rootLinePage) {
            $pageIds[] = $rootLinePage['uid'];
        }

        // ... and of the page tree below the root page, limited by the backend user's permissions
        /* @var PageTreeView $pageTree */
        $pageTree = GeneralUtility::makeInstance(PageTreeView::class);
        $permissionClause = $this->getBackendUser()->getPagePermsClause(Permission::PAGE_SHOW);
        if (empty($permissionClause)) {
            $pageTree->init('');
        } else {
            $pageTree->init('AND ' . $permissionClause);
        }
        $pageTree->getTree($rootid, $depth, '');
        foreach ($pageTree->tree as $treeNode) {
            $pageIds[] = $treeNode['row']['uid'];
        }

        // Part 3: configuration records found for those pages
        $records = $this->configurationRepository->getCrawlerConfigurationRecordsFromRootLine($rootid, $pageIds);
        foreach ($records as $record) {
            $names[] = $record['name'];
        }

        return $names;
    }
643
644
    /**
     * Checks if a user has access to an item
     * (e.g. get the group list of the currently logged in user from $GLOBALS['TSFE']->gr_list)
     *
     * @param string $groupList Comma-separated list of (fe_)group UIDs from a user
     * @param string $accessList Comma-separated list of (fe_)group UIDs of the item to access
     * @return bool TRUE if at least one of the user's group UIDs is in the access list or the access list is empty
     * @see \TYPO3\CMS\Frontend\Page\PageRepository::getMultipleGroupsWhereClause()
     * @deprecated
     * @codeCoverageIgnore
     */
    public function hasGroupAccess($groupList, $accessList)
    {
        // An empty access list does not restrict anything.
        $accessGranted = empty($accessList);

        if (! $accessGranted) {
            $userGroupUids = GeneralUtility::intExplode(',', $groupList);
            foreach ($userGroupUids as $userGroupUid) {
                if (GeneralUtility::inList($accessList, $userGroupUid)) {
                    $accessGranted = true;
                    break;
                }
            }
        }

        return $accessGranted;
    }
667
668
    /**
     * Will expand the parameters configuration to individual values. This follows a certain syntax of the value of each parameter.
     * Syntax of values:
     * - Basically: If the value is wrapped in [...] it will be expanded according to the following syntax, otherwise the value is taken literally
     * - Configuration is split by "|" and the parts are processed individually and finally added together
     * - For each configuration part:
     *         - "[int]-[int]" = Integer range, will be expanded to all values in between, values included, starting from low to high (max. 1000). Example "1-34" or "-40--30"
     *         - "_TABLE:[TCA table name];[_PID:[optional page id, default is current page]];[_ENABLELANG:1]" = Look up of table records from PID, filtering out deleted records. Example "_TABLE:tt_content; _PID:123"
     *           Further optional sub-parameters evaluated here: _RECURSIVE (tree depth), _PIDFIELD (default "pid"),
     *           _WHERE (additional where clause), _ADDTABLE (additional "from" part) and _FIELD (default "uid").
     *           _ENABLELANG:1 picks only original records without their language overlays
     *         - Default: Literal value
     *
     * After every part the hooks registered in
     * $GLOBALS['TYPO3_CONF_VARS']['SC_OPTIONS']['crawler/class.tx_crawler_lib.php']['expandParameters']
     * are called and may modify the parameter array by reference.
     *
     * Optional sub-parameters, TCA entries and the hook registry are only read when they exist,
     * so a configuration that omits them no longer raises "undefined index" notices.
     *
     * @param array $paramArray Array with key (GET var name) and values (value of GET var which is configuration for expansion)
     * @param int $pid Current page ID
     * @return array The input array where every value has been replaced by the list of its expanded values
     * @deprecated
     * @codeCoverageIgnore
     */
    public function expandParameters($paramArray, $pid)
    {
        // Traverse parameter names:
        foreach ($paramArray as $p => $v) {
            $v = trim($v);

            // If value is encapsulated in square brackets it means there are some ranges of values to find, otherwise the value is literal
            if (strpos($v, '[') === 0 && substr($v, -1) === ']') {
                // So, find the value inside brackets and reset the paramArray value as an array.
                $v = substr($v, 1, -1);
                $paramArray[$p] = [];

                // Explode parts and traverse them:
                $parts = explode('|', $v);
                foreach ($parts as $pV) {
                    // Look for integer range: (fx. 1-34 or -40--30 // reads minus 40 to minus 30)
                    if (preg_match('/^(-?[0-9]+)\s*-\s*(-?[0-9]+)$/', trim($pV), $reg)) {
                        // Presumably orders the two boundaries so that $reg[1] <= $reg[2] (helper is defined elsewhere in the class).
                        $reg = $this->swapIfFirstIsLargerThanSecond($reg);

                        // Traverse range, add values:
                        // Limit to size of range!
                        $runAwayBrake = 1000;
                        for ($a = $reg[1]; $a <= $reg[2]; $a++) {
                            $paramArray[$p][] = $a;
                            $runAwayBrake--;
                            if ($runAwayBrake <= 0) {
                                break;
                            }
                        }
                    } elseif (strpos(trim($pV), '_TABLE:') === 0) {
                        // Parse parameters:
                        $subparts = GeneralUtility::trimExplode(';', $pV);
                        $subpartParams = [];
                        foreach ($subparts as $spV) {
                            $keyAndValue = GeneralUtility::trimExplode(':', $spV);
                            // A sub-part without ":" carries no value; store null so the isset() checks below fall back to their defaults.
                            $subpartParams[$keyAndValue[0]] = $keyAndValue[1] ?? null;
                        }

                        $tableName = $subpartParams['_TABLE'] ?? '';

                        // Table exists:
                        if (isset($GLOBALS['TCA'][$tableName])) {
                            $lookUpPid = isset($subpartParams['_PID']) ? intval($subpartParams['_PID']) : intval($pid);
                            $recursiveDepth = isset($subpartParams['_RECURSIVE']) ? intval($subpartParams['_RECURSIVE']) : 0;
                            $pidField = isset($subpartParams['_PIDFIELD']) ? trim($subpartParams['_PIDFIELD']) : 'pid';
                            $where = $subpartParams['_WHERE'] ?? '';
                            $addTable = $subpartParams['_ADDTABLE'] ?? '';

                            // Missing or empty _FIELD means "uid"; any other field must be configured in TCA.
                            $fieldName = ! empty($subpartParams['_FIELD']) ? $subpartParams['_FIELD'] : 'uid';
                            if ($fieldName === 'uid' || ! empty($GLOBALS['TCA'][$tableName]['columns'][$fieldName])) {
                                $queryBuilder = $this->getQueryBuilder($tableName);

                                if ($recursiveDepth > 0) {
                                    /** @var QueryGenerator $queryGenerator */
                                    $queryGenerator = GeneralUtility::makeInstance(QueryGenerator::class);
                                    $pidList = $queryGenerator->getTreeList($lookUpPid, $recursiveDepth, 0, 1);
                                    $pidArray = GeneralUtility::intExplode(',', $pidList);
                                } else {
                                    $pidArray = [(string) $lookUpPid];
                                }

                                // Only deleted records are filtered out; all other default restrictions are dropped.
                                $queryBuilder->getRestrictions()
                                    ->removeAll()
                                    ->add(GeneralUtility::makeInstance(DeletedRestriction::class));

                                $queryBuilder
                                    ->select($fieldName)
                                    ->from($tableName)
                                    ->where(
                                        $queryBuilder->expr()->in($pidField, $queryBuilder->createNamedParameter($pidArray, Connection::PARAM_INT_ARRAY)),
                                        $where
                                    );

                                if (! empty($addTable)) {
                                    // TODO: Check if this works as intended!
                                    $queryBuilder->add('from', $addTable);
                                }
                                $transOrigPointerField = $GLOBALS['TCA'][$tableName]['ctrl']['transOrigPointerField'] ?? null;

                                // _ENABLELANG: keep only records whose translation pointer is <= 0.
                                if (! empty($subpartParams['_ENABLELANG']) && $transOrigPointerField) {
                                    $queryBuilder->andWhere(
                                        $queryBuilder->expr()->lte(
                                            $transOrigPointerField,
                                            0
                                        )
                                    );
                                }

                                $statement = $queryBuilder->execute();

                                // Index by field value so every value is collected only once.
                                $rows = [];
                                while ($row = $statement->fetch()) {
                                    $rows[$row[$fieldName]] = $row;
                                }

                                $paramArray[$p] = array_merge($paramArray[$p], array_keys($rows));
                            }
                        }
                    } else {
                        // Just add value:
                        $paramArray[$p][] = $pV;
                    }

                    // Hook for processing own expandParameters place holder
                    $expandParametersHooks = $GLOBALS['TYPO3_CONF_VARS']['SC_OPTIONS']['crawler/class.tx_crawler_lib.php']['expandParameters'] ?? null;
                    if (is_array($expandParametersHooks)) {
                        $_params = [
                            'pObj' => &$this,
                            'paramArray' => &$paramArray,
                            'currentKey' => $p,
                            'currentValue' => $pV,
                            'pid' => $pid,
                        ];
                        foreach ($expandParametersHooks as $_funcRef) {
                            GeneralUtility::callUserFunction($_funcRef, $_params, $this);
                        }
                    }
                }

                // Make unique set of values and sort the parameter array by key (parameter name):
                $paramArray[$p] = array_unique($paramArray[$p]);
                ksort($paramArray);
            } else {
                // Set the literal value as only value in array:
                $paramArray[$p] = [$v];
            }
        }

        return $paramArray;
    }
816
817
    /**
     * Compiles URLs from a parameter array (the output of expandParameters()).
     * The number of URLs will be the multiplication of the number of parameter values for each key.
     *
     * The work is delegated to the URL service; the upper limit of URLs is taken
     * from getMaximumUrlsToCompile().
     *
     * @param array $paramArray Output of expandParameters(): Array with keys (GET var names) and for each an array of values
     * @param array $urls URLs accumulated in this array (for recursion)
     * @return array Result of the URL service's compileUrls() call
     * @deprecated
     * @codeCoverageIgnore
     */
    public function compileUrls(array $paramArray, array $urls): array
    {
        $maximumUrls = $this->getMaximumUrlsToCompile();

        return $this->urlService->compileUrls($paramArray, $urls, $maximumUrls);
    }
830
831
    /************************************
832
     *
833
     * Crawler log
834
     *
835
     ************************************/
836
837
    /**
     * Return array of records from crawler queue for input page ID
     *
     * The records are ordered by "scheduled" (newest first). If $doFlush is set, the
     * queue repository is asked to flush the queue for the given filter before the
     * records are selected; the selection result is returned in either case.
     *
     * NOTE(review): the flush call receives only the filter, not $id - confirm in
     * QueueRepository::flushQueue() whether the flush is meant to be limited to this page.
     *
     * @param int $id Page ID for which to look up log entries.
     * @param QueueFilter $queueFilter Filter compared against "pending" (exec_time = 0) and "finished" (exec_time > 0); any other value applies no exec_time condition
     * @param bool $doFlush If TRUE, the queue is flushed for the given filter before selecting
     * @param bool $doFullFlush Unused, kept for backwards compatibility of the signature
     * @param int $itemsPerPage Limit the amount of entries per page, default is 10; values <= 0 disable the limit
     * @return array
     *
     * @deprecated
     */
    public function getLogEntriesForPageId($id, QueueFilter $queueFilter, $doFlush = false, $doFullFlush = false, $itemsPerPage = 10)
    {
        $queryBuilder = GeneralUtility::makeInstance(ConnectionPool::class)->getQueryBuilderForTable($this->tableName);
        $queryBuilder
            ->select('*')
            ->from($this->tableName)
            ->where(
                $queryBuilder->expr()->eq('page_id', $queryBuilder->createNamedParameter($id, PDO::PARAM_INT))
            )
            ->orderBy('scheduled', 'DESC');

        // The filter object is compared loosely with the strings below
        // (presumably through QueueFilter::__toString() - TODO confirm).
        switch ($queueFilter) {
            case 'pending':
                $queryBuilder->andWhere($queryBuilder->expr()->eq('exec_time', 0));
                break;
            case 'finished':
                $queryBuilder->andWhere($queryBuilder->expr()->gt('exec_time', 0));
                break;
        }

        if ($doFlush) {
            $this->queueRepository->flushQueue($queueFilter);
        }
        if ($itemsPerPage > 0) {
            $queryBuilder
                ->setMaxResults((int) $itemsPerPage);
        }

        return $queryBuilder->execute()->fetchAll();
    }
885
886
    /**
     * Return array of records from crawler queue for input set ID
     *
     * Without $doFlush the matching records are selected, ordered by "scheduled"
     * (newest first). With $doFlush nothing is selected: the entries are flushed
     * through flushQueue() and an empty array is returned.
     *
     * @param int $set_id Set ID for which to look up log entries.
     * @param string $filter Filter: "all" => all entries, "pending" => all that is not yet run, "finished" => all complete ones
     * @param bool $doFlush If TRUE, then entries are DELETED(!) instead of selected!
     * @param bool $doFullFlush Only evaluated together with $doFlush: if TRUE, flushQueue() is called with an empty condition instead of the set/filter condition
     * @param int $itemsPerPage Limit the amount of entries per page, default is 10; values <= 0 disable the limit
     * @return array Selected records, or an empty array when the queue was flushed
     *
     * @deprecated
     */
    public function getLogEntriesForSetId(int $set_id, string $filter = '', bool $doFlush = false, bool $doFullFlush = false, int $itemsPerPage = 10)
    {
        $queryBuilder = GeneralUtility::makeInstance(ConnectionPool::class)->getQueryBuilderForTable($this->tableName);
        $queryBuilder
            ->select('*')
            ->from($this->tableName)
            ->where(
                $queryBuilder->expr()->eq('set_id', $queryBuilder->createNamedParameter($set_id, PDO::PARAM_INT))
            )
            ->orderBy('scheduled', 'DESC');

        $expressionBuilder = GeneralUtility::makeInstance(ConnectionPool::class)
            ->getConnectionForTable($this->tableName)
            ->getExpressionBuilder();

        // $query collects, AND-combined, the same conditions as the select above.
        // It is only consumed when the entries are flushed instead of selected.
        $query = $expressionBuilder->andX();
        switch ($filter) {
            case 'pending':
                $queryBuilder->andWhere($queryBuilder->expr()->eq('exec_time', 0));
                $query->add($expressionBuilder->eq('exec_time', 0));
                break;
            case 'finished':
                $queryBuilder->andWhere($queryBuilder->expr()->gt('exec_time', 0));
                $query->add($expressionBuilder->gt('exec_time', 0));
                break;
        }

        if ($doFlush) {
            $addWhere = $query->add($expressionBuilder->eq('set_id', $set_id));
            $this->flushQueue($doFullFlush ? '' : $addWhere);
            return [];
        }

        if ($itemsPerPage > 0) {
            $queryBuilder
                ->setMaxResults($itemsPerPage);
        }

        return $queryBuilder->execute()->fetchAll();
    }
938
939
    /**
     * Adds a call back entry to the queue table (called from hooks typically, see indexed search class "class.crawler.php")
     *
     * The call back reference is stored inside the JSON encoded parameters under the
     * key "_CALLBACKOBJ". An empty schedule time is replaced by the current time.
     *
     * @param int $setId Set ID
     * @param array $params Parameters to pass to call back function; any non-array value is replaced by an empty array
     * @param string $callBack Call back object reference, eg. 'EXT:indexed_search/class.crawler.php:&tx_indexedsearch_crawler'
     * @param int $page_id Page ID to attach it to
     * @param int $schedule Time at which to activate
     */
    public function addQueueEntry_callBack($setId, $params, $callBack, $page_id = 0, $schedule = 0): void
    {
        $callBackParameters = is_array($params) ? $params : [];
        $callBackParameters['_CALLBACKOBJ'] = $callBack;

        // Fall back to "now" when no (or a zero) schedule time was given.
        $scheduledAt = (int) $schedule;
        if (! $scheduledAt) {
            $scheduledAt = $this->getCurrentTime();
        }

        $queueRecord = [
            'page_id' => (int) $page_id,
            'parameters' => json_encode($callBackParameters),
            'scheduled' => $scheduledAt,
            'exec_time' => 0,
            'set_id' => (int) $setId,
            'result_data' => '',
        ];

        $queueConnection = GeneralUtility::makeInstance(ConnectionPool::class)
            ->getConnectionForTable(QueueRepository::TABLE_NAME);
        $queueConnection->insert(QueueRepository::TABLE_NAME, $queueRecord);
    }
968
969
    /************************************
970
     *
971
     * URL setting
972
     *
973
     ************************************/
974
975
    /**
     * Setting a URL for crawling:
     *
     * Builds the queue record for the URL and either registers it internally only
     * (when registerQueueEntriesInternallyOnly is set) or inserts it into the queue
     * table, unless the duplication check finds an equal entry. A signal is emitted
     * for both the "added" and the "duplicate" case.
     *
     * Keys read from $subCfg: "userGroups", "procInstrFilter", "procInstrParams." and "key".
     * All of them are optional; a missing key is treated like an empty value.
     *
     * @param int $id Page ID
     * @param string $url Complete URL
     * @param array $subCfg Sub configuration array (from TS config)
     * @param int $tstamp Scheduled-time
     * @param string $configurationHash (optional) configuration hash
     * @param bool $skipInnerDuplicationCheck (optional) skip inner duplication check
     * @return bool TRUE only if a new row was inserted into the queue table; FALSE for
     *              internally registered entries and for duplicates
     */
    public function addUrl(
        $id,
        $url,
        array $subCfg,
        $tstamp,
        $configurationHash = '',
        $skipInnerDuplicationCheck = false
    ) {
        $urlAdded = false;
        $rows = [];

        // Creating parameters:
        $parameters = [
            'url' => $url,
        ];

        // fe user group simulation:
        $uGs = implode(',', array_unique(GeneralUtility::intExplode(',', $subCfg['userGroups'] ?? '', true)));
        if ($uGs) {
            $parameters['feUserGroupList'] = $uGs;
        }

        // Setting processing instructions
        $parameters['procInstructions'] = GeneralUtility::trimExplode(',', $subCfg['procInstrFilter'] ?? '');
        if (is_array($subCfg['procInstrParams.'] ?? null)) {
            $parameters['procInstrParams'] = $subCfg['procInstrParams.'];
        }

        // Compile value array:
        $parameters_serialized = json_encode($parameters);
        $fieldArray = [
            'page_id' => (int) $id,
            'parameters' => $parameters_serialized,
            'parameters_hash' => GeneralUtility::shortMD5($parameters_serialized),
            'configuration_hash' => $configurationHash,
            'scheduled' => $tstamp,
            'exec_time' => 0,
            'set_id' => (int) $this->setID,
            'result_data' => '',
            'configuration' => $subCfg['key'] ?? null,
        ];

        if ($this->registerQueueEntriesInternallyOnly) {
            //the entries will only be registered and not stored to the database
            $this->queueEntries[] = $fieldArray;

            return $urlAdded;
        }

        if (! $skipInnerDuplicationCheck) {
            // check if there is already an equal entry
            $rows = $this->queueRepository->getDuplicateQueueItemsIfExists(
                (bool) ($this->extensionSettings['enableTimeslot'] ?? false),
                $tstamp,
                $this->getCurrentTime(),
                $fieldArray['page_id'],
                $fieldArray['parameters_hash']
            );
        }

        if (empty($rows)) {
            $connectionForCrawlerQueue = GeneralUtility::makeInstance(ConnectionPool::class)->getConnectionForTable(QueueRepository::TABLE_NAME);
            $connectionForCrawlerQueue->insert(
                QueueRepository::TABLE_NAME,
                $fieldArray
            );
            $uid = $connectionForCrawlerQueue->lastInsertId(QueueRepository::TABLE_NAME, 'qid');
            $urlAdded = true;

            $signalPayload = ['uid' => $uid, 'fieldArray' => $fieldArray];
            SignalSlotUtility::emitSignal(
                self::class,
                SignalSlotUtility::SIGNAL_URL_ADDED_TO_QUEUE,
                $signalPayload
            );
        } else {
            // An equal entry is already queued: only notify listeners.
            $signalPayload = ['rows' => $rows, 'fieldArray' => $fieldArray];
            SignalSlotUtility::emitSignal(
                self::class,
                SignalSlotUtility::SIGNAL_DUPLICATE_URL_IN_QUEUE,
                $signalPayload
            );
        }

        return $urlAdded;
    }
1071
1072
    /**
     * Returns the current system time as reported by PHP's time().
     *
     * @return int Current Unix timestamp
     */
    public function getCurrentTime()
    {
        $now = time();

        return $now;
    }
1081
1082
    /************************************
1083
     *
1084
     * URL reading
1085
     *
1086
     ************************************/
1087
1088
    /**
     * Read URL for single queue entry
     *
     * Steps, in this order:
     * - select the queue record (unless $force is set, only records with exec_time = 0 and process_scheduled > 0),
     * - emit the "queue item preprocess" signal,
     * - set exec_time (and the completing process id, if one is set) to lock the record,
     * - execute the queue item through the queue executor,
     * - evaluate the registered "pollSuccess" instructions against the converted result content,
     * - emit the "queue item postprocess" signal and store the JSON encoded result in result_data.
     *
     * A result without content (missing or null) is treated as failed execution: it is
     * still written to result_data, but no poll evaluation takes place.
     *
     * @param int $queueId
     * @param bool $force If set, will process even if exec_time has been set!
     *
     * @return int|null NULL if no matching queue record was found, otherwise a bit mask
     *                  (0, or CLI_STATUS_POLLABLE_PROCESSED if a pollable instruction reported success)
     */
    public function readUrl($queueId, $force = false)
    {
        $queryBuilder = GeneralUtility::makeInstance(ConnectionPool::class)->getQueryBuilderForTable(QueueRepository::TABLE_NAME);
        $ret = 0;
        $this->logger->debug('crawler-readurl start ' . microtime(true));

        $queryBuilder
            ->select('*')
            ->from(QueueRepository::TABLE_NAME)
            ->where(
                $queryBuilder->expr()->eq('qid', $queryBuilder->createNamedParameter($queueId, PDO::PARAM_INT))
            );
        if (! $force) {
            $queryBuilder
                ->andWhere('exec_time = 0')
                ->andWhere('process_scheduled > 0');
        }
        $queueRec = $queryBuilder->execute()->fetch();

        if (! is_array($queueRec)) {
            return null;
        }

        SignalSlotUtility::emitSignal(
            self::class,
            SignalSlotUtility::SIGNAL_QUEUEITEM_PREPROCESS,
            [$queueId, &$queueRec]
        );

        // Set exec_time to lock record:
        $field_array = ['exec_time' => $this->getCurrentTime()];

        if (isset($this->processID)) {
            //if multiprocessing is used we need to store the id of the process which has handled this entry
            $field_array['process_id_completed'] = $this->processID;
        }

        GeneralUtility::makeInstance(ConnectionPool::class)->getConnectionForTable(QueueRepository::TABLE_NAME)
            ->update(
                QueueRepository::TABLE_NAME,
                $field_array,
                ['qid' => (int) $queueId]
            );

        $result = $this->queueExecutor->executeQueueItem($queueRec, $this);
        if (isset($result['content'])) {
            /** @var JsonCompatibilityConverter $jsonCompatibilityConverter */
            $jsonCompatibilityConverter = GeneralUtility::makeInstance(JsonCompatibilityConverter::class);
            $resultData = $jsonCompatibilityConverter->convert($result['content']);

            //atm there's no need to point to specific pollable extensions
            $pollSuccess = $GLOBALS['TYPO3_CONF_VARS']['EXTCONF']['crawler']['pollSuccess'] ?? null;
            if (is_array($resultData) && is_array($pollSuccess)) {
                $procInstructions = $resultData['parameters']['procInstructions'] ?? null;
                foreach ($pollSuccess as $pollable) {
                    // only check the success value if the instruction is running
                    // it is important to name the pollSuccess key same as the procInstructions key
                    if (is_array($procInstructions)
                        && in_array($pollable, $procInstructions, true)
                        && ! empty($resultData['success'][$pollable])
                    ) {
                        $ret |= self::CLI_STATUS_POLLABLE_PROCESSED;
                    }
                }
            }
        }

        // Set result in log which also denotes the end of the processing of this entry.
        $field_array = ['result_data' => json_encode($result)];

        SignalSlotUtility::emitSignal(
            self::class,
            SignalSlotUtility::SIGNAL_QUEUEITEM_POSTPROCESS,
            [$queueId, &$field_array]
        );

        GeneralUtility::makeInstance(ConnectionPool::class)->getConnectionForTable(QueueRepository::TABLE_NAME)
            ->update(
                QueueRepository::TABLE_NAME,
                $field_array,
                ['qid' => (int) $queueId]
            );

        $this->logger->debug('crawler-readurl stop ' . microtime(true));
        return $ret;
    }
1185
1186
    /**
     * Inserts a not-yet-queued entry into the crawler queue, executes it immediately
     * and stores the execution result on the freshly inserted record.
     *
     * @param array $field_array Queue field array; inserted into the queue table together with an exec_time
     *
     * @return array|bool|mixed|string The value returned by the queue executor for this item
     */
    public function readUrlFromArray($field_array)
    {
        // Stamp the entry with an execution time before inserting it; this locks the record.
        $field_array['exec_time'] = $this->getCurrentTime();

        $queueConnection = GeneralUtility::makeInstance(ConnectionPool::class)
            ->getConnectionForTable(QueueRepository::TABLE_NAME);
        $queueConnection->insert(QueueRepository::TABLE_NAME, $field_array);

        // The generated qid is needed by the executor, the signal and the final update.
        $queueId = $queueConnection->lastInsertId(QueueRepository::TABLE_NAME, 'qid');
        $field_array['qid'] = $queueId;

        $result = $this->queueExecutor->executeQueueItem($field_array, $this);

        // Writing result_data also denotes the end of the processing of this entry.
        $resultFields = ['result_data' => json_encode($result)];

        // The fields are handed over by reference, so a slot can change what gets stored below.
        SignalSlotUtility::emitSignal(
            self::class,
            SignalSlotUtility::SIGNAL_QUEUEITEM_POSTPROCESS,
            [$queueId, &$resultFields]
        );

        $queueConnection->update(
            QueueRepository::TABLE_NAME,
            $resultFields,
            ['qid' => $queueId]
        );

        return $result;
    }
1222
1223
    /*****************************
1224
     *
1225
     * Compiling URLs to crawl - tools
1226
     *
1227
     *****************************/
1228
1229
    /**
     * Walks the page tree below a root page and compiles the table rows for every page in it.
     *
     * The crawl settings passed in are stored on the controller first, because
     * drawURLs_addRowsForPage() reads them from there while the rows are built.
     *
     * @param int $id Root page id to start from.
     * @param int $depth Depth of tree, 0 = only id-page, 1 = one sublevel, 99 = infinite
     * @param int $scheduledTime Unix time when the URL is timed to be visited when put in queue
     * @param int $reqMinute Number of requests per minute (creates the interleave between requests)
     * @param bool $submitCrawlUrls If set, submits the URLs to queue in database (real crawling)
     * @param bool $downloadCrawlUrls If set (and $submitCrawlUrls is false) will fill $downloadUrls with entries
     * @param array $incomingProcInstructions Array of processing instructions
     * @param array $configurationSelection Array of configuration keys
     * @return string Concatenated HTML rows for all traversed pages
     */
    public function getPageTreeAndUrls(
        $id,
        $depth,
        $scheduledTime,
        $reqMinute,
        $submitCrawlUrls,
        $downloadCrawlUrls,
        array $incomingProcInstructions,
        array $configurationSelection
    ) {
        // Remember the request settings and reset the per-run collectors.
        $this->scheduledTime = $scheduledTime;
        $this->reqMinute = $reqMinute;
        $this->submitCrawlUrls = $submitCrawlUrls;
        $this->downloadCrawlUrls = $downloadCrawlUrls;
        $this->incomingProcInstructions = $incomingProcInstructions;
        $this->incomingConfigurationSelection = $configurationSelection;
        $this->duplicateTrack = [];
        $this->downloadUrls = [];

        // Build the page tree, restricted to pages the backend user may see.
        /* @var PageTreeView $pageTree */
        $pageTree = GeneralUtility::makeInstance(PageTreeView::class);
        $permsClause = $this->getBackendUser()->getPagePermsClause(Permission::PAGE_SHOW);
        $pageTree->init('AND ' . $permsClause);

        // The root page itself becomes the first tree entry, if it is accessible.
        $rootPage = BackendUtility::readPageAccess($id, $permsClause);
        if (is_array($rootPage)) {
            $pageTree->tree[] = [
                'row' => $rootPage,
                'HTML' => $this->iconFactory->getIconForRecord('pages', $rootPage, Icon::SIZE_SMALL),
            ];
        }

        // A depth of 0 means: only the root page, no branch beneath it.
        if ($depth) {
            $pageTree->getTree($id, $depth, '');
        }

        $rows = '';
        foreach ($pageTree->tree as $treeItem) {
            $this->MP = false;

            // Mount points additionally contribute the rows of the pages they mount.
            if ($treeItem['row']['doktype'] === PageRepository::DOKTYPE_MOUNTPOINT) {
                // NOTE(review): the getPage() result is indexed with [0] below, i.e. a list of
                // rows is expected — confirm this matches the page repository's return shape.
                $mountpage = $this->pageRepository->getPage($treeItem['row']['uid']);

                // MP value: "<mounted page id>-<mount point page id>".
                $this->MP = $mountpage[0]['mount_pid'] . '-' . $treeItem['row']['uid'];

                $mountTree = GeneralUtility::makeInstance(PageTreeView::class);
                $mountTree->init('AND ' . $permsClause);
                $mountTree->getTree($mountpage[0]['mount_pid'], $depth);

                foreach ($mountTree->tree as $mountedItem) {
                    $mountedTitle = $mountedItem['HTML']
                        . BackendUtility::getRecordTitle('pages', $mountedItem['row'], true);
                    $rows .= $this->drawURLs_addRowsForPage($mountedItem['row'], $mountedTitle);
                }

                if ($mountpage[0]['mount_pid_ol']) {
                    // With mount_pid_ol enabled the mounted page replaces the mount point page.
                    $treeItem['row']['uid'] = $mountpage[0]['mount_pid'];
                } else {
                    // Without mount_pid_ol the MP must not be used for the mount point page itself.
                    $this->MP = false;
                }
            }

            $itemTitle = $treeItem['HTML'] . BackendUtility::getRecordTitle('pages', $treeItem['row'], true);
            $rows .= $this->drawURLs_addRowsForPage($treeItem['row'], $itemTitle);
        }

        return $rows;
    }
1321
1322
    /**
     * Expands an exclude string by delegating to the configuration service.
     *
     * @param string $excludeString Exclude string
     * @return array Result of ConfigurationService::expandExcludeString()
     * @deprecated
     */
    public function expandExcludeString($excludeString)
    {
        $expanded = $this->configurationService->expandExcludeString($excludeString);

        return $expanded;
    }
1333
1334
    /**
     * Create the rows for display of the page tree.
     *
     * For each page one row per applicable crawler configuration is rendered, showing the
     * GET variable configuration, the expanded parameters, the URL list and the options.
     * A configuration that excludes the page, or a page without any configuration,
     * yields a "No entries" row instead.
     *
     * @param array $pageRow Page record; its uid is matched against each configuration's exclude list
     * @param string $pageTitle HTML for the title cell; inserted as-is, so it must already be safe to output
     * @return string The compiled HTML table rows
     */
    public function drawURLs_addRowsForPage(array $pageRow, string $pageTitle): string
    {
        $skipMessage = '';

        // Get list of configurations
        $configurations = $this->getUrlsForPageRow($pageRow, $skipMessage);
        $configurations = ConfigurationService::removeDisallowedConfigurations($this->incomingConfigurationSelection, $configurations);

        // Traverse parameter combinations:
        $rowIndex = 0;
        $content = '';
        if (! empty($configurations)) {
            foreach ($configurations as $confKey => $confArray) {
                // Numeric configuration names arrive as int array keys; htmlspecialchars()
                // requires a string under strict_types.
                $confKeyHtml = htmlspecialchars((string) $confKey);

                // The sub configuration keys used below are optional, so none is read unguarded.
                $subCfg = $confArray['subCfg'] ?? [];

                // Title column: only the first row carries the (row-spanning) page title.
                if ($rowIndex === 0) {
                    $titleClm = '<td rowspan="' . count($configurations) . '">' . $pageTitle . '</td>';
                } else {
                    $titleClm = '';
                }

                $excludedPages = $this->configurationService->expandExcludeString((string) ($subCfg['exclude'] ?? ''));
                if (! in_array($pageRow['uid'], $excludedPages, true)) {
                    // URL list:
                    $urlList = $this->urlListFromUrlArray(
                        $confArray,
                        $pageRow,
                        $this->scheduledTime,
                        $this->reqMinute,
                        $this->submitCrawlUrls,
                        $this->downloadCrawlUrls,
                        $this->duplicateTrack,
                        $this->downloadUrls,
                        // if empty the urls won't be filtered by processing instructions
                        $this->incomingProcInstructions
                    );

                    // Expanded parameters:
                    $paramExpanded = '';
                    $calcAccu = [];
                    $calcRes = 1;
                    foreach ($confArray['paramExpanded'] as $gVar => $gVal) {
                        $paramExpanded .= '
                            <tr>
                                <td>' . htmlspecialchars('&' . $gVar . '=') . '<br/>' .
                            '(' . count($gVal) . ')' .
                            '</td>
                                <td nowrap="nowrap">' . nl2br(htmlspecialchars(implode(chr(10), $gVal))) . '</td>
                            </tr>
                        ';
                        // Number of URL combinations = product of the value counts of all parameters.
                        $calcRes *= count($gVal);
                        $calcAccu[] = count($gVal);
                    }
                    $paramExpanded = '<table>' . $paramExpanded . '</table>';
                    $paramExpanded .= 'Comb: ' . implode('*', $calcAccu) . '=' . $calcRes;

                    // Options: configuration-provided text is escaped like the other cells.
                    $optionValues = '';
                    if (! empty($subCfg['userGroups'])) {
                        $optionValues .= 'User Groups: ' . htmlspecialchars((string) $subCfg['userGroups']) . '<br/>';
                    }
                    if (! empty($subCfg['procInstrFilter'])) {
                        $optionValues .= 'ProcInstr: ' . htmlspecialchars((string) $subCfg['procInstrFilter']) . '<br/>';
                    }

                    // Compile row:
                    $content .= '
                        <tr>
                            ' . $titleClm . '
                            <td>' . $confKeyHtml . '</td>
                            <td>' . nl2br(htmlspecialchars(rawurldecode(trim(str_replace('&', chr(10) . '&', GeneralUtility::implodeArrayForUrl('', $confArray['paramParsed'])))))) . '</td>
                            <td>' . $paramExpanded . '</td>
                            <td nowrap="nowrap">' . $urlList . '</td>
                            <td nowrap="nowrap">' . $optionValues . '</td>
                            <td nowrap="nowrap">' . DebugUtility::viewArray($subCfg['procInstrParams.'] ?? null) . '</td>
                        </tr>';
                } else {
                    $content .= '<tr>
                            ' . $titleClm . '
                            <td>' . $confKeyHtml . '</td>
                            <td colspan="5"><em>No entries</em> (Page is excluded in this configuration)</td>
                        </tr>';
                }

                $rowIndex++;
            }
        } else {
            // No configuration applies; the skip message (if any) is set by getUrlsForPageRow().
            $message = ! empty($skipMessage) ? ' (' . $skipMessage . ')' : '';

            // Compile row:
            $content .= '
                <tr>
                    <td>' . $pageTitle . '</td>
                    <td colspan="6"><em>No entries</em>' . $message . '</td>
                </tr>';
        }

        return $content;
    }
1437
1438
    /*****************************
1439
     *
1440
     * CLI functions
1441
     *
1442
     *****************************/
1443
1444
    /**
     * Running the functionality of the CLI (crawling URLs from queue).
     *
     * Runs the registered cli hooks, cleans up the queue, assigns up to $countInARun queue
     * entries to the current process id and reads them one by one.
     *
     * @param int $countInARun Maximum number of queue entries to fetch for this run
     * @param int $sleepTime Pause after each processed entry, passed to usleep()
     * @param int $sleepAfterFinish Pause after the batch, passed to sleep()
     * @return int Bit mask built from the readUrl() results and the CLI_STATUS_* constants
     * @deprecated
     * @codeCoverageIgnore
     */
    public function CLI_run(int $countInARun, int $sleepTime, int $sleepAfterFinish): int
    {
        $result = 0;
        $counter = 0;

        // First, run hooks:
        $hookClassNames = $GLOBALS['TYPO3_CONF_VARS']['EXTCONF']['crawler']['cli_hooks'] ?? [];
        foreach ($hookClassNames as $hookClassName) {
            trigger_error(
                'This hook (crawler/cli_hooks) is deprecated since 9.1.5 and will be removed when dropping support for TYPO3 9LTS and 10LTS',
                E_USER_DEPRECATED
            );
            $hook = GeneralUtility::makeInstance($hookClassName);
            if (is_object($hook)) {
                $hook->crawler_init($this);
            }
        }

        // Clean up the queue, then select the entries for this run.
        $this->queueRepository->cleanupQueue();
        $rows = $this->queueRepository->fetchRecordsToBeCrawled($countInARun);

        // Nothing to crawl: nothing was processed, so no status bit is set.
        if (empty($rows)) {
            return $result;
        }

        $queueIds = [];
        foreach ($rows as $row) {
            $queueIds[] = $row['qid'];
        }

        $processId = $this->CLI_buildProcessId();

        // Save the number of assigned queue entries to determine how many have been processed later.
        $numberOfAffectedRows = $this->queueRepository->updateProcessIdAndSchedulerForQueueIds($queueIds, $processId);
        $this->processRepository->updateProcessAssignItemsCount($numberOfAffectedRows, $processId);

        // Abort if not every selected entry could be assigned to this process.
        if ($numberOfAffectedRows !== count($queueIds)) {
            return $result | self::CLI_STATUS_ABORTED;
        }

        foreach ($rows as $row) {
            $result |= $this->readUrl($row['qid']);
            $counter++;

            // Just to relax the system
            usleep($sleepTime);

            // If the crawler has been disabled since the run started, return right away;
            // the process is NOT marked as ended.
            if ($this->crawler->isDisabled()) {
                return $result | self::CLI_STATUS_ABORTED;
            }

            // The process is no longer active: possible conflict or timeout, stop reading.
            if (! $this->processRepository->isProcessActive($this->CLI_buildProcessId())) {
                $this->CLI_debug('conflict / timeout (' . $this->CLI_buildProcessId() . ')');
                $result |= self::CLI_STATUS_ABORTED;
                break;
            }
        }

        sleep($sleepAfterFinish);

        if ($counter > 0) {
            $result |= self::CLI_STATUS_PROCESSED;
        }

        return $result;
    }
1519
1520
    /**
     * Activate hooks: instantiates every class registered under the crawler "cli_hooks"
     * configuration and calls its crawler_init() with this controller.
     *
     * @deprecated
     * @codeCoverageIgnore
     */
    public function CLI_runHooks(): void
    {
        $hookClassNames = $GLOBALS['TYPO3_CONF_VARS']['EXTCONF']['crawler']['cli_hooks'] ?? [];

        foreach ($hookClassNames as $hookClassName) {
            $hook = GeneralUtility::makeInstance($hookClassName);
            if (! is_object($hook)) {
                continue;
            }

            $hook->crawler_init($this);
        }
    }
1534
1535
    /**
     * Try to acquire a new process with the given id.
     *
     * Also performs some auto-cleanup: active processes whose ttl lies before the current
     * time are treated as orphans, marked as not active and detached from their queue entries.
     *
     * @param string $id identification string for the process
     * @return bool true if the process was added, false if the process limit is reached
     *              or the system process id could not be determined
     * @todo preemption might not be the most elegant way to clean up
     * @deprecated
     * @codeCoverageIgnore
     */
    public function CLI_checkAndAcquireNewProcess($id)
    {
        $systemProcessId = getmypid();
        if (! $systemProcessId) {
            return false;
        }

        $activeProcesses = $this->processRepository->findAllActive();
        $currentTime = $this->getCurrentTime();

        // Split the active processes into live ones (counted) and orphans (collected).
        $liveProcessCount = 0;
        $orphanProcessIds = [];

        /** @var Process $process */
        foreach ($activeProcesses as $process) {
            if ($process->getTtl() >= $currentTime) {
                $liveProcessCount++;
                continue;
            }

            $orphanProcessIds[] = $process->getProcessId();
        }

        // If there are less than allowed active processes then add a new one.
        $acquired = $liveProcessCount < (int) $this->extensionSettings['processLimit'];
        if ($acquired) {
            $this->processRepository->addProcess($id, $systemProcessId);
        }

        // The cleanup runs regardless of whether a process was acquired.
        $this->processRepository->deleteProcessesMarkedAsDeleted();
        $this->processRepository->markRequestedProcessesAsNotActive($orphanProcessIds);
        $this->queueRepository->unsetProcessScheduledAndProcessIdForQueueEntries($orphanProcessIds);

        return $acquired;
    }
1581
1582
    /**
     * Release a process and the required resources.
     *
     * @param mixed $releaseIds string with a single process-id or array with multiple process-ids
     * @return bool false if there was nothing to release, true otherwise
     * @deprecated
     * @codeCoverageIgnore
     */
    public function CLI_releaseProcesses($releaseIds)
    {
        $queryBuilder = GeneralUtility::makeInstance(ConnectionPool::class)->getQueryBuilderForTable($this->tableName);

        // Accept a single id as well as a list of ids.
        $processIds = is_array($releaseIds) ? $releaseIds : [$releaseIds];

        if ($processIds === []) {
            //nothing to release
            return false;
        }

        // some kind of 2nd chance algo - this way you need at least 2 processes to have a real cleanup
        // this ensures that a single process can't mess up the entire process table

        // mark all processes as deleted which have no "waiting" queue-entries and which are not active

        // ReleaseQueueEntries: detach queue entries that belong to processes which are not active.
        $queryBuilder
            ->update(QueueRepository::TABLE_NAME, 'q')
            ->set('q.process_scheduled', 0)
            ->set('q.process_id', '')
            ->where(
                'q.process_id IN(SELECT p.process_id FROM tx_crawler_process as p WHERE p.active = 0)'
            )
            ->execute();

        // The same builder is reused below, so the SET part of the first statement is dropped here.
        // FIXME: Not entirely sure that this is equivalent to the previous version
        $queryBuilder->resetQueryPart('set');

        // ReleaseProcessEntries: clear the system process id of inactive processes
        // that are referenced by queue entries with exec_time = 0.
        $inactiveOnly = $queryBuilder->expr()->eq('active', 0);
        $queryBuilder
            ->update(ProcessRepository::TABLE_NAME)
            ->where(
                $inactiveOnly,
                'process_id IN(SELECT q.process_id FROM tx_crawler_queue as q WHERE q.exec_time = 0)'
            )
            ->set('system_process_id', 0)
            ->execute();

        $this->processRepository->markRequestedProcessesAsNotActive($processIds);
        $this->queueRepository->unsetProcessScheduledAndProcessIdForQueueEntries($processIds);

        return true;
    }
1636
1637
    /**
     * Returns the id of the current process, creating it on first use.
     *
     * The id is a short MD5 hash of the current microtime and is kept in $this->processID,
     * so later calls return the same value.
     *
     * @return string the ID
     * @deprecated
     * @codeCoverageIgnore
     */
    public function CLI_buildProcessId()
    {
        if ($this->processID) {
            return $this->processID;
        }

        $this->processID = GeneralUtility::shortMD5(microtime(true));

        return $this->processID;
    }
1651
1652
    /**
     * Prints a message to the stdout (only if the "processDebug" extension setting is enabled).
     *
     * @param string $msg the message
     * @deprecated
     * @codeCoverageIgnore
     */
    public function CLI_debug($msg): void
    {
        $debugEnabled = (bool) (int) $this->extensionSettings['processDebug'];
        if (! $debugEnabled) {
            return;
        }

        echo $msg . "\n";
        // Push the output out immediately instead of leaving it in the buffer.
        flush();
    }
1666
1667
    /**
     * Cleans up entries that stayed for too long in the queue. These are:
     * - processed entries (exec_time set) older than the "cleanUpProcessedAge" setting
     * - entries scheduled at or before now minus the "cleanUpScheduledAge" setting
     *
     * Both settings are multiplied by the number of seconds per day, i.e. they are given in days.
     *
     * @deprecated
     */
    public function cleanUpOldQueueEntries(): void
    {
        // 24*60*60 Seconds in 24 hours
        $secondsPerDay = 86400;

        $processedAgeInSeconds = $this->extensionSettings['cleanUpProcessedAge'] * $secondsPerDay;
        $scheduledAgeInSeconds = $this->extensionSettings['cleanUpScheduledAge'] * $secondsPerDay;

        $now = time();
        $processedBefore = $now - $processedAgeInSeconds;
        $scheduledBefore = $now - $scheduledAgeInSeconds;

        $this->flushQueue(
            '(exec_time<>0 AND exec_time<' . $processedBefore . ') OR scheduled<=' . $scheduledBefore
        );
    }
1684
1685
    /**
1686
     * Removes queue entries
1687
     *
1688
     * @param string $where SQL related filter for the entries which should be removed
1689
     *
1690
     * @deprecated
1691
     */
1692 5
    protected function flushQueue($where = ''): void
1693
    {
1694 5
        $realWhere = strlen((string) $where) > 0 ? $where : '1=1';
1695
1696 5
        $queryBuilder = $this->getQueryBuilder($this->tableName);
0 ignored issues
show
Deprecated Code introduced by
The property AOE\Crawler\Controller\C...rController::$tableName has been deprecated: Since v9.2.5 - This will be remove in v10 ( Ignorable by Annotation )

If this is a false-positive, you can also ignore this issue in your code via the ignore-deprecated  annotation

1696
        $queryBuilder = $this->getQueryBuilder(/** @scrutinizer ignore-deprecated */ $this->tableName);

This property has been deprecated. The supplier of the class has supplied an explanatory message.

The explanatory message should give you some clue as to whether and when the property will be removed from the class and what other property to use instead.

Loading history...
1697
1698
        $groups = $queryBuilder
1699 5
            ->selectLiteral('DISTINCT set_id')
1700 5
            ->from($this->tableName)
0 ignored issues
show
Deprecated Code introduced by
The property AOE\Crawler\Controller\C...rController::$tableName has been deprecated: Since v9.2.5 - This will be remove in v10 ( Ignorable by Annotation )

If this is a false-positive, you can also ignore this issue in your code via the ignore-deprecated  annotation

1700
            ->from(/** @scrutinizer ignore-deprecated */ $this->tableName)

This property has been deprecated. The supplier of the class has supplied an explanatory message.

The explanatory message should give you some clue as to whether and when the property will be removed from the class and what other property to use instead.

Loading history...
1701 5
            ->where($realWhere)
1702 5
            ->execute()
1703 5
            ->fetchAll();
1704 5
        if (is_array($groups)) {
0 ignored issues
show
introduced by
The condition is_array($groups) is always true.
Loading history...
1705 5
            foreach ($groups as $group) {
1706
                $subSet = $queryBuilder
1707 4
                    ->select('qid', 'set_id')
1708 4
                    ->from($this->tableName)
0 ignored issues
show
Deprecated Code introduced by
The property AOE\Crawler\Controller\C...rController::$tableName has been deprecated: Since v9.2.5 - This will be remove in v10 ( Ignorable by Annotation )

If this is a false-positive, you can also ignore this issue in your code via the ignore-deprecated  annotation

1708
                    ->from(/** @scrutinizer ignore-deprecated */ $this->tableName)

This property has been deprecated. The supplier of the class has supplied an explanatory message.

The explanatory message should give you some clue as to whether and when the property will be removed from the class and what other property to use instead.

Loading history...
1709 4
                    ->where(
1710 4
                        $realWhere,
1711 4
                        $queryBuilder->expr()->eq('set_id', $group['set_id'])
1712
                    )
1713 4
                    ->execute()
1714 4
                    ->fetchAll();
1715
1716 4
                $payLoad = ['subSet' => $subSet];
1717 4
                SignalSlotUtility::emitSignal(
0 ignored issues
show
Deprecated Code introduced by
The function AOE\Crawler\Utility\Sign...otUtility::emitSignal() has been deprecated. ( Ignorable by Annotation )

If this is a false-positive, you can also ignore this issue in your code via the ignore-deprecated  annotation

1717
                /** @scrutinizer ignore-deprecated */ SignalSlotUtility::emitSignal(
Loading history...
1718 4
                    self::class,
1719 4
                    SignalSlotUtility::SIGNAL_QUEUE_ENTRY_FLUSH,
1720
                    $payLoad
1721
                );
1722
            }
1723
        }
1724
1725
        $queryBuilder
1726 5
            ->delete($this->tableName)
0 ignored issues
show
Deprecated Code introduced by
The property AOE\Crawler\Controller\C...rController::$tableName has been deprecated: Since v9.2.5 - This will be removed in v10 ( Ignorable by Annotation )

If this is a false-positive, you can also ignore this issue in your code via the ignore-deprecated  annotation

1726
            ->delete(/** @scrutinizer ignore-deprecated */ $this->tableName)

This property has been deprecated. The supplier of the class has supplied an explanatory message.

The explanatory message should give you some clue as to whether and when the property will be removed from the class and what other property to use instead.

Loading history...
1727 5
            ->where($realWhere)
1728 5
            ->execute();
1729 5
    }
1730
1731
    /**
     * Finds queue entries that duplicate the entry described by $fieldArray.
     *
     * An entry counts as a duplicate when it has the same page_id and parameters_hash, has no
     * exec_time and no process_id. How its schedule time is matched depends on $tstamp:
     * - $tstamp is now or in the past: every entry scheduled up to the current time matches; with the
     *   "enableTimeslot" extension setting, entries scheduled within 100 seconds around now match as well.
     * - $tstamp is in the future: only entries scheduled for exactly $tstamp match.
     *
     * @param int $tstamp Schedule time of the entry that is about to be queued
     * @param array $fieldArray Queue record; the keys "page_id" and "parameters_hash" are read
     *
     * @return array List of qid values of the duplicate queue entries
     * @deprecated
     */
    protected function getDuplicateRowsIfExist($tstamp, $fieldArray)
    {
        $currentTime = $this->getCurrentTime();

        // Use the same table constant for the connection lookup and the FROM clause
        // instead of the deprecated $this->tableName property.
        $queryBuilder = $this->getQueryBuilder(QueueRepository::TABLE_NAME);
        $queryBuilder
            ->select('qid')
            ->from(QueueRepository::TABLE_NAME);

        // Every value is bound as an integer parameter; $tstamp is untyped and must never be
        // concatenated into the SQL string as-is.
        if ($tstamp <= $currentTime) {
            // The entry is scheduled with "now"
            $scheduledUpToNow = $queryBuilder->expr()->lte(
                'scheduled',
                $queryBuilder->createNamedParameter($currentTime, PDO::PARAM_INT)
            );

            // !empty() treats a missing setting like a disabled one without raising a notice
            if (!empty($this->extensionSettings['enableTimeslot'])) {
                $timeBegin = $queryBuilder->createNamedParameter($currentTime - 100, PDO::PARAM_INT);
                $timeEnd = $queryBuilder->createNamedParameter($currentTime + 100, PDO::PARAM_INT);
                $queryBuilder
                    ->where('scheduled BETWEEN ' . $timeBegin . ' AND ' . $timeEnd)
                    ->orWhere($scheduledUpToNow);
            } else {
                $queryBuilder->where($scheduledUpToNow);
            }
        } elseif ($tstamp > $currentTime) {
            // Entries with a timestamp in the future need to have the same schedule time
            $queryBuilder->where(
                $queryBuilder->expr()->eq(
                    'scheduled',
                    $queryBuilder->createNamedParameter($tstamp, PDO::PARAM_INT)
                )
            );
        }

        $queryBuilder
            ->andWhere('NOT exec_time')
            ->andWhere('NOT process_id')
            ->andWhere($queryBuilder->expr()->eq('page_id', $queryBuilder->createNamedParameter($fieldArray['page_id'], PDO::PARAM_INT)))
            ->andWhere($queryBuilder->expr()->eq('parameters_hash', $queryBuilder->createNamedParameter($fieldArray['parameters_hash'], PDO::PARAM_STR)));

        // Reduce the result rows to the plain list of queue ids
        return array_column($queryBuilder->execute()->fetchAll(), 'qid');
    }
1792
1793
    /**
     * Computes an md5 fingerprint of a configuration array.
     *
     * The "paramExpanded" and "URLs" entries are dropped first; the remaining array is
     * serialized and hashed.
     *
     * @return string
     */
    protected function getConfigurationHash(array $configuration)
    {
        $hashRelevant = array_diff_key($configuration, ['paramExpanded' => true, 'URLs' => true]);

        return md5(serialize($hashRelevant));
    }
1804
1805
    /**
     * Builds the URL for a page and query string by delegating to a newly created UrlService.
     * If the page has a Site configuration, the URL can be built by using the Site instance.
     *
     * @param int $httpsOrHttp see tx_crawler_configuration.force_ssl
     * @throws SiteNotFoundException
     * @throws InvalidRouteArgumentsException
     *
     * @deprecated Using CrawlerController::getUrlFromPageAndQueryParameters() is deprecated since 9.1.1 and will be removed in v11.x, please use UrlService->getUrlFromPageAndQueryParameters() instead.
     * @codeCoverageIgnore
     */
    protected function getUrlFromPageAndQueryParameters(int $pageId, string $queryString, ?string $alternativeBaseUrl, int $httpsOrHttp): UriInterface
    {
        return (new UrlService())->getUrlFromPageAndQueryParameters(
            $pageId,
            $queryString,
            $alternativeBaseUrl,
            $httpsOrHttp
        );
    }
1821
1822
    /**
     * Orders the values at index 1 and 2 of the given array ascending.
     *
     * When the value at index 1 is larger than the value at index 2 the two are exchanged;
     * every other element is returned untouched.
     *
     * @deprecated
     */
    protected function swapIfFirstIsLargerThanSecond(array $reg): array
    {
        if ($reg[1] > $reg[2]) {
            // Exchange both values in one step; the right-hand side is evaluated before assignment
            [$reg[1], $reg[2]] = [$reg[2], $reg[1]];
        }

        return $reg;
    }
1836
1837 7
    /**
     * Creates and returns a new PageService instance on every call.
     */
    protected function getPageService(): PageService
1838
    {
1839 7
        return new PageService();
1840
    }
1841
1842
    /**
     * Returns the current value of the maximumUrlsToCompile property.
     */
    private function getMaximumUrlsToCompile(): int
1843
    {
1844
        return $this->maximumUrlsToCompile;
1845
    }
1846
1847
    /**
     * Returns the backend user, taking it from $GLOBALS['BE_USER'] on first access and
     * keeping it in the backendUser property afterwards.
     *
     * @return BackendUserAuthentication
     */
    private function getBackendUser()
    {
        // Make sure the _cli_ user is loaded
        Bootstrap::initializeBackendAuthentication();

        if ($this->backendUser !== null) {
            return $this->backendUser;
        }

        $this->backendUser = $GLOBALS['BE_USER'];

        return $this->backendUser;
    }
1859
1860
    /**
     * Obtains a query builder for the given table from the ConnectionPool.
     *
     * @return QueryBuilder
     */
    private function getQueryBuilder(string $table)
    {
        $connectionPool = GeneralUtility::makeInstance(ConnectionPool::class);

        return $connectionPool->getQueryBuilderForTable($table);
    }
1869
}
1870