CrawlerController::setDisabled()   A
last analyzed

Complexity

Conditions 3
Paths 3

Size

Total Lines 6
Code Lines 4

Duplication

Lines 0
Ratio 0 %

Code Coverage

Tests 5
CRAP Score 3

Importance

Changes 0
Metric Value
cc 3
eloc 4
nc 3
nop 1
dl 0
loc 6
ccs 5
cts 5
cp 1
crap 3
rs 10
c 0
b 0
f 0
1
<?php
2
3
declare(strict_types=1);
4
5
namespace AOE\Crawler\Controller;
6
7
/***************************************************************
8
 *  Copyright notice
9
 *
10
 *  (c) 2020 AOE GmbH <[email protected]>
11
 *
12
 *  All rights reserved
13
 *
14
 *  This script is part of the TYPO3 project. The TYPO3 project is
15
 *  free software; you can redistribute it and/or modify
16
 *  it under the terms of the GNU General Public License as published by
17
 *  the Free Software Foundation; either version 3 of the License, or
18
 *  (at your option) any later version.
19
 *
20
 *  The GNU General Public License can be found at
21
 *  http://www.gnu.org/copyleft/gpl.html.
22
 *
23
 *  This script is distributed in the hope that it will be useful,
24
 *  but WITHOUT ANY WARRANTY; without even the implied warranty of
25
 *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
26
 *  GNU General Public License for more details.
27
 *
28
 *  This copyright notice MUST APPEAR in all copies of the script!
29
 ***************************************************************/
30
31
use AOE\Crawler\Configuration\ExtensionConfigurationProvider;
32
use AOE\Crawler\Converter\JsonCompatibilityConverter;
33
use AOE\Crawler\Crawler;
34
use AOE\Crawler\CrawlStrategy\CrawlStrategyFactory;
35
use AOE\Crawler\Domain\Model\Process;
36
use AOE\Crawler\Domain\Repository\ConfigurationRepository;
37
use AOE\Crawler\Domain\Repository\ProcessRepository;
38
use AOE\Crawler\Domain\Repository\QueueRepository;
39
use AOE\Crawler\QueueExecutor;
40
use AOE\Crawler\Service\ConfigurationService;
41
use AOE\Crawler\Service\PageService;
42
use AOE\Crawler\Service\UrlService;
43
use AOE\Crawler\Utility\SignalSlotUtility;
44
use AOE\Crawler\Value\QueueFilter;
45
use PDO;
46
use Psr\Http\Message\UriInterface;
47
use Psr\Log\LoggerAwareInterface;
48
use Psr\Log\LoggerAwareTrait;
49
use TYPO3\CMS\Backend\Tree\View\PageTreeView;
50
use TYPO3\CMS\Backend\Utility\BackendUtility;
51
use TYPO3\CMS\Core\Authentication\BackendUserAuthentication;
52
use TYPO3\CMS\Core\Compatibility\PublicMethodDeprecationTrait;
53
use TYPO3\CMS\Core\Compatibility\PublicPropertyDeprecationTrait;
54
use TYPO3\CMS\Core\Core\Bootstrap;
55
use TYPO3\CMS\Core\Core\Environment;
56
use TYPO3\CMS\Core\Database\Connection;
57
use TYPO3\CMS\Core\Database\ConnectionPool;
58
use TYPO3\CMS\Core\Database\Query\QueryBuilder;
59
use TYPO3\CMS\Core\Database\Query\Restriction\DeletedRestriction;
60
use TYPO3\CMS\Core\Database\QueryGenerator;
61
use TYPO3\CMS\Core\Domain\Repository\PageRepository;
62
use TYPO3\CMS\Core\Exception\SiteNotFoundException;
63
use TYPO3\CMS\Core\Imaging\Icon;
64
use TYPO3\CMS\Core\Imaging\IconFactory;
65
use TYPO3\CMS\Core\Routing\InvalidRouteArgumentsException;
66
use TYPO3\CMS\Core\Site\Entity\Site;
67
use TYPO3\CMS\Core\Type\Bitmask\Permission;
68
use TYPO3\CMS\Core\Utility\DebugUtility;
69
use TYPO3\CMS\Core\Utility\GeneralUtility;
70
use TYPO3\CMS\Core\Utility\MathUtility;
71
use TYPO3\CMS\Extbase\Object\ObjectManager;
72
73
/**
74
 * Class CrawlerController
75
 *
76
 * @package AOE\Crawler\Controller
77
 */
78
class CrawlerController implements LoggerAwareInterface
79
{
80
    use LoggerAwareTrait;
81
    use PublicMethodDeprecationTrait;
82
    use PublicPropertyDeprecationTrait;
83
84
    /**
85
     * @deprecated since 9.2.5 will be removed in v11.x
86
     */
87
    public const CLI_STATUS_NOTHING_PROCCESSED = 0;
88
89
    /**
90
     * queue not empty
91
     * @deprecated since 9.2.5 will be removed in v11.x
92
     */
93
    public const CLI_STATUS_REMAIN = 1;
94
95
    /**
96
     * (some) queue items were processed
97
     * @deprecated since 9.2.5 will be removed in v11.x
98
     */
99
    public const CLI_STATUS_PROCESSED = 2;
100
101
    /**
102
     * instance didn't finish
103
     * @deprecated since 9.2.5 will be removed in v11.x
104
     */
105
    public const CLI_STATUS_ABORTED = 4;
106
107
    /**
108
     * @deprecated since 9.2.5 will be removed in v11.x
109
     */
110
    public const CLI_STATUS_POLLABLE_PROCESSED = 8;
111
112
    /**
113
     * @var integer
114
     */
115
    public $setID = 0;
116
117
    /**
118
     * @var string
119
     */
120
    public $processID = '';
121
122
    /**
123
     * @var array
124
     */
125
    public $duplicateTrack = [];
126
127
    /**
128
     * @var array
129
     */
130
    public $downloadUrls = [];
131
132
    /**
133
     * @var array
134
     */
135
    public $incomingProcInstructions = [];
136
137
    /**
138
     * @var array
139
     */
140
    public $incomingConfigurationSelection = [];
141
142
    /**
143
     * @var bool
144
     */
145
    public $registerQueueEntriesInternallyOnly = false;
146
147
    /**
148
     * @var array
149
     */
150
    public $queueEntries = [];
151
152
    /**
153
     * @var array
154
     */
155
    public $urlList = [];
156
157
    /**
158
     * @var array
159
     */
160
    public $extensionSettings = [];
161
162
    /**
163
     * Mount Point
164
     *
165
     * @var bool
166
     * Todo: Check what this is used for and adjust the type hint or code, as bool doesn't match the current code.
167
     */
168
    public $MP = false;
169
170
    /**
171
     * @var string
172
     * @deprecated
173
     */
174
    protected $processFilename;
175
176
    /**
177
     * Holds the internal access mode can be 'gui','cli' or 'cli_im'
178
     *
179
     * @var string
180
     * @deprecated
181
     */
182
    protected $accessMode;
183
184
    /**
185
     * @var QueueRepository
186
     */
187
    protected $queueRepository;
188
189
    /**
190
     * @var ProcessRepository
191
     */
192
    protected $processRepository;
193
194
    /**
195
     * @var ConfigurationRepository
196
     */
197
    protected $configurationRepository;
198
199
    /**
200
     * @var QueueExecutor
201
     */
202
    protected $queueExecutor;
203
204
    /**
205
     * @var int
206
     */
207
    protected $maximumUrlsToCompile = 10000;
208
209
    /**
210
     * @var IconFactory
211
     */
212
    protected $iconFactory;
213
214
    /**
     * Deprecation messages keyed by the name of the deprecated method.
     *
     * NOTE(review): presumably read by PublicMethodDeprecationTrait, which this class uses — confirm.
     *
     * @var string[]
     */
    private $deprecatedPublicMethods = [
        'compileUrls' => 'Using CrawlerController->compileUrls() is deprecated since 9.2.5, and will be removed in v11.x',
        'cleanUpOldQueueEntries' => 'Using CrawlerController::cleanUpOldQueueEntries() is deprecated since 9.0.1 and will be removed in v11.x, please use QueueRepository->cleanUpOldQueueEntries() instead.',
        'CLI_buildProcessId' => 'Using CrawlerController->CLI_buildProcessId() is deprecated since 9.2.5 and will be removed in v11.x',
        'CLI_checkAndAcquireNewProcess' => 'Using CrawlerController->CLI_checkAndAcquireNewProcess() is deprecated since 9.2.5 and will be removed in v11.x',
        'CLI_debug' => 'Using CrawlerController->CLI_debug() is deprecated since 9.1.3 and will be removed in v11.x',
        'CLI_releaseProcesses' => 'Using CrawlerController->CLI_releaseProcesses() is deprecated since 9.2.2 and will be removed in v11.x',
        'CLI_run' => 'Using CrawlerController->CLI_run() is deprecated since 9.2.2 and will be removed in v11.x',
        'CLI_runHooks' => 'Using CrawlerController->CLI_runHooks() is deprecated since 9.1.5 and will be removed in v11.x',
        'expandExcludeString' => 'Using CrawlerController->expandExcludeString() is deprecated since 9.2.5 and will be removed in v11.x',
        'getAccessMode' => 'Using CrawlerController->getAccessMode() is deprecated since 9.1.3 and will be removed in v11.x',
        'getLogEntriesForPageId' => 'Using CrawlerController->getLogEntriesForPageId() is deprecated since 9.1.5 and will be removed in v11.x',
        'getLogEntriesForSetId' => 'Using CrawlerController::getLogEntriesForSetId() is deprecated since 9.0.1 and will be removed in v11.x',
        // The message previously named getLogEntriesForPageId() by mistake.
        'hasGroupAccess' => 'Using CrawlerController->hasGroupAccess() is deprecated since 9.2.2 and will be removed in v11.x, please use UserService::hasGroupAccess() instead.',
        'flushQueue' => 'Using CrawlerController::flushQueue() is deprecated since 9.0.1 and will be removed in v11.x, please use QueueRepository->flushQueue() instead.',
        'setAccessMode' => 'Using CrawlerController->setAccessMode() is deprecated since 9.1.3 and will be removed in v11.x',
        'getDisabled' => 'Using CrawlerController->getDisabled() is deprecated since 9.1.3 and will be removed in v11.x, please use Crawler->isDisabled() instead',
        'setDisabled' => 'Using CrawlerController->setDisabled() is deprecated since 9.1.3 and will be removed in v11.x, please use Crawler->setDisabled() instead',
        'getProcessFilename' => 'Using CrawlerController->getProcessFilename() is deprecated since 9.1.3 and will be removed in v11.x',
        'setProcessFilename' => 'Using CrawlerController->setProcessFilename() is deprecated since 9.1.3 and will be removed in v11.x',
        'getDuplicateRowsIfExist' => 'Using CrawlerController->getDuplicateRowsIfExist() is deprecated since 9.1.4 and will be removed in v11.x, please use QueueRepository->getDuplicateQueueItemsIfExists() instead',
        'checkIfPageShouldBeSkipped' => 'Using CrawlerController->checkIfPageShouldBeSkipped() is deprecated since 9.2.5 and will be removed in v11.x',
        'swapIfFirstIsLargerThanSecond' => 'Using CrawlerController->swapIfFirstIsLargerThanSecond() is deprecated since 9.2.5, and will be removed in v11.x',
        'expandParameters' => 'Using CrawlerController->expandParameters() is deprecated since 9.2.5, and will be removed in v11.x',
    ];
242
243
    /**
     * Deprecation messages keyed by the name of the deprecated property.
     *
     * NOTE(review): presumably read by PublicPropertyDeprecationTrait, which this class uses — confirm.
     *
     * @var string[]
     */
    private $deprecatedPublicProperties = [
        'accessMode' => 'Using CrawlerController->accessMode is deprecated since 9.1.3 and will be removed in v11.x',
        // The message previously named "accessMode" for this entry as well.
        'processFilename' => 'Using CrawlerController->processFilename is deprecated since 9.1.3 and will be removed in v11.x',
    ];
250
251
    /**
252
     * @var BackendUserAuthentication|null
253
     */
254
    private $backendUser;
255
256
    /**
257
     * @var integer
258
     */
259
    private $scheduledTime = 0;
260
261
    /**
262
     * @var integer
263
     */
264
    private $reqMinute = 0;
265
266
    /**
267
     * @var bool
268
     */
269
    private $submitCrawlUrls = false;
270
271
    /**
272
     * @var bool
273
     */
274
    private $downloadCrawlUrls = false;
275
276
    /**
277
     * @var PageRepository
278
     */
279
    private $pageRepository;
280
281
    /**
282
     * @var Crawler
283
     */
284
    private $crawler;
285
286
    /**
287
     * @var ConfigurationService
288
     */
289
    private $configurationService;
290
291
    /**
292
     * @var UrlService
293
     */
294
    private $urlService;
295
296
    /************************************
297
     *
298
     * Getting URLs based on Page TSconfig
299
     *
300
     ************************************/
301
302 41
    /**
     * Instantiates the repositories and services used by the controller and
     * loads the extension configuration into $extensionSettings.
     *
     * Settings are normalised as follows:
     * - countInARun falls back to 100 when it is missing or not a positive integer
     * - processLimit is forced into the range 1..99 (default 1)
     * - maxCompileUrls is forced into the range 1..1000000000 (default 10000)
     */
    public function __construct()
    {
        $objectManager = GeneralUtility::makeInstance(ObjectManager::class);
        $crawlStrategyFactory = GeneralUtility::makeInstance(CrawlStrategyFactory::class);
        $this->queueRepository = $objectManager->get(QueueRepository::class);
        $this->processRepository = $objectManager->get(ProcessRepository::class);
        $this->configurationRepository = $objectManager->get(ConfigurationRepository::class);
        $this->pageRepository = GeneralUtility::makeInstance(PageRepository::class);
        $this->queueExecutor = GeneralUtility::makeInstance(QueueExecutor::class, $crawlStrategyFactory);
        $this->iconFactory = GeneralUtility::makeInstance(IconFactory::class);
        $this->crawler = GeneralUtility::makeInstance(Crawler::class);
        $this->configurationService = GeneralUtility::makeInstance(ConfigurationService::class);
        $this->urlService = GeneralUtility::makeInstance(UrlService::class);

        $this->processFilename = Environment::getVarPath() . '/lock/tx_crawler.proc';

        /** @var ExtensionConfigurationProvider $configurationProvider */
        $configurationProvider = GeneralUtility::makeInstance(ExtensionConfigurationProvider::class);
        $settings = $configurationProvider->getExtensionConfiguration();
        $this->extensionSettings = is_array($settings) ? $settings : [];

        // The settings may be empty (see the fallback above), so every key is read
        // with "?? 0" to avoid undefined-index notices; 0 selects the defaults below.
        if (MathUtility::convertToPositiveInteger($this->extensionSettings['countInARun'] ?? 0) === 0) {
            $this->extensionSettings['countInARun'] = 100;
        }

        $this->extensionSettings['processLimit'] = MathUtility::forceIntegerInRange(
            $this->extensionSettings['processLimit'] ?? 0,
            1,
            99,
            1
        );
        $this->setMaximumUrlsToCompile(
            MathUtility::forceIntegerInRange($this->extensionSettings['maxCompileUrls'] ?? 0, 1, 1000000000, 10000)
        );
    }
330
331 41
    /**
     * Stores the given value in $maximumUrlsToCompile.
     *
     * The constructor calls this with the range-checked "maxCompileUrls" extension setting.
     */
    public function setMaximumUrlsToCompile(int $maximumUrlsToCompile): void
    {
        $this->maximumUrlsToCompile = $maximumUrlsToCompile;
    }
335
336
    /**
     * Returns the internal access mode; according to the property documentation
     * this is one of 'gui', 'cli' or 'cli_im'.
     *
     * The property has no default value, so null is returned until
     * setAccessMode() has been called.
     *
     * @return string|null
     * @deprecated since 9.1.3, will be removed in v11.x
     */
    public function getAccessMode()
    {
        return $this->accessMode;
    }
346
347
    /**
     * Stores the internal access mode.
     *
     * @param string $accessMode According to the property documentation one of 'gui', 'cli' or 'cli_im'
     * @deprecated since 9.1.3, will be removed in v11.x
     */
    public function setAccessMode($accessMode): void
    {
        $this->accessMode = $accessMode;
    }
355
356
    /**
     * Switches the "disabled" marker file on or off.
     *
     * A truthy value writes the text 'disabled' into the process file; any other
     * value (false or null) removes the file if it exists.
     *
     * @deprecated since 9.1.3, will be removed in v11.x, use Crawler->setDisabled() instead
     */
    public function setDisabled(?bool $disabled = true): void
    {
        if ($disabled) {
            GeneralUtility::writeFile($this->processFilename, 'disabled');

            return;
        }

        // Enabling again: only remove the marker when it is actually present.
        if (is_file($this->processFilename)) {
            unlink($this->processFilename);
        }
    }
368
369
    /**
     * Reports whether the "disabled" marker file exists.
     *
     * @return bool true when the process file is present, false otherwise
     * @deprecated since 9.1.3, will be removed in v11.x, use Crawler->isDisabled() instead
     */
    public function getDisabled(): bool
    {
        $markerFile = $this->processFilename;

        return is_file($markerFile);
    }
377
378
    /**
     * Overrides the path of the process file that setDisabled() and getDisabled() operate on.
     *
     * @param string $filenameWithPath File name including its path
     * @deprecated since 9.1.3, will be removed in v11.x
     */
    public function setProcessFilename($filenameWithPath): void
    {
        $this->processFilename = $filenameWithPath;
    }
386
387
    /**
     * Returns the path of the process file.
     *
     * The constructor initialises it to "<var path>/lock/tx_crawler.proc".
     *
     * @return string
     * @deprecated since 9.1.3, will be removed in v11.x
     */
    public function getProcessFilename()
    {
        return $this->processFilename;
    }
395
396
    /**
     * Replaces the extension settings
     * (unserialized counterpart of $TYPO3_CONF_VARS['EXT']['extConf']['crawler']).
     *
     * The array is stored as given; the normalisation done in the constructor is not repeated.
     */
    public function setExtensionSettings(array $extensionSettings): void
    {
        $this->extensionSettings = $extensionSettings;
    }
403
404
    /**
     * Decides whether the given page must be left out of crawling.
     *
     * Pure delegation to PageService::checkIfPageShouldBeSkipped().
     *
     * @param array $pageRow Page record
     * @return false|string false if the page should be crawled (not excluded), true / skipMessage if it should be skipped
     * @deprecated since 9.2.5, will be removed in v11.x
     */
    public function checkIfPageShouldBeSkipped(array $pageRow)
    {
        return GeneralUtility::makeInstance(PageService::class)->checkIfPageShouldBeSkipped($pageRow);
    }
415
416
    /**
     * Wrapper around getUrlsForPageId() that first checks whether the page may be crawled.
     * It returns an array of configurations and no urls!
     *
     * @param array $pageRow Page record with at least dok-type and uid columns; uid must be an integer.
     * @param string $skipMessage Output parameter: reason why the page was skipped, '' when it was not
     * @return array Configurations for the page, or an empty array when the page is skipped
     * @see getUrlsForPageId()
     */
    public function getUrlsForPageRow(array $pageRow, &$skipMessage = '')
    {
        if (! is_int($pageRow['uid'])) {
            $skipMessage = 'PageUid ' . $pageRow['uid'] . ' was not an integer';
            return [];
        }

        $skipReason = $this->getPageService()->checkIfPageShouldBeSkipped($pageRow);
        if ($skipReason !== false) {
            // The page service objected: hand its message back to the caller.
            $skipMessage = $skipReason;

            return [];
        }

        $configurations = $this->getUrlsForPageId($pageRow['uid']);
        $skipMessage = '';

        return $configurations;
    }
443
444
    /**
     * Creates a list of URLs from input array (and submits them to queue if asked for)
     * See Web > Info module script + "indexed_search"'s crawler hook-client using this!
     *
     * Every plain URL is also appended to $this->urlList.
     *
     * @param array $vv Information about URLs from pageRow to crawl.
     * @param array $pageRow Page row
     * @param int $scheduledTime Unix time to schedule indexing to, typically time()
     * @param int $reqMinute Number of requests per minute (creates the interleave between requests);
     *                       a value of zero or less disables the interleave
     * @param bool $submitCrawlUrls If set, submits the URLs to queue
     * @param bool $downloadCrawlUrls If set (and $submitCrawlUrls is false) will fill $downloadUrls with entries
     * @param array $duplicateTrack Array which is passed by reference and contains an id per url to ensure we will not crawl duplicates
     * @param array $downloadUrls Array which will be filled with URLs for download if flag is set.
     * @param array $incomingProcInstructions Array of processing instructions
     * @return string List of URLs (meant for display in backend module)
     */
    public function urlListFromUrlArray(
        array $vv,
        array $pageRow,
        $scheduledTime,
        $reqMinute,
        $submitCrawlUrls,
        $downloadCrawlUrls,
        array &$duplicateTrack,
        array &$downloadUrls,
        array $incomingProcInstructions
    ) {
        if (! is_array($vv['URLs'] ?? null)) {
            return 'ERROR - no URL generated';
        }
        $urlLog = [];
        $pageId = (int) $pageRow['uid'];
        $configurationHash = $this->getConfigurationHash($vv);
        $skipInnerCheck = $this->queueRepository->noUnprocessedQueueEntriesForPageWithConfigurationHashExist($pageId, $configurationHash);

        $urlService = new UrlService();

        // Seconds between two consecutive requests. The value does not change inside
        // the loop, and a rate of zero (or less) would otherwise divide by zero, so
        // it is computed once here and falls back to "no interleave".
        $secondsBetweenRequests = $reqMinute > 0 ? 60 / $reqMinute : 0;

        foreach ($vv['URLs'] as $urlQuery) {
            if (! $this->drawURLs_PIfilter($vv['subCfg']['procInstrFilter'], $incomingProcInstructions)) {
                continue;
            }
            $url = (string) $urlService->getUrlFromPageAndQueryParameters(
                $pageId,
                $urlQuery,
                $vv['subCfg']['baseUrl'] ?? null,
                $vv['subCfg']['force_ssl'] ?? 0
            );

            // Create key by which to determine unique-ness:
            $uKey = $url . '|' . $vv['subCfg']['userGroups'] . '|' . $vv['subCfg']['procInstrFilter'];

            if (isset($duplicateTrack[$uKey])) {
                // If the url key is registered just display it and do not resubmit it
                $urlLog[] = '<em><span class="text-muted">' . htmlspecialchars($url) . '</span></em>';
            } else {
                // Scheduled time, rounded down to the full minute (used for display):
                $schTime = $scheduledTime + round(count($duplicateTrack) * $secondsBetweenRequests);
                $schTime = intval($schTime / 60) * 60;
                $formattedDate = BackendUtility::datetime($schTime);
                $this->urlList[] = '[' . $formattedDate . '] ' . $url;
                $urlList = '[' . $formattedDate . '] ' . htmlspecialchars($url);

                // Submit for crawling!
                if ($submitCrawlUrls) {
                    $added = $this->addUrl(
                        $pageId,
                        $url,
                        $vv['subCfg'],
                        $scheduledTime,
                        $configurationHash,
                        $skipInnerCheck
                    );
                    if ($added === false) {
                        $urlList .= ' (URL already existed)';
                    }
                } elseif ($downloadCrawlUrls) {
                    $downloadUrls[$url] = $url;
                }
                $urlLog[] = $urlList;
            }
            $duplicateTrack[$uKey] = true;
        }

        return implode('<br>', $urlLog);
    }
528
529
    /**
     * Tells whether at least one of the incoming processing instructions is
     * contained in the list $piString.
     *
     * When no incoming instructions are given at all, nothing is filtered and true is returned.
     *
     * @param string $piString PI to test
     * @param array $incomingProcInstructions Processing instructions
     * @return bool
     */
    public function drawURLs_PIfilter($piString, array $incomingProcInstructions)
    {
        if ($incomingProcInstructions === []) {
            return true;
        }

        foreach ($incomingProcInstructions as $instruction) {
            if (GeneralUtility::inList($piString, $instruction)) {
                return true;
            }
        }

        return false;
    }
549
550 9
    /**
     * Loads the page TSconfig for a page and lets registered hooks alter it.
     *
     * When $this->MP is set, the TSconfig is read for the id found after the '-'
     * in $this->MP instead of $id.
     *
     * Hooks registered in
     * $GLOBALS['TYPO3_CONF_VARS']['EXTCONF']['crawler']['getPageTSconfigForId']
     * receive 'pageId' and a reference to the TSconfig under 'pageTSConfig'.
     *
     * @param int $id Page id
     * @return array Page TSconfig, possibly modified by hooks
     */
    public function getPageTSconfigForId(int $id): array
    {
        if (! $this->MP) {
            $pageTSconfig = BackendUtility::getPagesTSconfig($id);
        } else {
            // TODO: Please check, this makes no sense to split a boolean value.
            $mountPointParts = explode('-', $this->MP);
            // A value without '-' has no second part; fall back to 0 instead of reading
            // an undefined offset, and hand over an integer id.
            $pageTSconfig = BackendUtility::getPagesTSconfig((int) ($mountPointParts[1] ?? 0));
        }

        // Call a hook to alter configuration. The hook list is usually not registered,
        // so read it with "??" to avoid an undefined-index notice.
        $hooks = $GLOBALS['TYPO3_CONF_VARS']['EXTCONF']['crawler']['getPageTSconfigForId'] ?? null;
        if (is_array($hooks)) {
            $params = [
                'pageId' => $id,
                'pageTSConfig' => &$pageTSconfig,
            ];
            foreach ($hooks as $userFunc) {
                GeneralUtility::callUserFunction($userFunc, $params, $this);
            }
        }
        return $pageTSconfig;
    }
572
573
    /**
     * Collects the crawler configurations that apply to a page.
     * Adds no urls!
     *
     * Configurations come from page TSconfig first, then from the database; finally every
     * function registered in $GLOBALS['TYPO3_CONF_VARS']['EXTCONF']['crawler']['processUrls']
     * may modify the result through the 'res' reference.
     *
     * @param int $pageId Page id
     * @return array Configurations for the page
     */
    public function getUrlsForPageId(int $pageId): array
    {
        // Get page TSconfig for page ID
        $pageTSconfig = $this->getPageTSconfigForId($pageId);

        // Only a string mount point is passed on; anything else counts as "none".
        $mountPoint = '';
        if (is_string($this->MP)) {
            $mountPoint = $this->MP;
        }

        $configurations = [];

        // Fetch Crawler Configuration from pageTSConfig
        $configurations = $this->configurationService->getConfigurationFromPageTS(
            $pageTSconfig,
            $pageId,
            $configurations,
            $mountPoint
        );

        // Get configuration from tx_crawler_configuration records up the rootline
        $configurations = $this->configurationService->getConfigurationFromDatabase($pageId, $configurations);

        $processUrlsHooks = $GLOBALS['TYPO3_CONF_VARS']['EXTCONF']['crawler']['processUrls'] ?? [];
        foreach ($processUrlsHooks as $hookFunction) {
            $hookParameters = [
                'res' => &$configurations,
            ];
            GeneralUtility::callUserFunction($hookFunction, $hookParameters, $this);
        }

        return $configurations;
    }
600
601
    /**
     * Find all configurations of subpages of a page
     * TODO: Write Functional Tests
     *
     * The result contains the names of the "paramSets." defined in the page TSconfig of
     * $rootid (without the trailing dot), followed by the names of the configuration
     * records the repository returns for the rootline and the page tree below $rootid.
     *
     * @param int $rootid Page id the branch starts at
     * @param int $depth Number of tree levels to descend
     * @return array Configuration names
     */
    public function getConfigurationsForBranch(int $rootid, int $depth): array
    {
        $branchConfigurations = [];

        $pageTSconfig = $this->getPageTSconfigForId($rootid);
        $paramSets = $pageTSconfig['tx_crawler.']['crawlerCfg.']['paramSets.'] ?? [];
        foreach ($paramSets as $setName => $setConfiguration) {
            if (is_array($setConfiguration)) {
                // Array-valued keys carry a trailing dot ("name."); strip it.
                $hasTrailingDot = substr($setName, -1) === '.';
                $branchConfigurations[] = $hasTrailingDot ? substr($setName, 0, -1) : $setName;
            }
        }

        $pageIds = [];
        foreach (BackendUtility::BEgetRootLine($rootid) as $rootLinePage) {
            $pageIds[] = $rootLinePage['uid'];
        }

        /** @var PageTreeView $pageTree */
        $pageTree = GeneralUtility::makeInstance(PageTreeView::class);
        $permissionClause = $this->getBackendUser()->getPagePermsClause(Permission::PAGE_SHOW);
        $treeClause = empty($permissionClause) ? '' : ('AND ' . $permissionClause);
        $pageTree->init($treeClause);
        $pageTree->getTree($rootid, $depth, '');
        foreach ($pageTree->tree as $treeNode) {
            $pageIds[] = $treeNode['row']['uid'];
        }

        $configurationRecords = $this->configurationRepository->getCrawlerConfigurationRecordsFromRootLine($rootid, $pageIds);
        foreach ($configurationRecords as $configurationRecord) {
            $branchConfigurations[] = $configurationRecord['name'];
        }

        return $branchConfigurations;
    }
637
638
    /**
     * Check if a user has access to an item
     * (e.g. get the group list of the current logged in user from $GLOBALS['TSFE']->gr_list)
     *
     * @param string $groupList Comma-separated list of (fe_)group UIDs from a user
     * @param string $accessList Comma-separated list of (fe_)group UIDs of the item to access
     * @return bool TRUE if at least one of the users group UIDs is in the access list or the access list is empty
     * @see \TYPO3\CMS\Frontend\Page\PageRepository::getMultipleGroupsWhereClause()
     * @deprecated since 9.2.2, will be removed in v11.x, use UserService::hasGroupAccess() instead
     * @codeCoverageIgnore
     */
    public function hasGroupAccess($groupList, $accessList)
    {
        // An empty access list means the item is not restricted.
        if (! $accessList) {
            return true;
        }

        $userGroupUids = GeneralUtility::intExplode(',', $groupList);
        foreach ($userGroupUids as $userGroupUid) {
            if (GeneralUtility::inList($accessList, $userGroupUid)) {
                return true;
            }
        }

        return false;
    }
661
662
    /**
663
     * Will expand the parameters configuration to individual values. This follows a certain syntax of the value of each parameter.
664
     * Syntax of values:
665
     * - Basically: If the value is wrapped in [...] it will be expanded according to the following syntax, otherwise the value is taken literally
666
     * - Configuration is split by "|" and the parts are processed individually and finally added together
667
     * - For each configuration part:
668
     *         - "[int]-[int]" = Integer range, will be expanded to all values in between, values included, starting from low to high (max. 1000). Example "1-34" or "-40--30"
669
     *         - "_TABLE:[TCA table name];[_PID:[optional page id, default is current page]];[_ENABLELANG:1]" = Look up of table records from PID, filtering out deleted records. Example "_TABLE:tt_content; _PID:123"
670
     *        _ENABLELANG:1 picks only original records without their language overlays
671
     *         - Default: Literal value
672
     *
673
     * @param array $paramArray Array with key (GET var name) and values (value of GET var which is configuration for expansion)
674
     * @param integer $pid Current page ID
675
     * @return array
676
     * @deprecated
677
     * @codeCoverageIgnore
678
     */
679
    public function expandParameters($paramArray, $pid)
680
    {
681
        // Traverse parameter names:
682
        foreach ($paramArray as $p => $v) {
683
            $v = trim($v);
684
685
            // If value is encapsulated in square brackets it means there are some ranges of values to find, otherwise the value is literal
686
            if (strpos($v, '[') === 0 && substr($v, -1) === ']') {
687
                // So, find the value inside brackets and reset the paramArray value as an array.
688
                $v = substr($v, 1, -1);
689
                $paramArray[$p] = [];
690
691
                // Explode parts and traverse them:
692
                $parts = explode('|', $v);
693
                foreach ($parts as $pV) {
694
695
                    // Look for integer range: (fx. 1-34 or -40--30 // reads minus 40 to minus 30)
696
                    if (preg_match('/^(-?[0-9]+)\s*-\s*(-?[0-9]+)$/', trim($pV), $reg)) {
697
                        $reg = $this->swapIfFirstIsLargerThanSecond($reg);
0 ignored issues
show
Deprecated Code introduced by
The function AOE\Crawler\Controller\C...rstIsLargerThanSecond() has been deprecated. ( Ignorable by Annotation )

If this is a false-positive, you can also ignore this issue in your code via the ignore-deprecated  annotation

697
                        $reg = /** @scrutinizer ignore-deprecated */ $this->swapIfFirstIsLargerThanSecond($reg);
Loading history...
698
699
                        // Traverse range, add values:
700
                        // Limit to size of range!
701
                        $runAwayBrake = 1000;
702
                        for ($a = $reg[1]; $a <= $reg[2]; $a++) {
703
                            $paramArray[$p][] = $a;
704
                            $runAwayBrake--;
705
                            if ($runAwayBrake <= 0) {
706
                                break;
707
                            }
708
                        }
709
                    } elseif (strpos(trim($pV), '_TABLE:') === 0) {
710
711
                        // Parse parameters:
712
                        $subparts = GeneralUtility::trimExplode(';', $pV);
713
                        $subpartParams = [];
714
                        foreach ($subparts as $spV) {
715
                            [$pKey, $pVal] = GeneralUtility::trimExplode(':', $spV);
716
                            $subpartParams[$pKey] = $pVal;
717
                        }
718
719
                        // Table exists:
720
                        if (isset($GLOBALS['TCA'][$subpartParams['_TABLE']])) {
721
                            $lookUpPid = isset($subpartParams['_PID']) ? intval($subpartParams['_PID']) : intval($pid);
722
                            $recursiveDepth = isset($subpartParams['_RECURSIVE']) ? intval($subpartParams['_RECURSIVE']) : 0;
723
                            $pidField = isset($subpartParams['_PIDFIELD']) ? trim($subpartParams['_PIDFIELD']) : 'pid';
724
                            $where = $subpartParams['_WHERE'] ?? '';
725
                            $addTable = $subpartParams['_ADDTABLE'] ?? '';
726
727
                            $fieldName = $subpartParams['_FIELD'] ? $subpartParams['_FIELD'] : 'uid';
728
                            if ($fieldName === 'uid' || $GLOBALS['TCA'][$subpartParams['_TABLE']]['columns'][$fieldName]) {
729
                                $queryBuilder = $this->getQueryBuilder($subpartParams['_TABLE']);
730
731
                                if ($recursiveDepth > 0) {
732
                                    /** @var QueryGenerator $queryGenerator */
733
                                    $queryGenerator = GeneralUtility::makeInstance(QueryGenerator::class);
734
                                    $pidList = $queryGenerator->getTreeList($lookUpPid, $recursiveDepth, 0, 1);
735
                                    $pidArray = GeneralUtility::intExplode(',', $pidList);
736
                                } else {
737
                                    $pidArray = [(string) $lookUpPid];
738
                                }
739
740
                                $queryBuilder->getRestrictions()
741
                                    ->removeAll()
742
                                    ->add(GeneralUtility::makeInstance(DeletedRestriction::class));
743
744
                                $queryBuilder
745
                                    ->select($fieldName)
746
                                    ->from($subpartParams['_TABLE'])
747
                                    ->where(
748
                                        $queryBuilder->expr()->in($pidField, $queryBuilder->createNamedParameter($pidArray, Connection::PARAM_INT_ARRAY)),
749
                                        $where
750
                                    );
751
752
                                if (! empty($addTable)) {
753
                                    // TODO: Check if this works as intended!
754
                                    $queryBuilder->add('from', $addTable);
755
                                }
756
                                $transOrigPointerField = $GLOBALS['TCA'][$subpartParams['_TABLE']]['ctrl']['transOrigPointerField'];
757
758
                                if ($subpartParams['_ENABLELANG'] && $transOrigPointerField) {
759
                                    $queryBuilder->andWhere(
760
                                        $queryBuilder->expr()->lte(
761
                                            $transOrigPointerField,
762
                                            0
763
                                        )
764
                                    );
765
                                }
766
767
                                $statement = $queryBuilder->execute();
768
769
                                $rows = [];
770
                                while ($row = $statement->fetch()) {
0 ignored issues
show
Deprecated Code introduced by
The function Doctrine\DBAL\ForwardCompatibility\Result::fetch() has been deprecated: Use fetchNumeric(), fetchAssociative() or fetchOne() instead. ( Ignorable by Annotation )

If this is a false-positive, you can also ignore this issue in your code via the ignore-deprecated  annotation

770
                                while ($row = /** @scrutinizer ignore-deprecated */ $statement->fetch()) {

This function has been deprecated. The supplier of the function has supplied an explanatory message.

The explanatory message should give you some clue as to whether and when the function will be removed and what other function to use instead.

Loading history...
771
                                    $rows[$row[$fieldName]] = $row;
772
                                }
773
774
                                if (is_array($rows)) {
775
                                    $paramArray[$p] = array_merge($paramArray[$p], array_keys($rows));
776
                                }
777
                            }
778
                        }
779
                    } else {
780
                        // Just add value:
781
                        $paramArray[$p][] = $pV;
782
                    }
783
                    // Hook for processing own expandParameters place holder
784
                    if (is_array($GLOBALS['TYPO3_CONF_VARS']['SC_OPTIONS']['crawler/class.tx_crawler_lib.php']['expandParameters'])) {
785
                        $_params = [
786
                            'pObj' => &$this,
787
                            'paramArray' => &$paramArray,
788
                            'currentKey' => $p,
789
                            'currentValue' => $pV,
790
                            'pid' => $pid,
791
                        ];
792
                        foreach ($GLOBALS['TYPO3_CONF_VARS']['SC_OPTIONS']['crawler/class.tx_crawler_lib.php']['expandParameters'] as $_funcRef) {
793
                            GeneralUtility::callUserFunction($_funcRef, $_params, $this);
794
                        }
795
                    }
796
                }
797
798
                // Make unique set of values and sort array by key:
799
                $paramArray[$p] = array_unique($paramArray[$p]);
800
                ksort($paramArray);
801
            } else {
802
                // Set the literal value as only value in array:
803
                $paramArray[$p] = [$v];
804
            }
805
        }
806
807
        return $paramArray;
808
    }
809
810
    /**
     * Compile URLs from a parameter array (the output of expandParameters()).
     *
     * The work is delegated to the URL service; the number of compiled URLs is capped
     * by the value returned from getMaximumUrlsToCompile().
     * The number of URLs will be the multiplication of the number of parameter values for each key.
     *
     * @param array $paramArray Output of expandParameters(): Array with keys (GET var names) and for each an array of values
     * @param array $urls URLs accumulated in this array (for recursion)
     * @return array Result of the URL service's compileUrls() call
     * @deprecated
     * @codeCoverageIgnore
     */
    public function compileUrls(array $paramArray, array $urls): array
    {
        $maximumUrlsToCompile = $this->getMaximumUrlsToCompile();

        return $this->urlService->compileUrls($paramArray, $urls, $maximumUrlsToCompile);
    }
823
824
    /************************************
825
     *
826
     * Crawler log
827
     *
828
     ************************************/
829
830
    /**
     * Return array of records from crawler queue for input page ID
     *
     * The records are ordered by their "scheduled" value, newest first. When flushing is
     * requested, the queue repository is asked to flush by the given filter before the
     * records are selected.
     *
     * @param integer $id Page ID for which to look up log entries.
     * @param QueueFilter $queueFilter Filter compared against "pending" (exec_time = 0) and "finished" (exec_time > 0); any other value applies no extra condition
     * @param boolean $doFlush If TRUE, the queue is flushed via the queue repository using $queueFilter
     * @param boolean $doFullFlush Unused, kept for backwards compatibility of the signature
     * @param integer $itemsPerPage Limit the amount of entries per page default is 10; values <= 0 disable the limit
     * @return array
     *
     * @deprecated
     */
    public function getLogEntriesForPageId($id, QueueFilter $queueFilter, $doFlush = false, $doFullFlush = false, $itemsPerPage = 10)
    {
        $queryBuilder = GeneralUtility::makeInstance(ConnectionPool::class)->getQueryBuilderForTable(QueueRepository::TABLE_NAME);
        $queryBuilder
            ->select('*')
            ->from(QueueRepository::TABLE_NAME)
            ->where(
                $queryBuilder->expr()->eq('page_id', $queryBuilder->createNamedParameter($id, PDO::PARAM_INT))
            )
            ->orderBy('scheduled', 'DESC');

        // The filter object is compared loosely against the string labels;
        // presumably QueueFilter is string-castable - verify against its definition.
        switch ($queueFilter) {
            case 'pending':
                $queryBuilder->andWhere($queryBuilder->expr()->eq('exec_time', 0));
                break;
            case 'finished':
                $queryBuilder->andWhere($queryBuilder->expr()->gt('exec_time', 0));
                break;
        }

        if ($doFlush) {
            $this->queueRepository->flushQueue($queueFilter);
        }
        if ($itemsPerPage > 0) {
            $queryBuilder
                ->setMaxResults((int) $itemsPerPage);
        }

        return $queryBuilder->execute()->fetchAll();
    }
878
879
    /**
     * Return array of records from crawler queue for input set ID
     *
     * When flushing is requested nothing is selected: the matching entries are flushed
     * and an empty array is returned.
     *
     * @param int $set_id Set ID for which to look up log entries.
     * @param string $filter Filter: "pending" => all that is not yet run, "finished" => all complete ones, anything else => all entries
     * @param bool $doFlush If TRUE, then entries are flushed instead of selected!
     * @param bool $doFullFlush If TRUE (together with $doFlush), flushQueue() is called with an empty condition instead of the set/filter condition
     * @param int $itemsPerPage Limit the amount of entries per page default is 10; values <= 0 disable the limit
     * @return array
     *
     * @deprecated
     */
    public function getLogEntriesForSetId(int $set_id, string $filter = '', bool $doFlush = false, bool $doFullFlush = false, int $itemsPerPage = 10)
    {
        $queryBuilder = GeneralUtility::makeInstance(ConnectionPool::class)->getQueryBuilderForTable(QueueRepository::TABLE_NAME);
        $queryBuilder
            ->select('*')
            ->from(QueueRepository::TABLE_NAME)
            ->where(
                $queryBuilder->expr()->eq('set_id', $queryBuilder->createNamedParameter($set_id, PDO::PARAM_INT))
            )
            ->orderBy('scheduled', 'DESC');

        $expressionBuilder = GeneralUtility::makeInstance(ConnectionPool::class)
            ->getConnectionForTable(QueueRepository::TABLE_NAME)
            ->getExpressionBuilder();

        // $query collects the same conditions as an AND-combined expression; it is only
        // consumed by flushQueue() below, the select uses $queryBuilder.
        $query = $expressionBuilder->andX();
        switch ($filter) {
            case 'pending':
                $queryBuilder->andWhere($queryBuilder->expr()->eq('exec_time', 0));
                $query->add($expressionBuilder->eq('exec_time', 0));
                break;
            case 'finished':
                $queryBuilder->andWhere($queryBuilder->expr()->gt('exec_time', 0));
                $query->add($expressionBuilder->gt('exec_time', 0));
                break;
        }

        if ($doFlush) {
            $addWhere = $query->add($expressionBuilder->eq('set_id', $set_id));
            $this->flushQueue($doFullFlush ? '' : $addWhere);
            return [];
        }

        if ($itemsPerPage > 0) {
            $queryBuilder
                ->setMaxResults($itemsPerPage);
        }

        return $queryBuilder->execute()->fetchAll();
    }
931
932
    /**
     * Add a call back entry to the queue table (called from hooks typically, see indexed search class "class.crawler.php")
     *
     * The call back reference is stored under the "_CALLBACKOBJ" key of the JSON-encoded parameters.
     *
     * @param integer $setId Set ID
     * @param array $params Parameters to pass to call back function; a non-array value is replaced by an empty array
     * @param string $callBack Call back object reference, eg. 'EXT:indexed_search/class.crawler.php:&tx_indexedsearch_crawler'
     * @param integer $page_id Page ID to attach it to
     * @param integer $schedule Time at which to activate; 0 means "now" (getCurrentTime())
     */
    public function addQueueEntry_callBack($setId, $params, $callBack, $page_id = 0, $schedule = 0): void
    {
        $callBackParameters = is_array($params) ? $params : [];
        $callBackParameters['_CALLBACKOBJ'] = $callBack;

        $connection = GeneralUtility::makeInstance(ConnectionPool::class)->getConnectionForTable(QueueRepository::TABLE_NAME);

        // Fall back to the current time when no (non-zero) schedule time was given.
        $scheduledTime = (int) $schedule;
        if ($scheduledTime === 0) {
            $scheduledTime = $this->getCurrentTime();
        }

        $queueRecord = [
            'page_id' => (int) $page_id,
            'parameters' => json_encode($callBackParameters),
            'scheduled' => $scheduledTime,
            'exec_time' => 0,
            'set_id' => (int) $setId,
            'result_data' => '',
        ];

        $connection->insert(QueueRepository::TABLE_NAME, $queueRecord);
    }
961
962
    /************************************
963
     *
964
     * URL setting
965
     *
966
     ************************************/
967
968
    /**
     * Setting a URL for crawling:
     *
     * Builds the queue record for the URL and either registers it internally only
     * (when registerQueueEntriesInternallyOnly is set) or inserts it into the queue table,
     * unless the duplication check finds an equal entry. A signal is emitted for both the
     * "added" and the "duplicate" outcome.
     *
     * @param integer $id Page ID
     * @param string $url Complete URL
     * @param array $subCfg Sub configuration array (from TS config); the keys "userGroups", "procInstrFilter", "procInstrParams." and "key" are read and may be absent
     * @param integer $tstamp Scheduled-time
     * @param string $configurationHash (optional) configuration hash
     * @param bool $skipInnerDuplicationCheck (optional) skip inner duplication check
     * @return bool TRUE only if a new record was inserted into the queue table
     */
    public function addUrl(
        $id,
        $url,
        array $subCfg,
        $tstamp,
        $configurationHash = '',
        $skipInnerDuplicationCheck = false
    ) {
        $urlAdded = false;
        $rows = [];

        // Creating parameters:
        $parameters = [
            'url' => $url,
        ];

        // fe user group simulation:
        $uGs = implode(',', array_unique(GeneralUtility::intExplode(',', $subCfg['userGroups'] ?? '', true)));
        if ($uGs) {
            $parameters['feUserGroupList'] = $uGs;
        }

        // Setting processing instructions
        $parameters['procInstructions'] = GeneralUtility::trimExplode(',', $subCfg['procInstrFilter'] ?? '');
        $procInstrParams = $subCfg['procInstrParams.'] ?? null;
        if (is_array($procInstrParams)) {
            $parameters['procInstrParams'] = $procInstrParams;
        }

        // Compile value array:
        $parameters_serialized = json_encode($parameters);
        $fieldArray = [
            'page_id' => (int) $id,
            'parameters' => $parameters_serialized,
            'parameters_hash' => GeneralUtility::shortMD5($parameters_serialized),
            'configuration_hash' => $configurationHash,
            'scheduled' => $tstamp,
            'exec_time' => 0,
            'set_id' => (int) $this->setID,
            'result_data' => '',
            // A missing key keeps the previous outcome (null) without raising a notice.
            'configuration' => $subCfg['key'] ?? null,
        ];

        if ($this->registerQueueEntriesInternallyOnly) {
            // The entries will only be registered and not stored to the database
            $this->queueEntries[] = $fieldArray;

            return $urlAdded;
        }

        if (! $skipInnerDuplicationCheck) {
            // Check if there is already an equal entry
            $rows = $this->queueRepository->getDuplicateQueueItemsIfExists(
                (bool) $this->extensionSettings['enableTimeslot'],
                $tstamp,
                $this->getCurrentTime(),
                $fieldArray['page_id'],
                $fieldArray['parameters_hash']
            );
        }

        if (empty($rows)) {
            $connectionForCrawlerQueue = GeneralUtility::makeInstance(ConnectionPool::class)->getConnectionForTable(QueueRepository::TABLE_NAME);
            $connectionForCrawlerQueue->insert(
                QueueRepository::TABLE_NAME,
                $fieldArray
            );
            $uid = $connectionForCrawlerQueue->lastInsertId(QueueRepository::TABLE_NAME, 'qid');
            $rows[] = $uid;
            $urlAdded = true;

            $signalPayload = ['uid' => $uid, 'fieldArray' => $fieldArray];
            SignalSlotUtility::emitSignal(
                self::class,
                SignalSlotUtility::SIGNAL_URL_ADDED_TO_QUEUE,
                $signalPayload
            );
        } else {
            $signalPayload = ['rows' => $rows, 'fieldArray' => $fieldArray];
            SignalSlotUtility::emitSignal(
                self::class,
                SignalSlotUtility::SIGNAL_DUPLICATE_URL_IN_QUEUE,
                $signalPayload
            );
        }

        return $urlAdded;
    }
1064
1065
    /**
     * Provide the current system time.
     *
     * @return int Current Unix timestamp as returned by time()
     */
    public function getCurrentTime()
    {
        return time();
    }
1074
1075
    /************************************
1076
     *
1077
     * URL reading
1078
     *
1079
     ************************************/
1080
1081
    /**
     * Read URL for single queue entry
     *
     * Loads the queue record, marks it as being executed (exec_time), lets the queue
     * executor process it and finally stores the JSON-encoded result in "result_data".
     * Signals are emitted before processing and before the result is written.
     *
     * @param integer $queueId
     * @param boolean $force If set, will process even if exec_time has been set!
     * @param string $processId If not empty, stored in "process_id_completed" of the queue record
     *
     * @return int|null NULL if no matching queue record was found; otherwise a bit mask that contains
     *                  CLI_STATUS_POLLABLE_PROCESSED when a registered pollable instruction reported success
     */
    public function readUrl($queueId, $force = false, string $processId = '')
    {
        $queryBuilder = GeneralUtility::makeInstance(ConnectionPool::class)->getQueryBuilderForTable(QueueRepository::TABLE_NAME);
        $ret = 0;
        $this->logger->debug('crawler-readurl start ' . microtime(true));

        $queryBuilder
            ->select('*')
            ->from(QueueRepository::TABLE_NAME)
            ->where(
                $queryBuilder->expr()->eq('qid', $queryBuilder->createNamedParameter($queueId, PDO::PARAM_INT))
            );
        if (! $force) {
            // Only pick entries that have not been executed yet and are assigned to a process.
            $queryBuilder
                ->andWhere('exec_time = 0')
                ->andWhere('process_scheduled > 0');
        }
        $queueRec = $queryBuilder->execute()->fetch();

        if (! is_array($queueRec)) {
            return null;
        }

        SignalSlotUtility::emitSignal(
            self::class,
            SignalSlotUtility::SIGNAL_QUEUEITEM_PREPROCESS,
            [$queueId, &$queueRec]
        );

        // Set exec_time to lock record:
        $field_array = ['exec_time' => $this->getCurrentTime()];

        if (! empty($processId)) {
            // If multiprocessing is used we need to store the id of the process which has handled this entry
            $field_array['process_id_completed'] = $processId;
        }

        GeneralUtility::makeInstance(ConnectionPool::class)->getConnectionForTable(QueueRepository::TABLE_NAME)
            ->update(
                QueueRepository::TABLE_NAME,
                $field_array,
                ['qid' => (int) $queueId]
            );

        $result = $this->queueExecutor->executeQueueItem($queueRec, $this);
        if ($result['content'] !== null) {
            /** @var JsonCompatibilityConverter $jsonCompatibilityConverter */
            $jsonCompatibilityConverter = GeneralUtility::makeInstance(JsonCompatibilityConverter::class);
            $resultData = $jsonCompatibilityConverter->convert($result['content']);

            // At the moment there is no need to point to specific pollable extensions
            $pollables = $GLOBALS['TYPO3_CONF_VARS']['EXTCONF']['crawler']['pollSuccess'] ?? null;
            if (is_array($resultData) && is_array($pollables)) {
                $procInstructions = $resultData['parameters']['procInstructions'] ?? null;
                foreach ($pollables as $pollable) {
                    // Only check the success value if the instruction is running;
                    // it is important to name the pollSuccess key same as the procInstructions key
                    if (is_array($procInstructions)
                        && in_array($pollable, $procInstructions, true)
                        && ! empty($resultData['success'][$pollable])
                    ) {
                        $ret |= self::CLI_STATUS_POLLABLE_PROCESSED;
                    }
                }
            }
        }

        // Set result in log which also denotes the end of the processing of this entry.
        $field_array = ['result_data' => json_encode($result)];

        SignalSlotUtility::emitSignal(
            self::class,
            SignalSlotUtility::SIGNAL_QUEUEITEM_POSTPROCESS,
            [$queueId, &$field_array]
        );

        GeneralUtility::makeInstance(ConnectionPool::class)->getConnectionForTable(QueueRepository::TABLE_NAME)
            ->update(
                QueueRepository::TABLE_NAME,
                $field_array,
                ['qid' => (int) $queueId]
            );

        $this->logger->debug('crawler-readurl stop ' . microtime(true));
        return $ret;
    }
1178
1179
    /**
     * Read URL for a queue entry that has not been inserted yet.
     *
     * The entry is inserted with exec_time set, executed through the queue executor, and
     * afterwards updated with the JSON-encoded result. The post-process signal is emitted
     * before the result is written.
     *
     * @param array $field_array Queue field array,
     *
     * @return array|bool|mixed|string Whatever the queue executor returned for the entry
     */
    public function readUrlFromArray($field_array)
    {
        // Set exec_time to lock record:
        $field_array['exec_time'] = $this->getCurrentTime();

        $queueTable = QueueRepository::TABLE_NAME;
        $connectionForCrawlerQueue = GeneralUtility::makeInstance(ConnectionPool::class)->getConnectionForTable($queueTable);
        $connectionForCrawlerQueue->insert($queueTable, $field_array);

        // Make the freshly generated queue id part of the record handed to the executor.
        $queueId = $connectionForCrawlerQueue->lastInsertId($queueTable, 'qid');
        $field_array['qid'] = $queueId;

        $result = $this->queueExecutor->executeQueueItem($field_array, $this);

        // Set result in log which also denotes the end of the processing of this entry.
        $resultFields = ['result_data' => json_encode($result)];

        SignalSlotUtility::emitSignal(
            self::class,
            SignalSlotUtility::SIGNAL_QUEUEITEM_POSTPROCESS,
            [$queueId, &$resultFields]
        );

        $connectionForCrawlerQueue->update($queueTable, $resultFields, ['qid' => $queueId]);

        return $result;
    }
1215
1216
    /*****************************
1217
     *
1218
     * Compiling URLs to crawl - tools
1219
     *
1220
     *****************************/
1221
1222
    /**
     * Builds the HTML table rows listing the crawl configurations/URLs for a page and, if $depth is set,
     * for the pages beneath it.
     *
     * The arguments are stored on the controller first because drawURLs_addRowsForPage() reads them from there.
     * Pages of type "mount point" additionally contribute the rows of their mounted page tree.
     *
     * @param int $id Root page id to start from.
     * @param int $depth Depth of tree, 0=only id-page, 1= on sublevel, 99 = infinite
     * @param int $scheduledTime Unix Time when the URL is timed to be visited when put in queue
     * @param int $reqMinute Number of requests per minute (creates the interleave between requests)
     * @param bool $submitCrawlUrls If set, submits the URLs to queue in database (real crawling)
     * @param bool $downloadCrawlUrls If set (and submitcrawlUrls is false) will fill $downloadUrls with entries)
     * @param array $incomingProcInstructions Array of processing instructions
     * @param array $configurationSelection Array of configuration keys
     * @return string HTML table rows, one group per visited page
     */
    public function getPageTreeAndUrls(
        $id,
        $depth,
        $scheduledTime,
        $reqMinute,
        $submitCrawlUrls,
        $downloadCrawlUrls,
        array $incomingProcInstructions,
        array $configurationSelection
    ) {
        $this->scheduledTime = $scheduledTime;
        $this->reqMinute = $reqMinute;
        $this->submitCrawlUrls = $submitCrawlUrls;
        $this->downloadCrawlUrls = $downloadCrawlUrls;
        $this->incomingProcInstructions = $incomingProcInstructions;
        $this->incomingConfigurationSelection = $configurationSelection;

        // Start every run with empty trackers
        $this->duplicateTrack = [];
        $this->downloadUrls = [];

        // Drawing tree:
        /* @var PageTreeView $tree */
        $tree = GeneralUtility::makeInstance(PageTreeView::class);
        $perms_clause = $this->getBackendUser()->getPagePermsClause(Permission::PAGE_SHOW);
        $tree->init('AND ' . $perms_clause);

        $pageInfo = BackendUtility::readPageAccess($id, $perms_clause);
        if (is_array($pageInfo)) {
            // Set root row:
            $tree->tree[] = [
                'row' => $pageInfo,
                'HTML' => $this->iconFactory->getIconForRecord('pages', $pageInfo, Icon::SIZE_SMALL),
            ];
        }

        // Get branch beneath:
        if ($depth) {
            $tree->getTree($id, $depth, '');
        }

        // Traverse page tree:
        $code = '';

        foreach ($tree->tree as $data) {
            $this->MP = false;

            // Recognize mount points. The row value is cast because a strict comparison of a string
            // value (e.g. '7') against the constant would never match.
            if ((int) $data['row']['doktype'] === PageRepository::DOKTYPE_MOUNTPOINT) {
                $mountpage = $this->pageRepository->getPage($data['row']['uid']);

                // fetch mounted pages
                $this->MP = $mountpage[0]['mount_pid'] . '-' . $data['row']['uid'];

                $mountTree = GeneralUtility::makeInstance(PageTreeView::class);
                $mountTree->init('AND ' . $perms_clause);
                $mountTree->getTree($mountpage[0]['mount_pid'], $depth);

                foreach ($mountTree->tree as $mountData) {
                    $code .= $this->drawURLs_addRowsForPage(
                        $mountData['row'],
                        $mountData['HTML'] . BackendUtility::getRecordTitle('pages', $mountData['row'], true)
                    );
                }

                // replace page when mount_pid_ol is enabled
                if ($mountpage[0]['mount_pid_ol']) {
                    $data['row']['uid'] = $mountpage[0]['mount_pid'];
                } else {
                    // if the mount_pid_ol is not set the MP must not be used for the mountpoint page
                    $this->MP = false;
                }
            }

            $code .= $this->drawURLs_addRowsForPage(
                $data['row'],
                $data['HTML'] . BackendUtility::getRecordTitle('pages', $data['row'], true)
            );
        }

        return $code;
    }
1314
1315
    /**
     * Expands an exclude string by delegating to the configuration service.
     *
     * @param string $excludeString Exclude string
     * @return array
     * @deprecated
     */
    public function expandExcludeString($excludeString)
    {
        $expanded = $this->configurationService->expandExcludeString($excludeString);

        return $expanded;
    }
1326
1327
    /**
     * Create the rows for display of the page tree
     * For each page a number of rows are shown displaying GET variable configuration
     *
     * Optional configuration keys (subCfg.userGroups, subCfg.procInstrFilter, subCfg.procInstrParams.
     * and paramExpanded) are read defensively, so a configuration that lacks them does not raise
     * "undefined array key" warnings.
     *
     * @param array $pageRow Page record; its "uid" is checked against the exclude list of each configuration
     * @param string $pageTitle Pre-rendered HTML for the title cell (inserted as-is)
     * @return string HTML table rows for this page
     */
    public function drawURLs_addRowsForPage(array $pageRow, string $pageTitle): string
    {
        $skipMessage = '';

        // Get list of configurations
        $configurations = $this->getUrlsForPageRow($pageRow, $skipMessage);
        $configurations = ConfigurationService::removeDisallowedConfigurations($this->incomingConfigurationSelection, $configurations);

        // Traverse parameter combinations:
        $c = 0;
        $content = '';
        if (! empty($configurations)) {
            foreach ($configurations as $confKey => $confArray) {
                // Array keys that look like integers are integers in PHP; htmlspecialchars() needs a string
                $confKeyLabel = htmlspecialchars((string) $confKey);
                $subCfg = $confArray['subCfg'] ?? [];

                // Title column: only the first row of a page carries the (row-spanning) title cell
                if (! $c) {
                    $titleClm = '<td rowspan="' . count($configurations) . '">' . $pageTitle . '</td>';
                } else {
                    $titleClm = '';
                }

                if (! in_array($pageRow['uid'], $this->configurationService->expandExcludeString($subCfg['exclude'] ?? ''), true)) {

                    // URL list:
                    $urlList = $this->urlListFromUrlArray(
                        $confArray,
                        $pageRow,
                        $this->scheduledTime,
                        $this->reqMinute,
                        $this->submitCrawlUrls,
                        $this->downloadCrawlUrls,
                        $this->duplicateTrack,
                        $this->downloadUrls,
                        // if empty the urls won't be filtered by processing instructions
                        $this->incomingProcInstructions
                    );

                    // Expanded parameters:
                    $paramExpanded = '';
                    $calcAccu = [];
                    $calcRes = 1;
                    foreach ($confArray['paramExpanded'] ?? [] as $gVar => $gVal) {
                        $paramExpanded .= '
                            <tr>
                                <td>' . htmlspecialchars('&' . $gVar . '=') . '<br/>' .
                            '(' . count($gVal) . ')' .
                            '</td>
                                <td nowrap="nowrap">' . nl2br(htmlspecialchars(implode(chr(10), $gVal))) . '</td>
                            </tr>
                        ';
                        $calcRes *= count($gVal);
                        $calcAccu[] = count($gVal);
                    }
                    $paramExpanded = '<table>' . $paramExpanded . '</table>';
                    $paramExpanded .= 'Comb: ' . implode('*', $calcAccu) . '=' . $calcRes;

                    // Options
                    $optionValues = '';
                    if (! empty($subCfg['userGroups'])) {
                        $optionValues .= 'User Groups: ' . $subCfg['userGroups'] . '<br/>';
                    }
                    if (! empty($subCfg['procInstrFilter'])) {
                        $optionValues .= 'ProcInstr: ' . $subCfg['procInstrFilter'] . '<br/>';
                    }

                    // Compile row:
                    $content .= '
                        <tr>
                            ' . $titleClm . '
                            <td>' . $confKeyLabel . '</td>
                            <td>' . nl2br(htmlspecialchars(rawurldecode(trim(str_replace('&', chr(10) . '&', GeneralUtility::implodeArrayForUrl('', $confArray['paramParsed'])))))) . '</td>
                            <td>' . $paramExpanded . '</td>
                            <td nowrap="nowrap">' . $urlList . '</td>
                            <td nowrap="nowrap">' . $optionValues . '</td>
                            <td nowrap="nowrap">' . DebugUtility::viewArray($subCfg['procInstrParams.'] ?? null) . '</td>
                        </tr>';
                } else {
                    $content .= '<tr>
                            ' . $titleClm . '
                            <td>' . $confKeyLabel . '</td>
                            <td colspan="5"><em>No entries</em> (Page is excluded in this configuration)</td>
                        </tr>';
                }

                $c++;
            }
        } else {
            // $skipMessage is handed to getUrlsForPageRow() above, which may fill it
            $message = ! empty($skipMessage) ? ' (' . $skipMessage . ')' : '';

            // Compile row:
            $content .= '
                <tr>
                    <td>' . $pageTitle . '</td>
                    <td colspan="6"><em>No entries</em>' . $message . '</td>
                </tr>';
        }

        return $content;
    }
1430
1431
    /*****************************
1432
     *
1433
     * CLI functions
1434
     *
1435
     *****************************/
1436
1437
    /**
     * Runs one CLI crawl pass: fetches up to $countInARun queue entries, assigns them to the
     * current process and reads them one by one.
     *
     * The return value is a bit mask built from the readUrl() results and the CLI_STATUS_* constants.
     *
     * @deprecated
     * @codeCoverageIgnore
     */
    public function CLI_run(int $countInARun, int $sleepTime, int $sleepAfterFinish): int
    {
        $status = 0;
        $processedItems = 0;

        // Give the registered (deprecated) hook objects the chance to initialise themselves first
        foreach ($GLOBALS['TYPO3_CONF_VARS']['EXTCONF']['crawler']['cli_hooks'] ?? [] as $hookClassName) {
            trigger_error(
                'This hook (crawler/cli_hooks) is deprecated since 9.1.5 and will be removed when dropping support for TYPO3 9LTS and 10LTS',
                E_USER_DEPRECATED
            );
            $hook = GeneralUtility::makeInstance($hookClassName);
            if (is_object($hook)) {
                $hook->crawler_init($this);
            }
        }

        // Tidy up the queue before picking the next entries
        $this->queueRepository->cleanupQueue();

        $queueItems = $this->queueRepository->fetchRecordsToBeCrawled($countInARun);
        if (empty($queueItems)) {
            // Nothing to crawl: no status bits are set
            return $status;
        }

        $queueIds = [];
        foreach ($queueItems as $queueItem) {
            $queueIds[] = $queueItem['qid'];
        }

        $processId = $this->CLI_buildProcessId();

        // Remember how many queue entries were assigned, so it can be determined later how many were processed
        $assignedItems = $this->queueRepository->updateProcessIdAndSchedulerForQueueIds($queueIds, $processId);
        $this->processRepository->updateProcessAssignItemsCount($assignedItems, $processId);

        if ($assignedItems !== count($queueIds)) {
            // Not every selected entry could be assigned to this process
            return $status | self::CLI_STATUS_ABORTED;
        }

        foreach ($queueItems as $queueItem) {
            $status |= $this->readUrl($queueItem['qid']);
            $processedItems++;

            // Just to relax the system
            usleep($sleepTime);

            // If the crawler has been disabled since this run started, return right away;
            // the process is NOT marked as ended.
            if ($this->crawler->isDisabled()) {
                return $status | self::CLI_STATUS_ABORTED;
            }

            if (! $this->processRepository->isProcessActive($this->CLI_buildProcessId())) {
                // possible timeout
                $this->CLI_debug('conflict / timeout (' . $this->CLI_buildProcessId() . ')');
                $status |= self::CLI_STATUS_ABORTED;
                break;
            }
        }

        sleep($sleepAfterFinish);

        if ($processedItems > 0) {
            $status |= self::CLI_STATUS_PROCESSED;
        }

        return $status;
    }
1512
1513
    /**
     * Instantiates every class registered under the crawler "cli_hooks" configuration
     * and calls crawler_init() on it with this controller.
     *
     * @deprecated
     * @codeCoverageIgnore
     */
    public function CLI_runHooks(): void
    {
        $hookClassNames = $GLOBALS['TYPO3_CONF_VARS']['EXTCONF']['crawler']['cli_hooks'] ?? [];

        foreach ($hookClassNames as $hookClassName) {
            $hook = GeneralUtility::makeInstance($hookClassName);
            if (! is_object($hook)) {
                continue;
            }
            $hook->crawler_init($this);
        }
    }
1527
1528
    /**
     * Tries to register a new process under the given id.
     *
     * Active processes whose TTL lies in the past are treated as orphans: they do not count
     * against the process limit and are marked as not active, with their queue entries released.
     *
     * @param string $id identification string for the process
     * @return bool true if the process was added, false if the limit is reached or no system process id is available
     * @todo preemption might not be the most elegant way to clean up
     * @deprecated
     * @codeCoverageIgnore
     */
    public function CLI_checkAndAcquireNewProcess($id)
    {
        $systemProcessId = getmypid();
        if (! $systemProcessId) {
            return false;
        }

        $activeProcesses = $this->processRepository->findAllActive();
        $now = $this->getCurrentTime();

        $runningProcesses = 0;
        $expiredProcessIds = [];

        /** @var Process $activeProcess */
        foreach ($activeProcesses as $activeProcess) {
            if ($activeProcess->getTtl() < $now) {
                $expiredProcessIds[] = $activeProcess->getProcessId();
                continue;
            }
            $runningProcesses++;
        }

        // Only add a new process while the number of live processes is below the configured limit
        $acquired = $runningProcesses < (int) $this->extensionSettings['processLimit'];
        if ($acquired) {
            $this->processRepository->addProcess($id, $systemProcessId);
        }

        // The clean-up runs regardless of whether a process was added
        $this->processRepository->deleteProcessesMarkedAsDeleted();
        $this->processRepository->markRequestedProcessesAsNotActive($expiredProcessIds);
        $this->queueRepository->unsetProcessScheduledAndProcessIdForQueueEntries($expiredProcessIds);

        return $acquired;
    }
1574
1575
    /**
     * Release a process and the required resources
     *
     * Runs two clean-up statements (releasing queue entries of inactive processes, resetting the
     * system process id of inactive processes that still own unexecuted queue entries) and then
     * marks the given processes as not active and releases their queue entries.
     *
     * Each statement is built with its own QueryBuilder, so no query parts or bound parameters
     * of one statement can leak into the next.
     *
     * @param mixed $releaseIds string with a single process-id or array with multiple process-ids
     * @return bool false if there was nothing to release, true otherwise
     * @deprecated
     * @codeCoverageIgnore
     */
    public function CLI_releaseProcesses($releaseIds)
    {
        if (! is_array($releaseIds)) {
            $releaseIds = [$releaseIds];
        }

        if (empty($releaseIds)) {
            //nothing to release
            return false;
        }

        $connectionPool = GeneralUtility::makeInstance(ConnectionPool::class);

        // some kind of 2nd chance algo - this way you need at least 2 processes to have a real cleanup
        // this ensures that a single process can't mess up the entire process table

        // Release queue entries that are still assigned to a process which is not active
        $connectionPool->getQueryBuilderForTable(QueueRepository::TABLE_NAME)
            ->update(QueueRepository::TABLE_NAME, 'q')
            ->where(
                'q.process_id IN(SELECT p.process_id FROM tx_crawler_process as p WHERE p.active = 0)'
            )
            ->set('q.process_scheduled', 0)
            ->set('q.process_id', '')
            ->execute();

        // Reset the system process id of inactive processes that still have unexecuted queue entries
        $processQueryBuilder = $connectionPool->getQueryBuilderForTable(ProcessRepository::TABLE_NAME);
        $processQueryBuilder
            ->update(ProcessRepository::TABLE_NAME)
            ->where(
                $processQueryBuilder->expr()->eq('active', 0),
                'process_id IN(SELECT q.process_id FROM tx_crawler_queue as q WHERE q.exec_time = 0)'
            )
            ->set('system_process_id', 0)
            ->execute();

        $this->processRepository->markRequestedProcessesAsNotActive($releaseIds);
        $this->queueRepository->unsetProcessScheduledAndProcessIdForQueueEntries($releaseIds);

        return true;
    }
1629
1630
    /**
     * Returns the id of the current process, generating it on first use.
     *
     * The id is a short MD5 of the current microtime and is kept in $this->processID
     * so that later calls return the same value.
     *
     * @return string the ID
     * @deprecated
     * @codeCoverageIgnore
     */
    public function CLI_buildProcessId()
    {
        if ($this->processID) {
            return $this->processID;
        }

        $this->processID = GeneralUtility::shortMD5(microtime(true));

        return $this->processID;
    }
1644
1645
    /**
     * Echoes a message followed by a newline and flushes the output,
     * but only if the "processDebug" extension setting is enabled.
     *
     * @param string $msg the message
     * @deprecated
     * @codeCoverageIgnore
     */
    public function CLI_debug($msg): void
    {
        $debugEnabled = (int) $this->extensionSettings['processDebug'] !== 0;
        if (! $debugEnabled) {
            return;
        }

        echo $msg . "\n";
        flush();
    }
1659
1660
    /**
     * Removes queue entries that stayed in the queue for too long:
     * - executed entries older than the "cleanUpProcessedAge" setting (in days)
     * - entries scheduled earlier than the "cleanUpScheduledAge" setting (in days)
     *
     * @deprecated
     */
    public function cleanUpOldQueueEntries(): void
    {
        // 24*60*60 seconds in 24 hours
        $secondsPerDay = 86400;
        $now = time();

        $processedBefore = $now - $this->extensionSettings['cleanUpProcessedAge'] * $secondsPerDay;
        $scheduledBefore = $now - $this->extensionSettings['cleanUpScheduledAge'] * $secondsPerDay;

        $this->flushQueue(
            '(exec_time<>0 AND exec_time<' . $processedBefore . ') OR scheduled<=' . $scheduledBefore
        );
    }
1677
1678
    /**
     * Removes queue entries
     *
     * Before deleting, the matching entries are grouped by set_id and one
     * SIGNAL_QUEUE_ENTRY_FLUSH signal is emitted per group with the affected rows as payload.
     *
     * Every statement uses its own QueryBuilder instance, so no query parts or bound parameters
     * are carried over between statements; the set_id value is bound as a named parameter
     * instead of being concatenated into the SQL.
     *
     * @param string $where SQL related filter for the entries which should be removed;
     *                      used as raw SQL, an empty value matches all entries
     *
     * @deprecated
     */
    protected function flushQueue($where = ''): void
    {
        $realWhere = strlen((string) $where) > 0 ? $where : '1=1';

        $groups = $this->getQueryBuilder(QueueRepository::TABLE_NAME)
            ->selectLiteral('DISTINCT set_id')
            ->from(QueueRepository::TABLE_NAME)
            ->where($realWhere)
            ->execute()
            ->fetchAll();

        foreach ($groups as $group) {
            $subSetQueryBuilder = $this->getQueryBuilder(QueueRepository::TABLE_NAME);
            $subSet = $subSetQueryBuilder
                ->select('qid', 'set_id')
                ->from(QueueRepository::TABLE_NAME)
                ->where(
                    $realWhere,
                    $subSetQueryBuilder->expr()->eq(
                        'set_id',
                        $subSetQueryBuilder->createNamedParameter($group['set_id'])
                    )
                )
                ->execute()
                ->fetchAll();

            $payLoad = ['subSet' => $subSet];
            SignalSlotUtility::emitSignal(
                self::class,
                SignalSlotUtility::SIGNAL_QUEUE_ENTRY_FLUSH,
                $payLoad
            );
        }

        $this->getQueryBuilder(QueueRepository::TABLE_NAME)
            ->delete(QueueRepository::TABLE_NAME)
            ->where($realWhere)
            ->execute();
    }
1723
1724
    /**
     * Looks up queue entries that duplicate the given one (same page_id and parameters_hash).
     *
     * - Timestamp not in the future: entries scheduled up to the current time match
     *   (with "enableTimeslot" also those scheduled within 100 seconds around it).
     * - Timestamp in the future: only entries scheduled for exactly that timestamp match.
     *
     * @param int $tstamp
     * @param array $fieldArray
     *
     * @return array list of qid values of the matching entries
     * @deprecated
     */
    protected function getDuplicateRowsIfExist($tstamp, $fieldArray)
    {
        $now = $this->getCurrentTime();

        $query = GeneralUtility::makeInstance(ConnectionPool::class)->getQueryBuilderForTable(QueueRepository::TABLE_NAME);
        $query
            ->select('qid')
            ->from(QueueRepository::TABLE_NAME);

        if ($tstamp <= $now) {
            // The entry is scheduled with "now"
            if ($this->extensionSettings['enableTimeslot']) {
                $slotStart = $now - 100;
                $slotEnd = $now + 100;
                $query
                    ->where('scheduled BETWEEN ' . $slotStart . ' AND ' . $slotEnd)
                    ->orWhere($query->expr()->lte('scheduled', $now));
            } else {
                $query->where($query->expr()->lte('scheduled', $now));
            }
        } elseif ($tstamp > $now) {
            // Entries with a timestamp in the future need to have the same schedule time
            $query->where($query->expr()->eq('scheduled', $tstamp));
        }

        // NOTE(review): 'NOT exec_time' / 'NOT process_id' rely on how the database evaluates the
        // column values as booleans - confirm this matches the intended "not executed / not assigned" check.
        $query->andWhere('NOT exec_time');
        $query->andWhere('NOT process_id');

        $samePage = $query->expr()->eq(
            'page_id',
            $query->createNamedParameter($fieldArray['page_id'], PDO::PARAM_INT)
        );
        $query->andWhere($samePage);

        $sameParameters = $query->expr()->eq(
            'parameters_hash',
            $query->createNamedParameter($fieldArray['parameters_hash'], PDO::PARAM_STR)
        );
        $query->andWhere($sameParameters);

        $statement = $query->execute();

        $duplicateIds = [];
        while ($duplicate = $statement->fetch()) {
            $duplicateIds[] = $duplicate['qid'];
        }

        return $duplicateIds;
    }
1785
1786
    /**
     * Returns the md5 hash of the serialized configuration array,
     * ignoring its "paramExpanded" and "URLs" entries.
     *
     * @return string
     */
    protected function getConfigurationHash(array $configuration)
    {
        // $configuration is a by-value copy, so the caller's array stays untouched
        unset($configuration['paramExpanded'], $configuration['URLs']);

        $serialized = serialize($configuration);

        return md5($serialized);
    }
1797
1798
    /**
     * Builds the URL for a page and query string by delegating to a new UrlService instance.
     *
     * @param int $httpsOrHttp see tx_crawler_configuration.force_ssl
     * @throws SiteNotFoundException
     * @throws InvalidRouteArgumentsException
     *
     * @deprecated Using CrawlerController::getUrlFromPageAndQueryParameters() is deprecated since 9.1.1 and will be removed in v11.x, please use UrlService->getUrlFromPageAndQueryParameters() instead.
     * @codeCoverageIgnore
     */
    protected function getUrlFromPageAndQueryParameters(int $pageId, string $queryString, ?string $alternativeBaseUrl, int $httpsOrHttp): UriInterface
    {
        return (new UrlService())->getUrlFromPageAndQueryParameters(
            $pageId,
            $queryString,
            $alternativeBaseUrl,
            $httpsOrHttp
        );
    }
1814
1815
    /**
     * Returns $reg with the values at index 1 and 2 exchanged if the value at index 1 is the larger one.
     *
     * @deprecated
     */
    protected function swapIfFirstIsLargerThanSecond(array $reg): array
    {
        if ($reg[1] > $reg[2]) {
            // Exchange both values so that index 1 holds the smaller one
            [$reg[1], $reg[2]] = [$reg[2], $reg[1]];
        }

        return $reg;
    }
1829
1830 7
    /**
     * Creates a new PageService instance on every call.
     */
    protected function getPageService(): PageService
    {
        $pageService = new PageService();

        return $pageService;
    }
1834
1835
    /**
     * Returns the value of the maximumUrlsToCompile property.
     */
    private function getMaximumUrlsToCompile(): int
    {
        $maximumUrls = $this->maximumUrlsToCompile;

        return $maximumUrls;
    }
1839
1840
    /**
     * Returns the backend user, taking it from $GLOBALS['BE_USER'] on first use.
     *
     * @return BackendUserAuthentication
     */
    private function getBackendUser()
    {
        // Make sure the _cli_ user is loaded
        Bootstrap::initializeBackendAuthentication();

        if ($this->backendUser !== null) {
            return $this->backendUser;
        }

        $this->backendUser = $GLOBALS['BE_USER'];

        return $this->backendUser;
    }
1852
1853
    /**
     * Returns a query builder for the given table, obtained from the connection pool.
     *
     * @return QueryBuilder
     */
    private function getQueryBuilder(string $table)
    {
        $connectionPool = GeneralUtility::makeInstance(ConnectionPool::class);

        return $connectionPool->getQueryBuilderForTable($table);
    }
1862
}
1863