Passed
Pull Request — main (#764)
by
unknown
18:32 queued 03:50
created

CrawlerController::setProcessID()   A

Complexity

Conditions 1
Paths 1

Size

Total Lines 3
Code Lines 1

Duplication

Lines 0
Ratio 0 %

Code Coverage

Tests 2
CRAP Score 1

Importance

Changes 1
Bugs 0 Features 0
Metric Value
cc 1
eloc 1
nc 1
nop 1
dl 0
loc 3
ccs 2
cts 2
cp 1
crap 1
rs 10
c 1
b 0
f 0
1
<?php
2
3
declare(strict_types=1);
4
5
namespace AOE\Crawler\Controller;
6
7
/***************************************************************
8
 *  Copyright notice
9
 *
10
 *  (c) 2020 AOE GmbH <[email protected]>
11
 *
12
 *  All rights reserved
13
 *
14
 *  This script is part of the TYPO3 project. The TYPO3 project is
15
 *  free software; you can redistribute it and/or modify
16
 *  it under the terms of the GNU General Public License as published by
17
 *  the Free Software Foundation; either version 3 of the License, or
18
 *  (at your option) any later version.
19
 *
20
 *  The GNU General Public License can be found at
21
 *  http://www.gnu.org/copyleft/gpl.html.
22
 *
23
 *  This script is distributed in the hope that it will be useful,
24
 *  but WITHOUT ANY WARRANTY; without even the implied warranty of
25
 *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
26
 *  GNU General Public License for more details.
27
 *
28
 *  This copyright notice MUST APPEAR in all copies of the script!
29
 ***************************************************************/
30
31
use AOE\Crawler\Configuration\ExtensionConfigurationProvider;
32
use AOE\Crawler\Converter\JsonCompatibilityConverter;
33
use AOE\Crawler\Crawler;
34
use AOE\Crawler\CrawlStrategy\CrawlStrategyFactory;
35
use AOE\Crawler\Domain\Model\Process;
36
use AOE\Crawler\Domain\Repository\ConfigurationRepository;
37
use AOE\Crawler\Domain\Repository\ProcessRepository;
38
use AOE\Crawler\Domain\Repository\QueueRepository;
39
use AOE\Crawler\QueueExecutor;
40
use AOE\Crawler\Service\ConfigurationService;
41
use AOE\Crawler\Service\PageService;
42
use AOE\Crawler\Service\UrlService;
43
use AOE\Crawler\Utility\SignalSlotUtility;
44
use AOE\Crawler\Value\QueueFilter;
45
use PDO;
46
use Psr\Http\Message\UriInterface;
47
use Psr\Log\LoggerAwareInterface;
48
use Psr\Log\LoggerAwareTrait;
49
use TYPO3\CMS\Backend\Tree\View\PageTreeView;
50
use TYPO3\CMS\Backend\Utility\BackendUtility;
51
use TYPO3\CMS\Core\Authentication\BackendUserAuthentication;
52
use TYPO3\CMS\Core\Compatibility\PublicMethodDeprecationTrait;
53
use TYPO3\CMS\Core\Compatibility\PublicPropertyDeprecationTrait;
54
use TYPO3\CMS\Core\Core\Bootstrap;
55
use TYPO3\CMS\Core\Core\Environment;
56
use TYPO3\CMS\Core\Database\Connection;
57
use TYPO3\CMS\Core\Database\ConnectionPool;
58
use TYPO3\CMS\Core\Database\Query\QueryBuilder;
59
use TYPO3\CMS\Core\Database\Query\Restriction\DeletedRestriction;
60
use TYPO3\CMS\Core\Database\QueryGenerator;
61
use TYPO3\CMS\Core\Domain\Repository\PageRepository;
62
use TYPO3\CMS\Core\Exception\SiteNotFoundException;
63
use TYPO3\CMS\Core\Imaging\Icon;
64
use TYPO3\CMS\Core\Imaging\IconFactory;
65
use TYPO3\CMS\Core\Routing\InvalidRouteArgumentsException;
66
use TYPO3\CMS\Core\Site\Entity\Site;
67
use TYPO3\CMS\Core\Type\Bitmask\Permission;
68
use TYPO3\CMS\Core\Utility\DebugUtility;
69
use TYPO3\CMS\Core\Utility\GeneralUtility;
70
use TYPO3\CMS\Core\Utility\MathUtility;
71
use TYPO3\CMS\Extbase\Object\ObjectManager;
72
73
/**
74
 * Class CrawlerController
75
 *
76
 * @package AOE\Crawler\Controller
77
 */
78
class CrawlerController implements LoggerAwareInterface
79
{
80
    use LoggerAwareTrait;
81
    use PublicMethodDeprecationTrait;
82
    use PublicPropertyDeprecationTrait;
83
84
    /**
85
     * @deprecated since 9.2.5 will be removed in v11.x
86
     */
87
    public const CLI_STATUS_NOTHING_PROCCESSED = 0;
88
89
    /**
90
     * queue not empty
91
     * @deprecated since 9.2.5 will be removed in v11.x
92
     */
93
    public const CLI_STATUS_REMAIN = 1;
94
95
    /**
96
     * (some) queue items were processed
97
     * @deprecated since 9.2.5 will be removed in v11.x
98
     */
99
    public const CLI_STATUS_PROCESSED = 2;
100
101
    /**
102
     * instance didn't finish
103
     * @deprecated since 9.2.5 will be removed in v11.x
104
     */
105
    public const CLI_STATUS_ABORTED = 4;
106
107
    /**
108
     * @deprecated since 9.2.5 will be removed in v11.x
109
     */
110
    public const CLI_STATUS_POLLABLE_PROCESSED = 8;
111
112
    /**
113
     * @var integer
114
     */
115
    public $setID = 0;
116
117
    /**
118
     * @var string
119
     */
120
    public $processID = '';
121
122
    /**
123
     * @var array
124
     */
125
    public $duplicateTrack = [];
126
127
    /**
128
     * @var array
129
     */
130
    public $downloadUrls = [];
131
132
    /**
133
     * @var array
134
     */
135
    public $incomingProcInstructions = [];
136
137
    /**
138
     * @var array
139
     */
140
    public $incomingConfigurationSelection = [];
141
142
    /**
143
     * @var bool
144
     */
145
    public $registerQueueEntriesInternallyOnly = false;
146
147
    /**
148
     * @var array
149
     */
150
    public $queueEntries = [];
151
152
    /**
153
     * @var array
154
     */
155
    public $urlList = [];
156
157
    /**
158
     * @var array
159
     */
160
    public $extensionSettings = [];
161
162
    /**
163
     * Mount Point
164
     *
165
     * @var bool
166
     * Todo: Check what this is used for and adjust the type hint or code, as bool doesn't match the current code.
167
     */
168
    public $MP = false;
169
170
    /**
171
     * @var string
172
     * @deprecated
173
     */
174
    protected $processFilename;
175
176
    /**
177
     * Holds the internal access mode; can be 'gui', 'cli' or 'cli_im'
178
     *
179
     * @var string
180
     * @deprecated
181
     */
182
    protected $accessMode;
183
184
    /**
185
     * @var QueueRepository
186
     */
187
    protected $queueRepository;
188
189
    /**
190
     * @var ProcessRepository
191
     */
192
    protected $processRepository;
193
194
    /**
195
     * @var ConfigurationRepository
196
     */
197
    protected $configurationRepository;
198
199
    /**
200
     * @var QueueExecutor
201
     */
202
    protected $queueExecutor;
203
204
    /**
205
     * @var int
206
     */
207
    protected $maximumUrlsToCompile = 10000;
208
209
    /**
210
     * @var IconFactory
211
     */
212
    protected $iconFactory;
213
214
    /**
     * Deprecation messages keyed by method name.
     *
     * NOTE(review): presumably consumed by PublicMethodDeprecationTrait (used by this class),
     * which is why static analysis reports the property as unused - confirm against the trait.
     *
     * @var string[]
     */
    private $deprecatedPublicMethods = [
        'compileUrls' => 'Using CrawlerController->compileUrls() is deprecated since 9.2.5, and will be removed in v11.x',
        'cleanUpOldQueueEntries' => 'Using CrawlerController::cleanUpOldQueueEntries() is deprecated since 9.0.1 and will be removed in v11.x, please use QueueRepository->cleanUpOldQueueEntries() instead.',
        'CLI_buildProcessId' => 'Using CrawlerController->CLI_buildProcessId() is deprecated since 9.2.5 and will be removed in v11.x',
        'CLI_checkAndAcquireNewProcess' => 'Using CrawlerController->CLI_checkAndAcquireNewProcess() is deprecated since 9.2.5 and will be removed in v11.x',
        'CLI_debug' => 'Using CrawlerController->CLI_debug() is deprecated since 9.1.3 and will be removed in v11.x',
        'CLI_releaseProcesses' => 'Using CrawlerController->CLI_releaseProcesses() is deprecated since 9.2.2 and will be removed in v11.x',
        'CLI_run' => 'Using CrawlerController->CLI_run() is deprecated since 9.2.2 and will be removed in v11.x',
        'CLI_runHooks' => 'Using CrawlerController->CLI_runHooks() is deprecated since 9.1.5 and will be removed in v11.x',
        'expandExcludeString' => 'Using CrawlerController->expandExcludeString() is deprecated since 9.2.5 and will be removed in v11.x',
        'getAccessMode' => 'Using CrawlerController->getAccessMode() is deprecated since 9.1.3 and will be removed in v11.x',
        'getLogEntriesForPageId' => 'Using CrawlerController->getLogEntriesForPageId() is deprecated since 9.1.5 and will be remove in v11.x',
        'getLogEntriesForSetId' => 'Using crawlerController::getLogEntriesForSetId() is deprecated since 9.0.1 and will be removed in v11.x',
        // The message previously named getLogEntriesForPageId() (copy-paste slip); it must name the method it is keyed by.
        'hasGroupAccess' => 'Using CrawlerController->hasGroupAccess() is deprecated since 9.2.2 and will be remove in v11.x, please use UserService::hasGroupAccess() instead.',
        'flushQueue' => 'Using CrawlerController::flushQueue() is deprecated since 9.0.1 and will be removed in v11.x, please use QueueRepository->flushQueue() instead.',
        'setAccessMode' => 'Using CrawlerController->setAccessMode() is deprecated since 9.1.3 and will be removed in v11.x',
        'getDisabled' => 'Using CrawlerController->getDisabled() is deprecated since 9.1.3 and will be removed in v11.x, please use Crawler->isDisabled() instead',
        'setDisabled' => 'Using CrawlerController->setDisabled() is deprecated since 9.1.3 and will be removed in v11.x, please use Crawler->setDisabled() instead',
        'getProcessFilename' => 'Using CrawlerController->getProcessFilename() is deprecated since 9.1.3 and will be removed in v11.x',
        'setProcessFilename' => 'Using CrawlerController->setProcessFilename() is deprecated since 9.1.3 and will be removed in v11.x',
        'getDuplicateRowsIfExist' => 'Using CrawlerController->getDuplicateRowsIfExist() is deprecated since 9.1.4 and will be remove in v11.x, please use QueueRepository->getDuplicateQueueItemsIfExists() instead',
        'checkIfPageShouldBeSkipped' => 'Using CrawlerController->checkIfPageShouldBeSkipped() is deprecated since 9.2.5 and will be removed in v11.x',
        'swapIfFirstIsLargerThanSecond' => 'Using CrawlerController->swapIfFirstIsLargerThanSecond() is deprecated since 9.2.5, and will be removed in v11.x',
        'expandParameters' => 'Using CrawlerController->expandParameters() is deprecated since 9.2.5, and will be removed in v11.x',
    ];
242
243
    /**
     * Deprecation messages keyed by property name.
     *
     * NOTE(review): presumably consumed by PublicPropertyDeprecationTrait (used by this class) - confirm against the trait.
     *
     * @var string[]
     */
    private $deprecatedPublicProperties = [
        'accessMode' => 'Using CrawlerController->accessMode is deprecated since 9.1.3 and will be removed in v11.x',
        // The message previously referred to ->accessMode (copy-paste slip); it must name the property it is keyed by.
        'processFilename' => 'Using CrawlerController->processFilename is deprecated since 9.1.3 and will be removed in v11.x',
    ];
250
251
    /**
252
     * @var BackendUserAuthentication|null
253
     */
254
    private $backendUser;
255
256
    /**
257
     * @var integer
258
     */
259
    private $scheduledTime = 0;
260
261
    /**
262
     * @var integer
263
     */
264
    private $reqMinute = 0;
265
266
    /**
267
     * @var bool
268
     */
269
    private $submitCrawlUrls = false;
270
271
    /**
272
     * @var bool
273
     */
274
    private $downloadCrawlUrls = false;
275
276
    /**
277
     * @var PageRepository
278
     */
279
    private $pageRepository;
280
281
    /**
282
     * @var Crawler
283
     */
284
    private $crawler;
285
286
    /**
287
     * @var ConfigurationService
288
     */
289
    private $configurationService;
290
291
    /**
292
     * @var UrlService
293
     */
294
    private $urlService;
295
296
    /************************************
297
     *
298
     * Getting URLs based on Page TSconfig
299
     *
300
     ************************************/
301
302 41
    /**
     * Wires up the repositories and services this controller depends on, sets the
     * default process lock file path and loads the extension settings.
     *
     * The settings 'countInARun', 'processLimit' and 'maxCompileUrls' are normalized
     * here; each is read with a fallback because the configuration provider result is
     * replaced by an empty array when it is not an array, in which case the keys do not exist.
     */
    public function __construct()
    {
        $objectManager = GeneralUtility::makeInstance(ObjectManager::class);
        $crawlStrategyFactory = GeneralUtility::makeInstance(CrawlStrategyFactory::class);
        $this->queueRepository = $objectManager->get(QueueRepository::class);
        $this->processRepository = $objectManager->get(ProcessRepository::class);
        $this->configurationRepository = $objectManager->get(ConfigurationRepository::class);
        $this->pageRepository = GeneralUtility::makeInstance(PageRepository::class);
        $this->queueExecutor = GeneralUtility::makeInstance(QueueExecutor::class, $crawlStrategyFactory);
        $this->iconFactory = GeneralUtility::makeInstance(IconFactory::class);
        $this->crawler = GeneralUtility::makeInstance(Crawler::class);
        $this->configurationService = GeneralUtility::makeInstance(ConfigurationService::class);
        $this->urlService = GeneralUtility::makeInstance(UrlService::class);

        $this->processFilename = Environment::getVarPath() . '/lock/tx_crawler.proc';

        /** @var ExtensionConfigurationProvider $configurationProvider */
        $configurationProvider = GeneralUtility::makeInstance(ExtensionConfigurationProvider::class);
        $settings = $configurationProvider->getExtensionConfiguration();
        $this->extensionSettings = is_array($settings) ? $settings : [];

        // A missing or non-positive 'countInARun' falls back to 100.
        if (MathUtility::convertToPositiveInteger($this->extensionSettings['countInARun'] ?? 0) === 0) {
            $this->extensionSettings['countInARun'] = 100;
        }

        // The fallbacks equal the defaults handed to forceIntegerInRange(), so results are unchanged for present keys.
        $this->extensionSettings['processLimit'] = MathUtility::forceIntegerInRange(
            $this->extensionSettings['processLimit'] ?? 1,
            1,
            99,
            1
        );
        $this->setMaximumUrlsToCompile(
            MathUtility::forceIntegerInRange($this->extensionSettings['maxCompileUrls'] ?? 10000, 1, 1000000000, 10000)
        );
    }
330
331 41
    /**
     * Stores the maximum number of URLs to compile.
     *
     * @param int $maximumUrlsToCompile New value for the internal limit
     */
    public function setMaximumUrlsToCompile(int $maximumUrlsToCompile): void
    {
        $this->maximumUrlsToCompile = $maximumUrlsToCompile;
    }
335
336
    /**
     * Returns the currently stored access mode.
     *
     * @return string The stored access mode
     * @deprecated
     */
    public function getAccessMode()
    {
        return $this->accessMode;
    }
346
347
    /**
     * Stores the access mode.
     *
     * @param string $accessMode Access mode to store
     * @deprecated
     */
    public function setAccessMode($accessMode): void
    {
        $this->accessMode = $accessMode;
    }
355
356
    /**
     * Toggles the "disabled" marker file at the process filename.
     *
     * A truthy value writes the marker file with the content 'disabled';
     * any other value (false or null) removes the file when it exists.
     *
     * @param bool|null $disabled Whether processing should be disabled
     * @deprecated
     */
    public function setDisabled(?bool $disabled = true): void
    {
        if ($disabled) {
            GeneralUtility::writeFile($this->processFilename, 'disabled');
            return;
        }

        if (is_file($this->processFilename)) {
            unlink($this->processFilename);
        }
    }
368
369
    /**
     * Reports the disabled status.
     *
     * @return bool TRUE when a file exists at the process filename
     * @deprecated
     */
    public function getDisabled(): bool
    {
        $markerFile = $this->processFilename;

        return is_file($markerFile);
    }
377
378
    /**
     * Overrides the path of the process marker file.
     *
     * @param string $filenameWithPath File name including its path
     * @deprecated
     */
    public function setProcessFilename($filenameWithPath): void
    {
        $this->processFilename = $filenameWithPath;
    }
386
387
    /**
     * Stores the process id on the public $processID property.
     *
     * @param string $processID Process id to store
     * @deprecated
     */
    public function setProcessID($processID): void
    {
        $this->processID = $processID;
    }
395
396
    /**
     * Returns the path of the process marker file.
     *
     * @return string The stored process filename
     * @deprecated
     */
    public function getProcessFilename()
    {
        return $this->processFilename;
    }
404
405
    /**
     * Replaces the extension settings (unserialized pendant of
     * $TYPO3_CONF_VARS['EXT']['extConf']['crawler']) with the given array.
     *
     * @param array $extensionSettings Complete settings array; it is stored as given
     */
    public function setExtensionSettings(array $extensionSettings): void
    {
        $this->extensionSettings = $extensionSettings;
    }
412
413
    /**
     * Checks whether the given page should be crawled by delegating to
     * PageService::checkIfPageShouldBeSkipped().
     *
     * @param array $pageRow Page record to check
     * @return false|string false if the page should be crawled (not excluded), true / skipMessage if it should be skipped
     * @deprecated
     */
    public function checkIfPageShouldBeSkipped(array $pageRow)
    {
        return GeneralUtility::makeInstance(PageService::class)->checkIfPageShouldBeSkipped($pageRow);
    }
424
425 9
    /**
     * Wrapper around getUrlsForPageId() that first validates the page row.
     * It returns an array of configurations and no urls!
     *
     * $skipMessage is set to an explanation when nothing is returned because the
     * uid is not an integer or the page service asks to skip the page; it is reset
     * to an empty string when configurations were fetched.
     *
     * @param array $pageRow Page record with at least dok-type and uid columns.
     * @param string $skipMessage Filled by reference, see above
     * @return array
     * @see getUrlsForPageId()
     */
    public function getUrlsForPageRow(array $pageRow, &$skipMessage = '')
    {
        if (! is_int($pageRow['uid'])) {
            $skipMessage = 'PageUid ' . $pageRow['uid'] . ' was not an integer';
            return [];
        }

        $skipReason = $this->getPageService()->checkIfPageShouldBeSkipped($pageRow);
        if ($skipReason !== false) {
            $skipMessage = $skipReason;
            return [];
        }

        $configurations = $this->getUrlsForPageId($pageRow['uid']);
        $skipMessage = '';

        return $configurations;
    }
452
453
    /**
     * Creates a list of URLs from input array (and submits them to queue if asked for)
     * See Web > Info module script + "indexed_search"'s crawler hook-client using this!
     *
     * URLs whose uniqueness key (url|userGroups|procInstrFilter) is already present in
     * $duplicateTrack are only listed (muted) and not submitted again.
     *
     * @param array $vv Information about URLs from pageRow to crawl.
     * @param array $pageRow Page row
     * @param int $scheduledTime Unix time to schedule indexing to, typically time()
     * @param int $reqMinute Number of requests per minute (creates the interleave between requests)
     * @param bool $submitCrawlUrls If set, submits the URLs to queue
     * @param bool $downloadCrawlUrls If set (and submitcrawlUrls is false) will fill $downloadUrls with entries)
     * @param array $duplicateTrack Array which is passed by reference and contains the an id per url to secure we will not crawl duplicates
     * @param array $downloadUrls Array which will be filled with URLS for download if flag is set.
     * @param array $incomingProcInstructions Array of processing instructions
     * @return string List of URLs (meant for display in backend module)
     */
    public function urlListFromUrlArray(
        array $vv,
        array $pageRow,
        $scheduledTime,
        $reqMinute,
        $submitCrawlUrls,
        $downloadCrawlUrls,
        array &$duplicateTrack,
        array &$downloadUrls,
        array $incomingProcInstructions
    ) {
        if (! is_array($vv['URLs'] ?? null)) {
            return 'ERROR - no URL generated';
        }

        $urlLog = [];
        $pageId = (int) $pageRow['uid'];
        $configurationHash = $this->getConfigurationHash($vv);
        $skipInnerCheck = $this->queueRepository->noUnprocessedQueueEntriesForPageWithConfigurationHashExist($pageId, $configurationHash);

        // Read the sub configuration defensively; missing keys must not raise notices.
        $subCfg = is_array($vv['subCfg'] ?? null) ? $vv['subCfg'] : [];
        $userGroups = (string) ($subCfg['userGroups'] ?? '');
        $procInstrFilter = (string) ($subCfg['procInstrFilter'] ?? '');

        // The filter result does not depend on the URL, so one check covers the whole list.
        if (! $this->drawURLs_PIfilter($procInstrFilter, $incomingProcInstructions)) {
            return '';
        }

        // Interleave between requests; no interleave when no positive rate is given (avoids a division by zero).
        $secondsPerRequest = $reqMinute > 0 ? 60 / $reqMinute : 0;

        $urlService = new UrlService();

        foreach ($vv['URLs'] as $urlQuery) {
            $url = (string) $urlService->getUrlFromPageAndQueryParameters(
                $pageId,
                $urlQuery,
                $subCfg['baseUrl'] ?? null,
                $subCfg['force_ssl'] ?? 0
            );

            // Create key by which to determine unique-ness:
            $uKey = $url . '|' . $userGroups . '|' . $procInstrFilter;

            if (isset($duplicateTrack[$uKey])) {
                // If the url key is registered just display it and do not resubmit it
                $urlLog[] = '<em><span class="text-muted">' . htmlspecialchars($url) . '</span></em>';
            } else {
                // Scheduled time, rounded down to the full minute:
                $schTime = $scheduledTime + round(count($duplicateTrack) * $secondsPerRequest);
                $schTime = intval($schTime / 60) * 60;
                $formattedDate = BackendUtility::datetime($schTime);
                $this->urlList[] = '[' . $formattedDate . '] ' . $url;
                $urlList = '[' . $formattedDate . '] ' . htmlspecialchars($url);

                // Submit for crawling!
                if ($submitCrawlUrls) {
                    // NOTE(review): the queue entry is added with $scheduledTime while the displayed
                    // time is the interleaved $schTime - confirm whether this is intended.
                    $added = $this->addUrl(
                        $pageId,
                        $url,
                        $subCfg,
                        $scheduledTime,
                        $configurationHash,
                        $skipInnerCheck
                    );
                    if ($added === false) {
                        $urlList .= ' (URL already existed)';
                    }
                } elseif ($downloadCrawlUrls) {
                    $downloadUrls[$url] = $url;
                }
                $urlLog[] = $urlList;
            }
            $duplicateTrack[$uKey] = true;
        }

        return implode('<br>', $urlLog);
    }
537
538 8
    /**
     * Tells whether the given processing instruction list contains at least one of
     * the incoming processing instructions. An empty set of incoming instructions
     * always passes.
     *
     * @param string $piString PI to test
     * @param array $incomingProcInstructions Processing instructions
     * @return boolean
     */
    public function drawURLs_PIfilter($piString, array $incomingProcInstructions)
    {
        if ($incomingProcInstructions === []) {
            return true;
        }

        foreach ($incomingProcInstructions as $incomingInstruction) {
            if (! GeneralUtility::inList($piString, $incomingInstruction)) {
                continue;
            }
            return true;
        }

        return false;
    }
558
559
    /**
     * Returns the page TSconfig for the given page, or for the mount point page when
     * $this->MP is set, after passing it through the 'getPageTSconfigForId' hooks.
     *
     * Hooks registered in $GLOBALS['TYPO3_CONF_VARS']['EXTCONF']['crawler']['getPageTSconfigForId']
     * receive 'pageId' and a reference to 'pageTSConfig' and may alter the configuration.
     *
     * @param int $id Page id
     * @return array Page TSconfig, possibly altered by hooks
     */
    public function getPageTSconfigForId(int $id): array
    {
        if (! $this->MP) {
            $pageTSconfig = BackendUtility::getPagesTSconfig($id);
        } else {
            // $this->MP is declared as bool but handled as a "<prefix>-<pageId>" string here.
            // The explicit cast keeps explode() from throwing under strict_types, and a
            // missing second part resolves to 0 instead of an undefined offset.
            // TODO: Please check the intended type of $this->MP.
            $mountPointParts = explode('-', (string) $this->MP);
            $mountPointId = (int) ($mountPointParts[1] ?? 0);
            $pageTSconfig = BackendUtility::getPagesTSconfig($mountPointId);
        }

        // Call a hook to alter configuration
        $hooks = $GLOBALS['TYPO3_CONF_VARS']['EXTCONF']['crawler']['getPageTSconfigForId'] ?? null;
        if (is_array($hooks)) {
            $params = [
                'pageId' => $id,
                'pageTSConfig' => &$pageTSconfig,
            ];
            foreach ($hooks as $userFunc) {
                GeneralUtility::callUserFunction($userFunc, $params, $this);
            }
        }

        return $pageTSconfig;
    }
581
582 7
    /**
     * Collects the crawler configurations that apply to a page.
     * Adds no urls!
     *
     * Configurations come from the page TSconfig and from the configuration service's
     * database lookup; hooks registered in
     * $GLOBALS['TYPO3_CONF_VARS']['EXTCONF']['crawler']['processUrls'] receive the
     * result by reference under the key 'res'.
     *
     * @param int $pageId Page id
     * @return array Configurations
     */
    public function getUrlsForPageId(int $pageId): array
    {
        // The page TSconfig is fetched before $this->MP is read, as hooks run during the fetch receive $this.
        $tsConfig = $this->getPageTSconfigForId($pageId);
        $mountPoint = '';
        if (is_string($this->MP)) {
            $mountPoint = $this->MP;
        }

        $configurations = [];
        // Fetch Crawler Configuration from pageTSConfig
        $configurations = $this->configurationService->getConfigurationFromPageTS($tsConfig, $pageId, $configurations, $mountPoint);
        // Get configuration from tx_crawler_configuration records up the rootline
        $configurations = $this->configurationService->getConfigurationFromDatabase($pageId, $configurations);

        $hooks = $GLOBALS['TYPO3_CONF_VARS']['EXTCONF']['crawler']['processUrls'] ?? [];
        foreach ($hooks as $hook) {
            $hookParameters = [
                'res' => &$configurations,
            ];
            GeneralUtility::callUserFunction($hook, $hookParameters, $this);
        }

        return $configurations;
    }
609 2
610 2
    /**
     * Find all configurations of subpages of a page
     * TODO: Write Functional Tests
     *
     * The result combines the names of the 'paramSets.' entries found in the page TSconfig
     * of the root page (a trailing dot is stripped) with the names of the configuration
     * records returned by the configuration repository for the rootline and page tree uids.
     *
     * @param int $rootid Uid of the page the branch starts at
     * @param int $depth Depth passed to the page tree lookup
     * @return array Configuration names
     */
    public function getConfigurationsForBranch(int $rootid, int $depth): array
    {
        $configurationsForBranch = [];
        $pageTSconfig = $this->getPageTSconfigForId($rootid);
        $sets = $pageTSconfig['tx_crawler.']['crawlerCfg.']['paramSets.'] ?? [];
        if (! is_array($sets)) {
            // A scalar 'paramSets.' value holds no sets to iterate.
            $sets = [];
        }
        foreach ($sets as $key => $value) {
            if (! is_array($value)) {
                continue;
            }
            // Array keys may be integers; substr() requires a string under strict_types.
            $name = (string) $key;
            $configurationsForBranch[] = substr($name, -1) === '.' ? substr($name, 0, -1) : $name;
        }

        $pids = [];
        $rootLine = BackendUtility::BEgetRootLine($rootid);
        foreach ($rootLine as $node) {
            $pids[] = $node['uid'];
        }

        /* @var PageTreeView $tree */
        $tree = GeneralUtility::makeInstance(PageTreeView::class);
        $perms_clause = $this->getBackendUser()->getPagePermsClause(Permission::PAGE_SHOW);
        $tree->init(empty($perms_clause) ? '' : ('AND ' . $perms_clause));
        $tree->getTree($rootid, $depth, '');
        foreach ($tree->tree as $node) {
            $pids[] = $node['row']['uid'];
        }

        $configurations = $this->configurationRepository->getCrawlerConfigurationRecordsFromRootLine($rootid, $pids);

        foreach ($configurations as $configuration) {
            $configurationsForBranch[] = $configuration['name'];
        }

        return $configurationsForBranch;
    }
646
647
    /**
     * Check if a user has access to an item
     * (e.g. get the group list of the current logged in user from $GLOBALS['TSFE']->gr_list)
     *
     * @param string $groupList Comma-separated list of (fe_)group UIDs from a user
     * @param string $accessList Comma-separated list of (fe_)group UIDs of the item to access
     * @return bool TRUE if at least one of the users group UIDs is in the access list or the access list is empty
     * @see \TYPO3\CMS\Frontend\Page\PageRepository::getMultipleGroupsWhereClause()
     * @deprecated
     * @codeCoverageIgnore
     */
    public function hasGroupAccess($groupList, $accessList)
    {
        // An empty access list grants access to everybody.
        if (empty($accessList)) {
            return true;
        }

        $userGroupUids = GeneralUtility::intExplode(',', $groupList);
        foreach ($userGroupUids as $userGroupUid) {
            if (GeneralUtility::inList($accessList, $userGroupUid)) {
                return true;
            }
        }

        return false;
    }
670
671
    /**
     * Expands the parameter configuration to individual values. This follows a certain syntax of the value of each parameter.
     *
     * Syntax of values:
     * - Basically: If the value is wrapped in [...] it will be expanded according to the following syntax, otherwise the value is taken literally
     * - Configuration is split by "|" and the parts are processed individually and finally added together
     * - For each configuration part:
     *         - "[int]-[int]" = Integer range, will be expanded to all values in between, values included, starting from low to high (max. 1000). Example "1-34" or "-40--30"
     *         - "_TABLE:[TCA table name];[_PID:[optional page id, default is current page]];[_ENABLELANG:1]" = Look up of table records from PID, filtering out deleted records. Example "_TABLE:tt_content; _PID:123"
     *           Further optional sub-parameters read by this method:
     *             _RECURSIVE:[depth]   look up records in the page tree below the PID as well
     *             _PIDFIELD:[field]    field holding the page id (default "pid")
     *             _FIELD:[field]       field whose values are returned (default "uid", otherwise it must be a TCA column)
     *             _WHERE:[sql]         additional constraint, passed to the query as-is
     *             _ADDTABLE:[tables]   additional FROM part, passed to the query as-is
     *             _ENABLELANG:1        picks only original records without their language overlays
     *         - Default: Literal value
     *
     * After each part, the functions registered in
     * $GLOBALS['TYPO3_CONF_VARS']['SC_OPTIONS']['crawler/class.tx_crawler_lib.php']['expandParameters']
     * are called and may modify the parameter array by reference.
     *
     * @param array $paramArray Array with key (GET var name) and values (value of GET var which is configuration for expansion)
     * @param integer $pid Current page ID
     * @return array Same keys as the input, each holding an array of expanded values
     * @deprecated
     * @codeCoverageIgnore
     */
    public function expandParameters($paramArray, $pid)
    {
        // Traverse parameter names:
        foreach ($paramArray as $p => $v) {
            $v = trim($v);

            // If value is encapsulated in square brackets it means there are some ranges of values to find, otherwise the value is literal
            if (strpos($v, '[') === 0 && substr($v, -1) === ']') {
                // So, find the value inside brackets and reset the paramArray value as an array.
                $v = substr($v, 1, -1);
                $paramArray[$p] = [];

                // Explode parts and traverse them:
                $parts = explode('|', $v);
                foreach ($parts as $pV) {
                    // Look for integer range: (fx. 1-34 or -40--30 // reads minus 40 to minus 30)
                    if (preg_match('/^(-?[0-9]+)\s*-\s*(-?[0-9]+)$/', trim($pV), $reg)) {
                        $reg = $this->swapIfFirstIsLargerThanSecond($reg);

                        // Traverse range, add values:
                        // Limit to size of range!
                        $runAwayBrake = 1000;
                        for ($a = $reg[1]; $a <= $reg[2]; $a++) {
                            $paramArray[$p][] = $a;
                            $runAwayBrake--;
                            if ($runAwayBrake <= 0) {
                                break;
                            }
                        }
                    } elseif (strpos(trim($pV), '_TABLE:') === 0) {
                        // Parse parameters:
                        $subparts = GeneralUtility::trimExplode(';', $pV);
                        $subpartParams = [];
                        foreach ($subparts as $spV) {
                            // Split on the FIRST colon only, so values that contain ':' themselves (e.g. in _WHERE)
                            // are kept intact; pad so that a sub-part without any colon yields a NULL value
                            // instead of an undefined offset.
                            [$pKey, $pVal] = array_pad(GeneralUtility::trimExplode(':', $spV, false, 2), 2, null);
                            $subpartParams[$pKey] = $pVal;
                        }

                        // Table exists:
                        if (isset($GLOBALS['TCA'][$subpartParams['_TABLE']])) {
                            $tableName = $subpartParams['_TABLE'];
                            $lookUpPid = isset($subpartParams['_PID']) ? intval($subpartParams['_PID']) : intval($pid);
                            $recursiveDepth = isset($subpartParams['_RECURSIVE']) ? intval($subpartParams['_RECURSIVE']) : 0;
                            $pidField = isset($subpartParams['_PIDFIELD']) ? trim($subpartParams['_PIDFIELD']) : 'pid';
                            // NOTE(review): _WHERE and _ADDTABLE are handed to the query as raw SQL fragments;
                            // they must only ever come from trusted configuration.
                            $where = $subpartParams['_WHERE'] ?? '';
                            $addTable = $subpartParams['_ADDTABLE'] ?? '';

                            // _FIELD is optional; an absent or empty value falls back to "uid".
                            $fieldName = ! empty($subpartParams['_FIELD']) ? $subpartParams['_FIELD'] : 'uid';
                            if ($fieldName === 'uid' || ! empty($GLOBALS['TCA'][$tableName]['columns'][$fieldName])) {
                                $queryBuilder = $this->getQueryBuilder($tableName);

                                if ($recursiveDepth > 0) {
                                    /** @var QueryGenerator $queryGenerator */
                                    $queryGenerator = GeneralUtility::makeInstance(QueryGenerator::class);
                                    $pidList = $queryGenerator->getTreeList($lookUpPid, $recursiveDepth, 0, 1);
                                    $pidArray = GeneralUtility::intExplode(',', $pidList);
                                } else {
                                    $pidArray = [(string) $lookUpPid];
                                }

                                // Only deleted records are filtered out; all other default restrictions are removed.
                                $queryBuilder->getRestrictions()
                                    ->removeAll()
                                    ->add(GeneralUtility::makeInstance(DeletedRestriction::class));

                                $queryBuilder
                                    ->select($fieldName)
                                    ->from($tableName)
                                    ->where(
                                        $queryBuilder->expr()->in($pidField, $queryBuilder->createNamedParameter($pidArray, Connection::PARAM_INT_ARRAY)),
                                        $where
                                    );

                                if (! empty($addTable)) {
                                    // TODO: Check if this works as intended!
                                    $queryBuilder->add('from', $addTable);
                                }

                                // Not every table is translatable, so the ctrl key may be missing.
                                $transOrigPointerField = $GLOBALS['TCA'][$tableName]['ctrl']['transOrigPointerField'] ?? '';

                                if (! empty($subpartParams['_ENABLELANG']) && ! empty($transOrigPointerField)) {
                                    $queryBuilder->andWhere(
                                        $queryBuilder->expr()->lte(
                                            $transOrigPointerField,
                                            0
                                        )
                                    );
                                }

                                $statement = $queryBuilder->execute();

                                // Index by field value so that duplicate values collapse into one entry.
                                $rows = [];
                                while ($row = $statement->fetch()) {
                                    $rows[$row[$fieldName]] = $row;
                                }

                                $paramArray[$p] = array_merge($paramArray[$p], array_keys($rows));
                            }
                        }
                    } else {
                        // Just add value:
                        $paramArray[$p][] = $pV;
                    }

                    // Hook for processing own expandParameters place holder
                    $expandParametersHooks = $GLOBALS['TYPO3_CONF_VARS']['SC_OPTIONS']['crawler/class.tx_crawler_lib.php']['expandParameters'] ?? null;
                    if (is_array($expandParametersHooks)) {
                        $_params = [
                            'pObj' => &$this,
                            'paramArray' => &$paramArray,
                            'currentKey' => $p,
                            'currentValue' => $pV,
                            'pid' => $pid,
                        ];
                        foreach ($expandParametersHooks as $_funcRef) {
                            GeneralUtility::callUserFunction($_funcRef, $_params, $this);
                        }
                    }
                }

                // Make unique set of values and sort the parameter array by key (GET var name):
                $paramArray[$p] = array_unique($paramArray[$p]);
                ksort($paramArray);
            } else {
                // Set the literal value as only value in array:
                $paramArray[$p] = [$v];
            }
        }

        return $paramArray;
    }
818
819
    /**
     * Compiles URLs from a parameter array (the output of expandParameters()).
     * The number of URLs will be the multiplication of the number of parameter values for each key.
     *
     * The work is delegated to the URL service, capped by getMaximumUrlsToCompile().
     *
     * @param array $paramArray Output of expandParameters(): Array with keys (GET var names) and for each an array of values
     * @param array $urls URLs accumulated in this array (for recursion)
     * @return array The compiled URLs as returned by the URL service
     * @deprecated
     * @codeCoverageIgnore
     */
    public function compileUrls(array $paramArray, array $urls): array
    {
        $maximumUrlsToCompile = $this->getMaximumUrlsToCompile();

        return $this->urlService->compileUrls($paramArray, $urls, $maximumUrlsToCompile);
    }
832
833
    /************************************
834
     *
835
     * Crawler log
836
     *
837
     ************************************/
838
839
    /**
     * Return array of records from crawler queue for input page ID
     *
     * The records are ordered by their "scheduled" time, newest first.
     *
     * @param integer $id Page ID for which to look up log entries.
     * @param QueueFilter $queueFilter Filter: "pending" => only entries with exec_time = 0, "finished" => only entries with exec_time > 0, anything else => all entries
     * @param boolean $doFlush If TRUE, the queue is flushed through the queue repository (using the given filter) before the entries are read
     * @param boolean $doFullFlush Not used by this method; only kept so existing callers keep working
     * @param integer $itemsPerPage Limit the amount of entries per page default is 10; a value <= 0 disables the limit
     * @return array
     *
     * @deprecated
     */
    public function getLogEntriesForPageId($id, QueueFilter $queueFilter, $doFlush = false, $doFullFlush = false, $itemsPerPage = 10)
    {
        $queryBuilder = GeneralUtility::makeInstance(ConnectionPool::class)->getQueryBuilderForTable(QueueRepository::TABLE_NAME);
        $queryBuilder
            ->select('*')
            ->from(QueueRepository::TABLE_NAME)
            ->where(
                $queryBuilder->expr()->eq('page_id', $queryBuilder->createNamedParameter($id, PDO::PARAM_INT))
            )
            ->orderBy('scheduled', 'DESC');

        // The filter object is compared loosely against the plain filter names.
        // NOTE(review): this relies on QueueFilter being string-convertible — confirm against the class.
        switch ($queueFilter) {
            case 'pending':
                $queryBuilder->andWhere($queryBuilder->expr()->eq('exec_time', 0));
                break;
            case 'finished':
                $queryBuilder->andWhere($queryBuilder->expr()->gt('exec_time', 0));
                break;
        }

        if ($doFlush) {
            $this->queueRepository->flushQueue($queueFilter);
        }
        if ($itemsPerPage > 0) {
            $queryBuilder
                ->setMaxResults((int) $itemsPerPage);
        }

        return $queryBuilder->execute()->fetchAll();
    }
887
888
    /**
     * Return array of records from crawler queue for input set ID
     *
     * The records are ordered by their "scheduled" time, newest first.
     *
     * @param int $set_id Set ID for which to look up log entries.
     * @param string $filter Filter: "all" => all entries, "pending" => all that is not yet run, "finished" => all complete ones
     * @param bool $doFlush If TRUE, the matching entries are flushed instead of selected and an empty array is returned
     * @param bool $doFullFlush Only relevant together with $doFlush: if TRUE, flushQueue() is called with an empty condition instead of the set/filter condition
     * @param int $itemsPerPage Limit the amount of entries per page default is 10
     * @return array
     *
     * @deprecated
     */
    public function getLogEntriesForSetId(int $set_id, string $filter = '', bool $doFlush = false, bool $doFullFlush = false, int $itemsPerPage = 10)
    {
        $queryBuilder = GeneralUtility::makeInstance(ConnectionPool::class)->getQueryBuilderForTable(QueueRepository::TABLE_NAME);
        $queryBuilder
            ->select('*')
            ->from(QueueRepository::TABLE_NAME)
            ->where(
                $queryBuilder->expr()->eq('set_id', $queryBuilder->createNamedParameter($set_id, PDO::PARAM_INT))
            )
            ->orderBy('scheduled', 'DESC');

        $expressionBuilder = GeneralUtility::makeInstance(ConnectionPool::class)
            ->getConnectionForTable(QueueRepository::TABLE_NAME)
            ->getExpressionBuilder();

        // AND-composite that collects the constraints for a possible flush; the same
        // exec_time constraint is mirrored onto the select query.
        $flushConstraints = $expressionBuilder->andX();

        if ($filter === 'pending') {
            $queryBuilder->andWhere($queryBuilder->expr()->eq('exec_time', 0));
            $flushConstraints->add($expressionBuilder->eq('exec_time', 0));
        } elseif ($filter === 'finished') {
            $queryBuilder->andWhere($queryBuilder->expr()->gt('exec_time', 0));
            $flushConstraints->add($expressionBuilder->gt('exec_time', 0));
        }

        if ($doFlush) {
            $addWhere = $flushConstraints->add($expressionBuilder->eq('set_id', $set_id));
            $this->flushQueue($doFullFlush ? '' : $addWhere);

            return [];
        }

        if ($itemsPerPage > 0) {
            $queryBuilder->setMaxResults($itemsPerPage);
        }

        return $queryBuilder->execute()->fetchAll();
    }
940
941
    /**
     * Adds a call back entry to the queue (typically called from hooks, see indexed search class "class.crawler.php").
     *
     * The call back reference is stored in the parameters under the key "_CALLBACKOBJ" and the
     * parameters are written JSON-encoded to the queue table.
     *
     * @param integer $setId Set ID
     * @param array $params Parameters to pass to call back function; anything that is not an array is replaced by an empty array
     * @param string $callBack Call back object reference, eg. 'EXT:indexed_search/class.crawler.php:&tx_indexedsearch_crawler'
     * @param integer $page_id Page ID to attach it to
     * @param integer $schedule Time at which to activate; 0 means "now"
     */
    public function addQueueEntry_callBack($setId, $params, $callBack, $page_id = 0, $schedule = 0): void
    {
        $callBackParameters = is_array($params) ? $params : [];
        $callBackParameters['_CALLBACKOBJ'] = $callBack;

        $queueConnection = GeneralUtility::makeInstance(ConnectionPool::class)
            ->getConnectionForTable(QueueRepository::TABLE_NAME);

        $queueRow = [
            'page_id' => (int) $page_id,
            'parameters' => json_encode($callBackParameters),
            // Fall back to the current time when no (non-zero) schedule was given.
            'scheduled' => (int) $schedule ?: $this->getCurrentTime(),
            'exec_time' => 0,
            'set_id' => (int) $setId,
            'result_data' => '',
        ];

        $queueConnection->insert(QueueRepository::TABLE_NAME, $queueRow);
    }
970
971
    /************************************
972
     *
973
     * URL setting
974
     *
975
     ************************************/
976
977
    /**
     * Setting a URL for crawling:
     *
     * Builds the queue record for the URL and, unless queue entries are only registered
     * internally, inserts it into the queue table if no equal entry exists yet.
     * A signal is emitted for both a newly added and a duplicate URL.
     *
     * @param integer $id Page ID
     * @param string $url Complete URL
     * @param array $subCfg Sub configuration array (from TS config); the keys "userGroups",
     *                      "procInstrFilter", "procInstrParams." and "key" are read and may be absent
     * @param integer $tstamp Scheduled-time
     * @param string $configurationHash (optional) configuration hash
     * @param bool $skipInnerDuplicationCheck (optional) skip inner duplication check
     * @return bool TRUE if the URL was inserted into the queue table; FALSE if it was a duplicate
     *              or if entries are only registered internally
     */
    public function addUrl(
        $id,
        $url,
        array $subCfg,
        $tstamp,
        $configurationHash = '',
        $skipInnerDuplicationCheck = false
    ) {
        $urlAdded = false;
        $rows = [];

        // Creating parameters:
        $parameters = [
            'url' => $url,
        ];

        // fe user group simulation (the configuration key is optional):
        $uGs = implode(',', array_unique(GeneralUtility::intExplode(',', $subCfg['userGroups'] ?? '', true)));
        if ($uGs) {
            $parameters['feUserGroupList'] = $uGs;
        }

        // Setting processing instructions
        $parameters['procInstructions'] = GeneralUtility::trimExplode(',', $subCfg['procInstrFilter'] ?? '');
        if (is_array($subCfg['procInstrParams.'] ?? null)) {
            $parameters['procInstrParams'] = $subCfg['procInstrParams.'];
        }

        // Compile value array:
        $parameters_serialized = json_encode($parameters);
        $fieldArray = [
            'page_id' => (int) $id,
            'parameters' => $parameters_serialized,
            'parameters_hash' => GeneralUtility::shortMD5($parameters_serialized),
            'configuration_hash' => $configurationHash,
            'scheduled' => $tstamp,
            'exec_time' => 0,
            'set_id' => (int) $this->setID,
            'result_data' => '',
            'configuration' => $subCfg['key'] ?? null,
        ];

        if ($this->registerQueueEntriesInternallyOnly) {
            //the entries will only be registered and not stored to the database
            $this->queueEntries[] = $fieldArray;
        } else {
            if (! $skipInnerDuplicationCheck) {
                // check if there is already an equal entry
                $rows = $this->queueRepository->getDuplicateQueueItemsIfExists(
                    (bool) $this->extensionSettings['enableTimeslot'],
                    $tstamp,
                    $this->getCurrentTime(),
                    $fieldArray['page_id'],
                    $fieldArray['parameters_hash']
                );
            }

            if (empty($rows)) {
                $connectionForCrawlerQueue = GeneralUtility::makeInstance(ConnectionPool::class)->getConnectionForTable(QueueRepository::TABLE_NAME);
                $connectionForCrawlerQueue->insert(
                    QueueRepository::TABLE_NAME,
                    $fieldArray
                );
                $uid = $connectionForCrawlerQueue->lastInsertId(QueueRepository::TABLE_NAME, 'qid');
                $rows[] = $uid;
                $urlAdded = true;

                $signalPayload = ['uid' => $uid, 'fieldArray' => $fieldArray];
                SignalSlotUtility::emitSignal(
                    self::class,
                    SignalSlotUtility::SIGNAL_URL_ADDED_TO_QUEUE,
                    $signalPayload
                );
            } else {
                $signalPayload = ['rows' => $rows, 'fieldArray' => $fieldArray];
                SignalSlotUtility::emitSignal(
                    self::class,
                    SignalSlotUtility::SIGNAL_DUPLICATE_URL_IN_QUEUE,
                    $signalPayload
                );
            }
        }

        return $urlAdded;
    }
1073
1074
    /**
     * Returns the current system time as a Unix timestamp.
     *
     * @return int Result of time()
     */
    public function getCurrentTime()
    {
        $currentTimestamp = time();

        return $currentTimestamp;
    }
1083
1084
    /************************************
1085
     *
1086
     * URL reading
1087
     *
1088
     ************************************/
1089 2
1090
    /**
     * Read URL for single queue entry
     *
     * Loads the queue record, marks it as being processed by setting exec_time, executes it
     * through the queue executor and finally stores the JSON-encoded result in result_data.
     * Signals are emitted before the execution and before the result is stored.
     *
     * @param integer $queueId
     * @param boolean $force If set, will process even if exec_time has been set!
     *
     * @return int|null NULL if no matching queue record was found; otherwise a status bitmask which
     *                  contains CLI_STATUS_POLLABLE_PROCESSED if a registered "pollSuccess"
     *                  instruction reported success, and is 0 if none did
     */
    public function readUrl($queueId, $force = false)
    {
        $queryBuilder = GeneralUtility::makeInstance(ConnectionPool::class)->getQueryBuilderForTable(QueueRepository::TABLE_NAME);
        $ret = 0;
        $this->logger->debug('crawler-readurl start ' . microtime(true));

        $queryBuilder
            ->select('*')
            ->from(QueueRepository::TABLE_NAME)
            ->where(
                $queryBuilder->expr()->eq('qid', $queryBuilder->createNamedParameter($queueId, PDO::PARAM_INT))
            );
        if (! $force) {
            // Only pick up entries that have not been executed yet and are assigned to a process.
            $queryBuilder
                ->andWhere('exec_time = 0')
                ->andWhere('process_scheduled > 0');
        }
        $queueRec = $queryBuilder->execute()->fetch();

        if (! is_array($queueRec)) {
            return null;
        }

        SignalSlotUtility::emitSignal(
            self::class,
            SignalSlotUtility::SIGNAL_QUEUEITEM_PREPROCESS,
            [$queueId, &$queueRec]
        );

        // Set exec_time to lock record:
        $field_array = ['exec_time' => $this->getCurrentTime()];

        $this->setProcessID($queueRec['process_id']);

        if (isset($this->processID)) {
            //if multiprocessing is used we need to store the id of the process which has handled this entry
            $field_array['process_id_completed'] = $this->processID;
        }

        GeneralUtility::makeInstance(ConnectionPool::class)->getConnectionForTable(QueueRepository::TABLE_NAME)
            ->update(
                QueueRepository::TABLE_NAME,
                $field_array,
                ['qid' => (int) $queueId]
            );

        $result = $this->queueExecutor->executeQueueItem($queueRec, $this);

        // A missing or NULL "content" means there is nothing to evaluate for the poll status.
        $content = $result['content'] ?? null;
        if ($content !== null) {
            /** @var JsonCompatibilityConverter $jsonCompatibilityConverter */
            $jsonCompatibilityConverter = GeneralUtility::makeInstance(JsonCompatibilityConverter::class);
            $resultData = $jsonCompatibilityConverter->convert($content);

            // The pollSuccess registration is optional.
            $pollSuccess = $GLOBALS['TYPO3_CONF_VARS']['EXTCONF']['crawler']['pollSuccess'] ?? null;

            //atm there's no need to point to specific pollable extensions
            if (is_array($resultData) && is_array($pollSuccess)) {
                $procInstructions = $resultData['parameters']['procInstructions'] ?? null;
                foreach ($pollSuccess as $pollable) {
                    // only check the success value if the instruction is running
                    // it is important to name the pollSuccess key same as the procInstructions key
                    if (is_array($procInstructions)
                        && in_array($pollable, $procInstructions, true)
                        && ! empty($resultData['success'][$pollable])
                    ) {
                        $ret |= self::CLI_STATUS_POLLABLE_PROCESSED;
                    }
                }
            }
        }

        // Set result in log which also denotes the end of the processing of this entry.
        $field_array = ['result_data' => json_encode($result)];

        SignalSlotUtility::emitSignal(
            self::class,
            SignalSlotUtility::SIGNAL_QUEUEITEM_POSTPROCESS,
            [$queueId, &$field_array]
        );

        GeneralUtility::makeInstance(ConnectionPool::class)->getConnectionForTable(QueueRepository::TABLE_NAME)
            ->update(
                QueueRepository::TABLE_NAME,
                $field_array,
                ['qid' => (int) $queueId]
            );

        $this->logger->debug('crawler-readurl stop ' . microtime(true));
        return $ret;
    }
1189
1190
    /**
     * Read URL for not-yet-inserted log-entry
     *
     * The entry is inserted into the queue table with exec_time set to the current time,
     * executed through the queue executor, and the JSON-encoded result is then written
     * back to its result_data field.
     *
     * @param array $field_array Queue field array,
     *
     * @return array|bool|mixed|string The result as returned by the queue executor
     */
    public function readUrlFromArray($field_array)
    {
        // Set exec_time to lock record:
        $field_array['exec_time'] = $this->getCurrentTime();

        $queueConnection = GeneralUtility::makeInstance(ConnectionPool::class)
            ->getConnectionForTable(QueueRepository::TABLE_NAME);
        $queueConnection->insert(QueueRepository::TABLE_NAME, $field_array);

        // The executor gets the record including its freshly assigned queue id.
        $queueId = $queueConnection->lastInsertId(QueueRepository::TABLE_NAME, 'qid');
        $field_array['qid'] = $queueId;

        $result = $this->queueExecutor->executeQueueItem($field_array, $this);

        // Set result in log which also denotes the end of the processing of this entry.
        // Passed by reference so that signal slots can adjust what gets stored.
        $resultFields = ['result_data' => json_encode($result)];

        SignalSlotUtility::emitSignal(
            self::class,
            SignalSlotUtility::SIGNAL_QUEUEITEM_POSTPROCESS,
            [$queueId, &$resultFields]
        );

        $queueConnection->update(
            QueueRepository::TABLE_NAME,
            $resultFields,
            ['qid' => $queueId]
        );

        return $result;
    }
1226
1227
    /*****************************
1228
     *
1229
     * Compiling URLs to crawl - tools
1230
     *
1231
     *****************************/
1232
1233
    /**
     * Walks the page tree starting at a root page and compiles the result rows for every page in it.
     *
     * The call arguments are stored on the controller first, because drawURLs_addRowsForPage()
     * reads them from there. Pages of type mount point additionally contribute the rows of the
     * tree they mount.
     *
     * @param int $id Root page id to start from.
     * @param int $depth Depth of tree, 0=only id-page, 1= on sublevel, 99 = infinite
     * @param int $scheduledTime Unix Time when the URL is timed to be visited when put in queue
     * @param int $reqMinute Number of requests per minute (creates the interleave between requests)
     * @param bool $submitCrawlUrls If set, submits the URLs to queue in database (real crawling)
     * @param bool $downloadCrawlUrls If set (and submitcrawlUrls is false) will fill $downloadUrls with entries)
     * @param array $incomingProcInstructions Array of processing instructions
     * @param array $configurationSelection Array of configuration keys
     * @return string Concatenated output of drawURLs_addRowsForPage() for all visited pages
     */
    public function getPageTreeAndUrls(
        $id,
        $depth,
        $scheduledTime,
        $reqMinute,
        $submitCrawlUrls,
        $downloadCrawlUrls,
        array $incomingProcInstructions,
        array $configurationSelection
    ) {
        // Remember the request; the row builder picks these values up from the controller state
        $this->scheduledTime = $scheduledTime;
        $this->reqMinute = $reqMinute;
        $this->submitCrawlUrls = $submitCrawlUrls;
        $this->downloadCrawlUrls = $downloadCrawlUrls;
        $this->incomingProcInstructions = $incomingProcInstructions;
        $this->incomingConfigurationSelection = $configurationSelection;

        // Start with empty tracking lists for this run
        $this->duplicateTrack = [];
        $this->downloadUrls = [];

        /** @var PageTreeView $pageTree */
        $pageTree = GeneralUtility::makeInstance(PageTreeView::class);
        $permsClause = $this->getBackendUser()->getPagePermsClause(Permission::PAGE_SHOW);
        $pageTree->init('AND ' . $permsClause);

        // The root page becomes the first tree entry, provided it is accessible
        $rootRow = BackendUtility::readPageAccess($id, $permsClause);
        if (is_array($rootRow)) {
            $pageTree->tree[] = [
                'row' => $rootRow,
                'HTML' => $this->iconFactory->getIconForRecord('pages', $rootRow, Icon::SIZE_SMALL),
            ];
        }

        // Pages beneath the root are only loaded for a non-zero depth
        if ($depth) {
            $pageTree->getTree($id, $depth, '');
        }

        $rows = '';
        foreach ($pageTree->tree as $treeItem) {
            $pageRow = $treeItem['row'];
            $this->MP = false;

            if ($pageRow['doktype'] === PageRepository::DOKTYPE_MOUNTPOINT) {
                // NOTE(review): assumes getPage() returns a list whose first element is the page row - confirm
                $mountPage = $this->pageRepository->getPage($pageRow['uid']);
                $mountPid = $mountPage[0]['mount_pid'];

                // MP value used while the mounted pages are processed: "<mount_pid>-<uid of mount point page>"
                $this->MP = $mountPid . '-' . $pageRow['uid'];

                $mountTree = GeneralUtility::makeInstance(PageTreeView::class);
                $mountTree->init('AND ' . $permsClause);
                $mountTree->getTree($mountPid, $depth);

                foreach ($mountTree->tree as $mountItem) {
                    $rows .= $this->drawURLs_addRowsForPage(
                        $mountItem['row'],
                        $mountItem['HTML'] . BackendUtility::getRecordTitle('pages', $mountItem['row'], true)
                    );
                }

                if ($mountPage[0]['mount_pid_ol']) {
                    // With mount_pid_ol enabled the mounted page replaces the mount point page itself
                    $pageRow['uid'] = $mountPid;
                } else {
                    // Without mount_pid_ol the MP must not be used for the mount point page
                    $this->MP = false;
                }
            }

            $rows .= $this->drawURLs_addRowsForPage(
                $pageRow,
                $treeItem['HTML'] . BackendUtility::getRecordTitle('pages', $pageRow, true)
            );
        }

        return $rows;
    }
1325
1326
    /**
     * Expands an exclude string by delegating to the configuration service.
     *
     * @param string $excludeString Exclude string
     * @return array Whatever the configuration service's expandExcludeString() returns for the string
     * @deprecated
     */
    public function expandExcludeString($excludeString)
    {
        $expanded = $this->configurationService->expandExcludeString($excludeString);

        return $expanded;
    }
1337
1338
    /**
     * Create the rows for display of the page tree
     * For each page a number of rows are shown displaying GET variable configuration
     *
     * One table row is produced per crawler configuration that applies to the page. When no
     * configuration applies, a single "No entries" row is returned, including the skip message
     * reported by getUrlsForPageRow() if there is one.
     *
     * @param array $pageRow Page record the rows are built for
     * @param string $pageTitle Markup for the title cell; it is inserted without further escaping
     * @return string The compiled table rows
     */
    public function drawURLs_addRowsForPage(array $pageRow, string $pageTitle): string
    {
        $skipMessage = '';

        // Get list of configurations
        $configurations = $this->getUrlsForPageRow($pageRow, $skipMessage);
        $configurations = ConfigurationService::removeDisallowedConfigurations($this->incomingConfigurationSelection, $configurations);

        // Traverse parameter combinations:
        $c = 0;
        $content = '';
        if (! empty($configurations)) {
            // The title cell of the first row spans all configuration rows of this page
            $rowSpan = count($configurations);

            foreach ($configurations as $confKey => $confArray) {
                // Numeric configuration keys arrive as integers; with strict_types enabled
                // htmlspecialchars() only accepts strings, so cast explicitly.
                $confKeyHtml = htmlspecialchars((string) $confKey);

                // Every sub configuration setting is optional, read them defensively
                $subCfg = $confArray['subCfg'] ?? [];

                // Title column:
                if (! $c) {
                    $titleClm = '<td rowspan="' . $rowSpan . '">' . $pageTitle . '</td>';
                } else {
                    $titleClm = '';
                }

                if (! in_array($pageRow['uid'], $this->configurationService->expandExcludeString($subCfg['exclude'] ?? ''), true)) {

                    // URL list:
                    $urlList = $this->urlListFromUrlArray(
                        $confArray,
                        $pageRow,
                        $this->scheduledTime,
                        $this->reqMinute,
                        $this->submitCrawlUrls,
                        $this->downloadCrawlUrls,
                        $this->duplicateTrack,
                        $this->downloadUrls,
                        // if empty the urls won't be filtered by processing instructions
                        $this->incomingProcInstructions
                    );

                    // Expanded parameters:
                    $paramExpanded = '';
                    $calcAccu = [];
                    $calcRes = 1;
                    foreach ($confArray['paramExpanded'] ?? [] as $gVar => $gVal) {
                        $paramExpanded .= '
                            <tr>
                                <td>' . htmlspecialchars('&' . $gVar . '=') . '<br/>' .
                            '(' . count($gVal) . ')' .
                            '</td>
                                <td nowrap="nowrap">' . nl2br(htmlspecialchars(implode(chr(10), $gVal))) . '</td>
                            </tr>
                        ';
                        // Number of URL combinations is the product of the value counts per parameter
                        $calcRes *= count($gVal);
                        $calcAccu[] = count($gVal);
                    }
                    $paramExpanded = '<table>' . $paramExpanded . '</table>';
                    $paramExpanded .= 'Comb: ' . implode('*', $calcAccu) . '=' . $calcRes;

                    // Options
                    $optionValues = '';
                    if (! empty($subCfg['userGroups'])) {
                        $optionValues .= 'User Groups: ' . $subCfg['userGroups'] . '<br/>';
                    }
                    if (! empty($subCfg['procInstrFilter'])) {
                        $optionValues .= 'ProcInstr: ' . $subCfg['procInstrFilter'] . '<br/>';
                    }

                    // Compile row:
                    $content .= '
                        <tr>
                            ' . $titleClm . '
                            <td>' . $confKeyHtml . '</td>
                            <td>' . nl2br(htmlspecialchars(rawurldecode(trim(str_replace('&', chr(10) . '&', GeneralUtility::implodeArrayForUrl('', $confArray['paramParsed'] ?? [])))))) . '</td>
                            <td>' . $paramExpanded . '</td>
                            <td nowrap="nowrap">' . $urlList . '</td>
                            <td nowrap="nowrap">' . $optionValues . '</td>
                            <td nowrap="nowrap">' . DebugUtility::viewArray($subCfg['procInstrParams.'] ?? null) . '</td>
                        </tr>';
                } else {
                    $content .= '<tr>
                            ' . $titleClm . '
                            <td>' . $confKeyHtml . '</td>
                            <td colspan="5"><em>No entries</em> (Page is excluded in this configuration)</td>
                        </tr>';
                }

                $c++;
            }
        } else {
            $message = ! empty($skipMessage) ? ' (' . $skipMessage . ')' : '';

            // Compile row:
            $content .= '
                <tr>
                    <td>' . $pageTitle . '</td>
                    <td colspan="6"><em>No entries</em>' . $message . '</td>
                </tr>';
        }

        return $content;
    }
1441
1442
    /*****************************
1443
     *
1444
     * CLI functions
1445
     *
1446
     *****************************/
1447
1448
    /**
     * Running the functionality of the CLI (crawling URLs from queue)
     *
     * Runs the registered cli_hooks, cleans up the queue, assigns a batch of queue entries to the
     * current process id and executes them one by one through readUrl().
     *
     * @param int $countInARun Passed to the queue repository as the number of records to fetch
     * @param int $sleepTime Pause after each processed entry, handed to usleep() (microseconds)
     * @param int $sleepAfterFinish Pause after the batch, handed to sleep() (seconds)
     * @return int Bit mask built from the readUrl() results and the CLI_STATUS_* constants
     * @deprecated
     * @codeCoverageIgnore
     */
    public function CLI_run(int $countInARun, int $sleepTime, int $sleepAfterFinish): int
    {
        // Hooks go first; each registration raises a deprecation notice
        $hookClasses = $GLOBALS['TYPO3_CONF_VARS']['EXTCONF']['crawler']['cli_hooks'] ?? [];
        foreach ($hookClasses as $hookClass) {
            trigger_error(
                'This hook (crawler/cli_hooks) is deprecated since 9.1.5 and will be removed when dropping support for TYPO3 9LTS and 10LTS',
                E_USER_DEPRECATED
            );
            $hook = GeneralUtility::makeInstance($hookClass);
            if (is_object($hook)) {
                $hook->crawler_init($this);
            }
        }

        $this->queueRepository->cleanupQueue();

        $status = 0;
        $processedCount = 0;
        $queueRows = $this->queueRepository->fetchRecordsToBeCrawled($countInARun);

        if (! empty($queueRows)) {
            $queueIds = [];
            foreach ($queueRows as $queueRow) {
                $queueIds[] = $queueRow['qid'];
            }

            $processId = $this->CLI_buildProcessId();

            // Keep the number of assigned queue entries to determine later how many have been processed
            $assignedCount = $this->queueRepository->updateProcessIdAndSchedulerForQueueIds($queueIds, $processId);
            $this->processRepository->updateProcessAssignItemsCount($assignedCount, $processId);

            // Not every selected entry could be assigned to this process: give up on the batch
            if ($assignedCount !== count($queueIds)) {
                return $status | self::CLI_STATUS_ABORTED;
            }

            foreach ($queueRows as $queueRow) {
                $status |= $this->readUrl($queueRow['qid']);
                $processedCount++;

                // Just to relax the system
                usleep($sleepTime);

                // The crawler has been disabled while this batch was running: return right away
                // and do NOT mark the process as ended.
                if ($this->crawler->isDisabled()) {
                    return $status | self::CLI_STATUS_ABORTED;
                }

                if ($this->processRepository->isProcessActive($this->CLI_buildProcessId())) {
                    continue;
                }

                // The process is no longer registered as active - possible timeout
                $this->CLI_debug('conflict / timeout (' . $this->CLI_buildProcessId() . ')');
                $status |= self::CLI_STATUS_ABORTED;
                break;
            }

            sleep($sleepAfterFinish);
        }

        if ($processedCount > 0) {
            $status |= self::CLI_STATUS_PROCESSED;
        }

        return $status;
    }
1523
1524
    /**
     * Instantiates every class registered under crawler/cli_hooks and calls crawler_init() on it.
     *
     * @deprecated
     * @codeCoverageIgnore
     */
    public function CLI_runHooks(): void
    {
        $hookClasses = $GLOBALS['TYPO3_CONF_VARS']['EXTCONF']['crawler']['cli_hooks'] ?? [];

        foreach ($hookClasses as $hookClass) {
            $hook = GeneralUtility::makeInstance($hookClass);
            if (! is_object($hook)) {
                continue;
            }

            // Each hook receives this controller
            $hook->crawler_init($this);
        }
    }
1538
1539
    /**
     * Try to acquire a new process with the given id
     * also performs some auto-cleanup for orphan processes
     *
     * A process is registered only while the number of active processes whose ttl has not expired
     * is below the processLimit extension setting. Independent of the outcome, expired processes
     * are marked as not active and their queue entries are released.
     *
     * @param string $id identification string for the process
     * @return bool true if the process was added, false if the limit is reached or no system pid is available
     * @todo preemption might not be the most elegant way to clean up
     * @deprecated
     * @codeCoverageIgnore
     */
    public function CLI_checkAndAcquireNewProcess($id)
    {
        $systemProcessId = getmypid();
        if (! $systemProcessId) {
            return false;
        }

        $activeProcesses = $this->processRepository->findAllActive();
        $now = $this->getCurrentTime();

        // Split the active processes into those still alive and those whose ttl has passed
        $aliveCount = 0;
        $expiredProcessIds = [];
        /** @var Process $process */
        foreach ($activeProcesses as $process) {
            if ($process->getTtl() >= $now) {
                $aliveCount++;
                continue;
            }
            $expiredProcessIds[] = $process->getProcessId();
        }

        // Only add a new process while fewer than the allowed number are alive
        $acquired = $aliveCount < (int) $this->extensionSettings['processLimit'];
        if ($acquired) {
            $this->processRepository->addProcess($id, $systemProcessId);
        }

        // Clean-up runs in any case
        $this->processRepository->deleteProcessesMarkedAsDeleted();
        $this->processRepository->markRequestedProcessesAsNotActive($expiredProcessIds);
        $this->queueRepository->unsetProcessScheduledAndProcessIdForQueueEntries($expiredProcessIds);

        return $acquired;
    }
1585
1586
    /**
     * Release a process and the required resources
     *
     * Besides releasing the given ids, two clean-up statements run first: queue entries that belong
     * to inactive processes are detached, and inactive processes referenced by unexecuted queue
     * entries get their system_process_id reset.
     *
     * @param mixed $releaseIds string with a single process-id or array with multiple process-ids
     * @return bool false if there was nothing to release, true otherwise
     * @deprecated
     * @codeCoverageIgnore
     */
    public function CLI_releaseProcesses($releaseIds)
    {
        $queryBuilder = GeneralUtility::makeInstance(ConnectionPool::class)->getQueryBuilderForTable(QueueRepository::TABLE_NAME);

        // Accept a single id as well as a list of ids
        $releaseIds = is_array($releaseIds) ? $releaseIds : [$releaseIds];
        if (count($releaseIds) === 0) {
            // nothing to release
            return false;
        }

        // some kind of 2nd chance algo - this way you need at least 2 processes to have a real cleanup
        // this ensures that a single process can't mess up the entire process table

        // Release queue entries: detach everything that is assigned to a process which is not active
        $queryBuilder
            ->update(QueueRepository::TABLE_NAME, 'q')
            ->set('q.process_scheduled', 0)
            ->set('q.process_id', '')
            ->where(
                'q.process_id IN(SELECT p.process_id FROM tx_crawler_process as p WHERE p.active = 0)'
            )
            ->execute();

        // FIXME: Not entirely sure that this is equivalent to the previous version
        // The builder is reused below, so the SET part of the first statement has to be dropped
        $queryBuilder->resetQueryPart('set');

        // Release process entries: reset the system process id of inactive processes
        // that are referenced by queue entries which have not been executed yet
        $inactiveCondition = $queryBuilder->expr()->eq('active', 0);
        $queryBuilder
            ->update(ProcessRepository::TABLE_NAME)
            ->set('system_process_id', 0)
            ->where(
                $inactiveCondition,
                'process_id IN(SELECT q.process_id FROM tx_crawler_queue as q WHERE q.exec_time = 0)'
            )
            ->execute();

        $this->processRepository->markRequestedProcessesAsNotActive($releaseIds);
        $this->queueRepository->unsetProcessScheduledAndProcessIdForQueueEntries($releaseIds);

        return true;
    }
1640
1641
    /**
     * Returns the id of the current process, generating it on first use.
     *
     * The id is derived from the current microtime via GeneralUtility::shortMD5() and kept in
     * $this->processID, so later calls return the same value.
     *
     * @return string the ID
     * @deprecated
     * @codeCoverageIgnore
     */
    public function CLI_buildProcessId()
    {
        if ($this->processID) {
            return $this->processID;
        }

        $this->processID = GeneralUtility::shortMD5(microtime(true));

        return $this->processID;
    }
1655
1656
    /**
     * Echoes a message followed by a line break, but only when the processDebug
     * extension setting evaluates to a non-zero integer.
     *
     * @param string $msg the message
     * @deprecated
     * @codeCoverageIgnore
     */
    public function CLI_debug($msg): void
    {
        $debugEnabled = (int) $this->extensionSettings['processDebug'];
        if (! $debugEnabled) {
            return;
        }

        echo $msg . "\n";
        // Push the output out right away
        flush();
    }
1670 1
1671 1
    /**
     * Cleans up entries that stayed for too long in the queue. These are:
     * - processed entries older than the cleanUpProcessedAge extension setting (in days)
     * - scheduled entries older than the cleanUpScheduledAge extension setting (in days)
     *
     * @deprecated
     */
    public function cleanUpOldQueueEntries(): void
    {
        // 24*60*60 seconds in 24 hours; the settings are given in days
        $secondsPerDay = 86400;
        $processedAgeInSeconds = $this->extensionSettings['cleanUpProcessedAge'] * $secondsPerDay;
        $scheduledAgeInSeconds = $this->extensionSettings['cleanUpScheduledAge'] * $secondsPerDay;

        $now = time();
        $processedBefore = $now - $processedAgeInSeconds;
        $scheduledBefore = $now - $scheduledAgeInSeconds;

        // Executed entries (exec_time set) past the processed age, or anything scheduled before the scheduled age
        $condition = '(exec_time<>0 AND exec_time<' . $processedBefore . ') OR scheduled<=' . $scheduledBefore;

        $this->flushQueue($condition);
    }
1688
1689 5
    /**
     * Removes queue entries
     *
     * Before the entries are deleted, SIGNAL_QUEUE_ENTRY_FLUSH is emitted once per affected set_id
     * with the qid/set_id pairs of that set as payload.
     *
     * @param string $where SQL related filter for the entries which should be removed. The string is
     *                      inserted into the statements verbatim, so it must only ever be built from
     *                      trusted values. An empty filter removes all entries.
     *
     * @deprecated
     */
    protected function flushQueue($where = ''): void
    {
        $realWhere = strlen((string) $where) > 0 ? $where : '1=1';

        // Each statement gets its own query builder so that no query parts leak from one into the next
        $groups = $this->getQueryBuilder(QueueRepository::TABLE_NAME)
            ->selectLiteral('DISTINCT set_id')
            ->from(QueueRepository::TABLE_NAME)
            ->where($realWhere)
            ->execute()
            ->fetchAll();

        foreach ($groups as $group) {
            $subSetQuery = $this->getQueryBuilder(QueueRepository::TABLE_NAME);
            $subSet = $subSetQuery
                ->select('qid', 'set_id')
                ->from(QueueRepository::TABLE_NAME)
                ->where(
                    $realWhere,
                    // Bind the set id as a parameter instead of concatenating it into the statement
                    $subSetQuery->expr()->eq(
                        'set_id',
                        $subSetQuery->createNamedParameter($group['set_id'], PDO::PARAM_INT)
                    )
                )
                ->execute()
                ->fetchAll();

            // Announce the entries of this set before they are removed
            $payLoad = ['subSet' => $subSet];
            SignalSlotUtility::emitSignal(
                self::class,
                SignalSlotUtility::SIGNAL_QUEUE_ENTRY_FLUSH,
                $payLoad
            );
        }

        $this->getQueryBuilder(QueueRepository::TABLE_NAME)
            ->delete(QueueRepository::TABLE_NAME)
            ->where($realWhere)
            ->execute();
    }
1734
1735 5
    /**
     * This method determines duplicates for a queue entry with the same parameters and this timestamp.
     * If the timestamp is in the past, it will check if there is any unprocessed queue entry in the past.
     * If the timestamp is in the future it will check, if the queued entry has exactly the same timestamp
     *
     * Only entries with the same page_id and parameters_hash as the given field array are considered.
     *
     * @param int $tstamp Scheduled time of the entry that is about to be queued
     * @param array $fieldArray Field array of that entry; page_id and parameters_hash are read from it
     *
     * @return array List of qid values of the duplicates found
     * @deprecated
     */
    protected function getDuplicateRowsIfExist($tstamp, $fieldArray)
    {
        $now = $this->getCurrentTime();

        $query = GeneralUtility::makeInstance(ConnectionPool::class)->getQueryBuilderForTable(QueueRepository::TABLE_NAME);
        $query
            ->select('qid')
            ->from(QueueRepository::TABLE_NAME);

        if ($tstamp > $now) {
            // An entry with a timestamp in the future needs to have the same schedule time
            $query->where(
                $query->expr()->eq('scheduled', $tstamp)
            );
        } elseif ($tstamp <= $now) {
            // The entry is scheduled with "now"
            if ($this->extensionSettings['enableTimeslot']) {
                // Time slot of 100 seconds around the current time
                $slotStart = $now - 100;
                $slotEnd = $now + 100;
                $query
                    ->where('scheduled BETWEEN ' . $slotStart . ' AND ' . $slotEnd)
                    ->orWhere(
                        $query->expr()->lte('scheduled', $now)
                    );
            } else {
                $query->where(
                    $query->expr()->lte('scheduled', $now)
                );
            }
        }

        // NOTE(review): "NOT exec_time" / "NOT process_id" rely on the database evaluating the column
        // value as a boolean - confirm this behaves as intended for non-numeric process ids.
        $query
            ->andWhere('NOT exec_time')
            ->andWhere('NOT process_id')
            ->andWhere($query->expr()->eq('page_id', $query->createNamedParameter($fieldArray['page_id'], PDO::PARAM_INT)))
            ->andWhere($query->expr()->eq('parameters_hash', $query->createNamedParameter($fieldArray['parameters_hash'], PDO::PARAM_STR)));

        $duplicateIds = [];
        $statement = $query->execute();
        while ($record = $statement->fetch()) {
            $duplicateIds[] = $record['qid'];
        }

        return $duplicateIds;
    }
1796
1797
    /**
     * Returns a md5 hash generated from a serialized configuration array.
     *
     * The "paramExpanded" and "URLs" entries are removed before serializing, so they do not
     * influence the hash.
     *
     * @param array $configuration Configuration to hash; the caller's array is not modified
     * @return string
     */
    protected function getConfigurationHash(array $configuration)
    {
        unset($configuration['paramExpanded'], $configuration['URLs']);

        $serialized = serialize($configuration);

        return md5($serialized);
    }
1808
1809
    /**
     * Builds a URL from a page and a query string by delegating to a new UrlService instance.
     *
     * @param int $pageId Page to build the URL for
     * @param string $queryString Query string handed on to the service
     * @param string|null $alternativeBaseUrl Alternative base URL handed on to the service, may be null
     * @param int $httpsOrHttp see tx_crawler_configuration.force_ssl
     * @throws SiteNotFoundException
     * @throws InvalidRouteArgumentsException
     *
     * @deprecated Using CrawlerController::getUrlFromPageAndQueryParameters() is deprecated since 9.1.1 and will be removed in v11.x, please use UrlService->getUrlFromPageAndQueryParameters() instead.
     * @codeCoverageIgnore
     */
    protected function getUrlFromPageAndQueryParameters(int $pageId, string $queryString, ?string $alternativeBaseUrl, int $httpsOrHttp): UriInterface
    {
        return (new UrlService())->getUrlFromPageAndQueryParameters(
            $pageId,
            $queryString,
            $alternativeBaseUrl,
            $httpsOrHttp
        );
    }
1825
1826
    /**
     * Ensures that the value at index 1 is not larger than the value at index 2,
     * exchanging the two entries if necessary.
     *
     * @param array $reg Array holding the two values at the indexes 1 and 2
     * @return array The array with both entries in ascending order
     * @deprecated
     */
    protected function swapIfFirstIsLargerThanSecond(array $reg): array
    {
        if ($reg[1] > $reg[2]) {
            // Exchange both entries in one step
            [$reg[1], $reg[2]] = [$reg[2], $reg[1]];
        }

        return $reg;
    }
1840
1841
    /**
     * Provides a newly created PageService instance on every call.
     */
    protected function getPageService(): PageService
    {
        $pageService = new PageService();

        return $pageService;
    }
1845
1846 2
    /**
     * Returns the value currently held in the maximumUrlsToCompile property.
     */
    private function getMaximumUrlsToCompile(): int
    {
        $maximumUrls = $this->maximumUrlsToCompile;

        return $maximumUrls;
    }
1850 2
1851
    /**
     * Returns the backend user, taking it from $GLOBALS['BE_USER'] on first use.
     *
     * Backend authentication is initialized on every call before the user is looked up.
     *
     * @return BackendUserAuthentication
     */
    private function getBackendUser()
    {
        // Make sure the _cli_ user is loaded
        Bootstrap::initializeBackendAuthentication();

        // Keep an already stored user, otherwise fall back to the global one
        $this->backendUser = $this->backendUser ?? $GLOBALS['BE_USER'];

        return $this->backendUser;
    }
1863
1864
    /**
     * Obtains a query builder for the given table from the connection pool.
     *
     * @param string $table Name of the table the query builder is requested for
     * @return QueryBuilder
     */
    private function getQueryBuilder(string $table)
    {
        $connectionPool = GeneralUtility::makeInstance(ConnectionPool::class);

        return $connectionPool->getQueryBuilderForTable($table);
    }
1873
}
1874