Passed
Pull Request — master (#672)
by Tomas Norre
09:00 queued 05:34
created

CrawlerController::getPageTreeAndUrls()   B

Complexity

Conditions 7
Paths 16

Size

Total Lines 80
Code Lines 39

Duplication

Lines 0
Ratio 0 %

Code Coverage

Tests 0
CRAP Score 56

Importance

Changes 1
Bugs 0 Features 0
Metric Value
cc 7
eloc 39
nc 16
nop 8
dl 0
loc 80
ccs 0
cts 39
cp 0
crap 56
rs 8.3626
c 1
b 0
f 0

How to fix   Long Method    Many Parameters   

Long Method

Small methods make your code easier to understand, in particular if combined with a good name. Besides, if your method is small, finding a good name is usually much easier.

For example, if you find yourself adding comments to a method's body, this is usually a good sign to extract the commented part to a new method, and use the comment as a starting point when coming up with a good name for this new method.

Commonly applied refactorings include:

Many Parameters

Methods with many parameters are not only hard to understand, but their parameters also often become inconsistent when you need more, or different data.

There are several approaches to avoid long parameter lists:

1
<?php
2
3
declare(strict_types=1);
4
5
namespace AOE\Crawler\Controller;
6
7
/***************************************************************
8
 *  Copyright notice
9
 *
10
 *  (c) 2020 AOE GmbH <[email protected]>
11
 *
12
 *  All rights reserved
13
 *
14
 *  This script is part of the TYPO3 project. The TYPO3 project is
15
 *  free software; you can redistribute it and/or modify
16
 *  it under the terms of the GNU General Public License as published by
17
 *  the Free Software Foundation; either version 3 of the License, or
18
 *  (at your option) any later version.
19
 *
20
 *  The GNU General Public License can be found at
21
 *  http://www.gnu.org/copyleft/gpl.html.
22
 *
23
 *  This script is distributed in the hope that it will be useful,
24
 *  but WITHOUT ANY WARRANTY; without even the implied warranty of
25
 *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
26
 *  GNU General Public License for more details.
27
 *
28
 *  This copyright notice MUST APPEAR in all copies of the script!
29
 ***************************************************************/
30
31
use AOE\Crawler\Configuration\ExtensionConfigurationProvider;
32
use AOE\Crawler\Converter\JsonCompatibilityConverter;
33
use AOE\Crawler\Crawler;
34
use AOE\Crawler\CrawlStrategy\CrawlStrategyFactory;
35
use AOE\Crawler\Domain\Model\Process;
36
use AOE\Crawler\Domain\Repository\ConfigurationRepository;
37
use AOE\Crawler\Domain\Repository\ProcessRepository;
38
use AOE\Crawler\Domain\Repository\QueueRepository;
39
use AOE\Crawler\QueueExecutor;
40
use AOE\Crawler\Service\UrlService;
41
use AOE\Crawler\Utility\SignalSlotUtility;
42
use AOE\Crawler\Value\QueueFilter;
43
use PDO;
44
use Psr\Http\Message\UriInterface;
45
use Psr\Log\LoggerAwareInterface;
46
use Psr\Log\LoggerAwareTrait;
47
use TYPO3\CMS\Backend\Tree\View\PageTreeView;
48
use TYPO3\CMS\Backend\Utility\BackendUtility;
49
use TYPO3\CMS\Core\Authentication\BackendUserAuthentication;
50
use TYPO3\CMS\Core\Compatibility\PublicMethodDeprecationTrait;
51
use TYPO3\CMS\Core\Compatibility\PublicPropertyDeprecationTrait;
52
use TYPO3\CMS\Core\Core\Bootstrap;
53
use TYPO3\CMS\Core\Core\Environment;
54
use TYPO3\CMS\Core\Database\Connection;
55
use TYPO3\CMS\Core\Database\ConnectionPool;
56
use TYPO3\CMS\Core\Database\Query\QueryBuilder;
57
use TYPO3\CMS\Core\Database\Query\Restriction\DeletedRestriction;
58
use TYPO3\CMS\Core\Database\QueryGenerator;
59
use TYPO3\CMS\Core\Exception\SiteNotFoundException;
60
use TYPO3\CMS\Core\Imaging\Icon;
61
use TYPO3\CMS\Core\Imaging\IconFactory;
62
use TYPO3\CMS\Core\Routing\InvalidRouteArgumentsException;
63
use TYPO3\CMS\Core\Site\Entity\Site;
64
use TYPO3\CMS\Core\Type\Bitmask\Permission;
65
use TYPO3\CMS\Core\TypoScript\Parser\TypoScriptParser;
66
use TYPO3\CMS\Core\Utility\DebugUtility;
67
use TYPO3\CMS\Core\Utility\GeneralUtility;
68
use TYPO3\CMS\Core\Utility\MathUtility;
69
use TYPO3\CMS\Extbase\Object\ObjectManager;
70
use TYPO3\CMS\Frontend\Page\PageRepository;
71
72
/**
73
 * Class CrawlerController
74
 *
75
 * @package AOE\Crawler\Controller
76
 */
77
class CrawlerController implements LoggerAwareInterface
78
{
79
    use LoggerAwareTrait;
80
    use PublicMethodDeprecationTrait;
81
    use PublicPropertyDeprecationTrait;
82
83
    public const CLI_STATUS_NOTHING_PROCCESSED = 0;
84
85
    //queue not empty
86
    public const CLI_STATUS_REMAIN = 1;
87
88
    // (some) queue items were processed
89
    public const CLI_STATUS_PROCESSED = 2;
90
91
    //instance didn't finish
92
    public const CLI_STATUS_ABORTED = 4;
93
94
    public const CLI_STATUS_POLLABLE_PROCESSED = 8;
95
96
    /**
97
     * @var integer
98
     */
99
    public $setID = 0;
100
101
    /**
102
     * @var string
103
     */
104
    public $processID = '';
105
106
    /**
107
     * @var array
108
     */
109
    public $duplicateTrack = [];
110
111
    /**
112
     * @var array
113
     */
114
    public $downloadUrls = [];
115
116
    /**
117
     * @var array
118
     */
119
    public $incomingProcInstructions = [];
120
121
    /**
122
     * @var array
123
     */
124
    public $incomingConfigurationSelection = [];
125
126
    /**
127
     * @var bool
128
     */
129
    public $registerQueueEntriesInternallyOnly = false;
130
131
    /**
132
     * @var array
133
     */
134
    public $queueEntries = [];
135
136
    /**
137
     * @var array
138
     */
139
    public $urlList = [];
140
141
    /**
142
     * @var array
143
     */
144
    public $extensionSettings = [];
145
146
    /**
147
     * Mount Point
148
     *
149
     * @var bool
150
     * Todo: Check what this is used for and adjust the type hint or code, as bool doesn't match the current code.
151
     */
152
    public $MP = false;
153
154
    /**
155
     * @var string
156
     * @deprecated
157
     */
158
    protected $processFilename;
159
160
    /**
161
     * Holds the internal access mode can be 'gui','cli' or 'cli_im'
162
     *
163
     * @var string
164
     * @deprecated
165
     */
166
    protected $accessMode;
167
168
    /**
169
     * @var QueueRepository
170
     */
171
    protected $queueRepository;
172
173
    /**
174
     * @var ProcessRepository
175
     */
176
    protected $processRepository;
177
178
    /**
179
     * @var ConfigurationRepository
180
     */
181
    protected $configurationRepository;
182
183
    /**
184
     * @var string
185
     */
186
    protected $tableName = 'tx_crawler_queue';
187
188
    /**
189
     * @var QueueExecutor
190
     */
191
    protected $queueExecutor;
192
193
    /**
194
     * @var int
195
     */
196
    protected $maximumUrlsToCompile = 10000;
197
198
    /**
199
     * @var IconFactory
200
     */
201
    protected $iconFactory;
202
203
    /**
204
     * @var string[]
205
     */
206
    private $deprecatedPublicMethods = [
0 ignored issues
show
introduced by
The private property $deprecatedPublicMethods is not used, and could be removed.
Loading history...
207
        'cleanUpOldQueueEntries' => 'Using CrawlerController::cleanUpOldQueueEntries() is deprecated since 9.0.1 and will be removed in v11.x, please use QueueRepository->cleanUpOldQueueEntries() instead.',
208
        'CLI_debug' => 'Using CrawlerController->CLI_debug() is deprecated since 9.1.3 and will be removed in v11.x',
209
        'getAccessMode' => 'Using CrawlerController->getAccessMode() is deprecated since 9.1.3 and will be removed in v11.x',
210
        'getLogEntriesForSetId' => 'Using crawlerController::getLogEntriesForSetId() is deprecated since 9.0.1 and will be removed in v11.x',
211
        'flushQueue' => 'Using CrawlerController::flushQueue() is deprecated since 9.0.1 and will be removed in v11.x, please use QueueRepository->flushQueue() instead.',
212
        'setAccessMode' => 'Using CrawlerController->setAccessMode() is deprecated since 9.1.3 and will be removed in v11.x',
213
        'getDisabled' => 'Using CrawlerController->getDisabled() is deprecated since 9.1.3 and will be removed in v11.x, please use Crawler->isDisabled() instead',
214
        'setDisabled' => 'Using CrawlerController->setDisabled() is deprecated since 9.1.3 and will be removed in v11.x, please use Crawler->setDisabled() instead',
215
        'getProcessFilename' => 'Using CrawlerController->getProcessFilename() is deprecated since 9.1.3 and will be removed in v11.x',
216
        'setProcessFilename' => 'Using CrawlerController->setProcessFilename() is deprecated since 9.1.3 and will be removed in v11.x',
217
        'getDuplicateRowsIfExist' => 'Using CrawlerController->getDuplicateRowsIfExist() is deprecated since 9.1.4 and will be remove in v11.x, please use QueueRepository->getDuplicateQueueItemsIfExists() instead',
218
219
    ];
220
221
    /**
     * Formerly public properties mapped to the deprecation notice emitted on access.
     *
     * Not read directly in this class; presumably consumed by
     * PublicPropertyDeprecationTrait (used above) — confirm against the TYPO3 trait.
     *
     * @var string[]
     */
    private $deprecatedPublicProperties = [
        'accessMode' => 'Using CrawlerController->accessMode is deprecated since 9.1.3 and will be removed in v11.x',
        // The notice must name the property it is keyed by (was a copy of the accessMode text).
        'processFilename' => 'Using CrawlerController->processFilename is deprecated since 9.1.3 and will be removed in v11.x',
    ];
228
229
    /**
230
     * @var BackendUserAuthentication|null
231
     */
232
    private $backendUser;
233
234
    /**
235
     * @var integer
236
     */
237
    private $scheduledTime = 0;
238
239
    /**
240
     * @var integer
241
     */
242
    private $reqMinute = 0;
243
244
    /**
245
     * @var bool
246
     */
247
    private $submitCrawlUrls = false;
248
249
    /**
250
     * @var bool
251
     */
252
    private $downloadCrawlUrls = false;
253
254
    /**
255
     * @var PageRepository
256
     */
257
    private $pageRepository;
258
259
    /**
260
     * @var Crawler
261
     */
262
    private $crawler;
263
264
    /************************************
265
     *
266
     * Getting URLs based on Page TSconfig
267
     *
268
     ************************************/
269
270 36
    /**
     * Instantiates the repositories and services this controller works with and
     * normalises the extension settings (countInARun, processLimit, maxCompileUrls).
     */
    public function __construct()
    {
        $objectManager = GeneralUtility::makeInstance(ObjectManager::class);
        $crawlStrategyFactory = GeneralUtility::makeInstance(CrawlStrategyFactory::class);
        $this->queueRepository = $objectManager->get(QueueRepository::class);
        $this->processRepository = $objectManager->get(ProcessRepository::class);
        $this->configurationRepository = $objectManager->get(ConfigurationRepository::class);
        $this->pageRepository = $objectManager->get(PageRepository::class);
        $this->queueExecutor = GeneralUtility::makeInstance(QueueExecutor::class, $crawlStrategyFactory);
        $this->iconFactory = GeneralUtility::makeInstance(IconFactory::class);
        $this->crawler = GeneralUtility::makeInstance(Crawler::class);

        // Deprecated lock file; still initialised because the deprecated accessors read it.
        $this->processFilename = Environment::getVarPath() . '/lock/tx_crawler.proc';

        /** @var ExtensionConfigurationProvider $configurationProvider */
        $configurationProvider = GeneralUtility::makeInstance(ExtensionConfigurationProvider::class);
        $settings = $configurationProvider->getExtensionConfiguration();
        $this->extensionSettings = is_array($settings) ? $settings : [];

        // Missing settings are read as 0 so that the default handling below applies
        // without raising undefined-index notices.
        if (MathUtility::convertToPositiveInteger($this->extensionSettings['countInARun'] ?? 0) === 0) {
            $this->extensionSettings['countInARun'] = 100;
        }

        $this->extensionSettings['processLimit'] = MathUtility::forceIntegerInRange(
            $this->extensionSettings['processLimit'] ?? 0,
            1,
            99,
            1
        );
        $this->maximumUrlsToCompile = MathUtility::forceIntegerInRange(
            $this->extensionSettings['maxCompileUrls'] ?? 0,
            1,
            1000000000,
            10000
        );
    }
297
298
    /**
     * Returns the internal access mode ('gui', 'cli' or 'cli_im').
     *
     * @return string
     * @deprecated
     */
    public function getAccessMode()
    {
        $currentMode = $this->accessMode;

        return $currentMode;
    }
308
309
    /**
     * Stores the internal access mode.
     *
     * @param string $accessMode One of 'gui', 'cli' or 'cli_im'
     * @deprecated
     */
    public function setAccessMode($accessMode): void
    {
        $this->accessMode = $accessMode;
    }
317
318
    /**
     * Toggles the disabled state by creating or removing the process lock file.
     *
     * When disabling, an empty file is written to the configured process filename;
     * when enabling, that file is deleted if it exists.
     *
     * @param bool $disabled (optional, defaults to true)
     * @deprecated
     */
    public function setDisabled($disabled = true): void
    {
        $lockFile = $this->processFilename;

        if ($disabled) {
            GeneralUtility::writeFile($lockFile, '');

            return;
        }

        if (is_file($lockFile)) {
            unlink($lockFile);
        }
    }
334
335
    /**
     * Reports whether the process lock file exists.
     *
     * @return bool true if disabled
     * @deprecated
     */
    public function getDisabled()
    {
        $lockFile = $this->processFilename;

        return is_file($lockFile);
    }
345
346
    /**
     * Overrides the path of the process lock file.
     *
     * @param string $filenameWithPath Lock file name including its path
     * @deprecated
     */
    public function setProcessFilename($filenameWithPath): void
    {
        $this->processFilename = $filenameWithPath;
    }
354
355
    /**
     * Returns the path of the process lock file.
     *
     * @return string
     * @deprecated
     */
    public function getProcessFilename()
    {
        $lockFile = $this->processFilename;

        return $lockFile;
    }
363
364
    /**
     * Replaces the extension settings held by this controller
     * (unserialized pendant of $TYPO3_CONF_VARS['EXT']['extConf']['crawler']).
     *
     * The given array is stored as-is; no defaults are applied here.
     */
    public function setExtensionSettings(array $extensionSettings): void
    {
        $this->extensionSettings = $extensionSettings;
    }
371
372
    /**
     * Decides whether the given page is excluded from crawling.
     *
     * Checks run in this order and the first match wins: hidden flag, the built-in
     * doktype list, doktypes registered under
     * $GLOBALS['TYPO3_CONF_VARS']['EXTCONF']['crawler']['excludeDoktype'], and finally
     * the hooks registered under ...['crawler']['pageVeto'].
     *
     * @param array $pageRow Page record; the 'hidden' and 'doktype' columns are read
     * @return false|string false if the page should be crawled (not excluded), otherwise the skip message
     */
    public function checkIfPageShouldBeSkipped(array $pageRow)
    {
        // Hidden pages are only crawled when the extension setting allows it.
        if (! ($this->extensionSettings['crawlHiddenPages'] ?? false) && ($pageRow['hidden'] ?? false)) {
            return 'Because page is hidden';
        }

        $doktype = (string) ($pageRow['doktype'] ?? '');

        if (GeneralUtility::inList('3,4,199,254,255', $doktype)) {
            return 'Because doktype is not allowed';
        }

        foreach ($GLOBALS['TYPO3_CONF_VARS']['EXTCONF']['crawler']['excludeDoktype'] ?? [] as $key => $doktypeList) {
            if (GeneralUtility::inList($doktypeList, $doktype)) {
                return 'Doktype was excluded by "' . $key . '"';
            }
        }

        // Veto hooks: each returns false if the page is fine, or true / a skip message otherwise.
        // The first veto ends the loop, so later hooks are not executed.
        foreach ($GLOBALS['TYPO3_CONF_VARS']['EXTCONF']['crawler']['pageVeto'] ?? [] as $key => $func) {
            $params = [
                'pageRow' => $pageRow,
            ];
            $veto = GeneralUtility::callUserFunction($func, $params, $this);
            if ($veto === false) {
                continue;
            }

            if (is_string($veto)) {
                return $veto;
            }

            // Hook keys may be integers; cast because htmlspecialchars() only accepts
            // strings under strict_types.
            return 'Veto from hook "' . htmlspecialchars((string) $key) . '"';
        }

        return false;
    }
416
417
    /**
     * Wrapper method for getUrlsForPageId().
     * It returns an array of configurations and no urls!
     *
     * The page is first passed through checkIfPageShouldBeSkipped(); when it is
     * skipped, an empty array is returned and the reason is written to $skipMessage.
     *
     * @param array $pageRow Page record with at least dok-type and uid columns.
     * @param string $skipMessage Set to the skip reason, or to '' when the page is crawled
     * @return array
     * @see getUrlsForPageId()
     */
    public function getUrlsForPageRow(array $pageRow, &$skipMessage = '')
    {
        $message = $this->checkIfPageShouldBeSkipped($pageRow);
        if ($message !== false) {
            $skipMessage = $message;

            return [];
        }

        // Cast explicitly: getUrlsForPageId() is typed int and this file uses strict_types,
        // so a uid delivered as string would otherwise raise a TypeError.
        $res = $this->getUrlsForPageId((int) $pageRow['uid']);
        $skipMessage = '';

        return $res;
    }
439
440
    /**
     * Creates a list of URLs from input array (and submits them to queue if asked for)
     * See Web > Info module script + "indexed_search"'s crawler hook-client using this!
     *
     * Every URL that passes the processing-instruction filter is appended to
     * $this->urlList and to the returned HTML. URLs already present in
     * $duplicateTrack are only displayed (muted) and not submitted again.
     *
     * @param array $vv Information about URLs from pageRow to crawl.
     * @param array $pageRow Page row
     * @param int $scheduledTime Unix time to schedule indexing to, typically time()
     * @param int $reqMinute Number of requests per minute (creates the interleave between requests);
     *                       values <= 0 disable the interleave
     * @param bool $submitCrawlUrls If set, submits the URLs to queue
     * @param bool $downloadCrawlUrls If set (and submitcrawlUrls is false) will fill $downloadUrls with entries)
     * @param array $duplicateTrack Array which is passed by reference and contains an id per url to ensure we will not crawl duplicates
     * @param array $downloadUrls Array which will be filled with URLS for download if flag is set.
     * @param array $incomingProcInstructions Array of processing instructions
     * @return string List of URLs (meant for display in backend module)
     */
    public function urlListFromUrlArray(
        array $vv,
        array $pageRow,
        $scheduledTime,
        $reqMinute,
        $submitCrawlUrls,
        $downloadCrawlUrls,
        array &$duplicateTrack,
        array &$downloadUrls,
        array $incomingProcInstructions
    ) {
        if (! is_array($vv['URLs'] ?? null)) {
            return 'ERROR - no URL generated';
        }

        $procInstrFilter = $vv['subCfg']['procInstrFilter'] ?? '';

        // The filter does not depend on the individual URL, so evaluate it once:
        // when it rejects the configuration, no URL would be listed at all.
        if (! $this->drawURLs_PIfilter($procInstrFilter, $incomingProcInstructions)) {
            return '';
        }

        $urlLog = [];
        $pageId = (int) $pageRow['uid'];
        $configurationHash = $this->getConfigurationHash($vv);
        $skipInnerCheck = $this->queueRepository->noUnprocessedQueueEntriesForPageWithConfigurationHashExist($pageId, $configurationHash);

        $urlService = new UrlService();

        // Seconds between two scheduled requests; guard against division by zero.
        $secondsPerRequest = $reqMinute > 0 ? 60 / $reqMinute : 0;

        foreach ($vv['URLs'] as $urlQuery) {
            $url = (string) $urlService->getUrlFromPageAndQueryParameters(
                $pageId,
                $urlQuery,
                $vv['subCfg']['baseUrl'] ?? null,
                $vv['subCfg']['force_ssl'] ?? 0
            );

            // Create key by which to determine unique-ness:
            $uKey = $url . '|' . ($vv['subCfg']['userGroups'] ?? '') . '|' . $procInstrFilter;

            if (isset($duplicateTrack[$uKey])) {
                // If the url key is registered just display it and do not resubmit it
                $urlLog[] = '<em><span class="text-muted">' . htmlspecialchars($url) . '</span></em>';
            } else {
                // Scheduled time, rounded down to the full minute:
                $schTime = $scheduledTime + round(count($duplicateTrack) * $secondsPerRequest);
                $schTime = intval($schTime / 60) * 60;
                $formattedDate = BackendUtility::datetime($schTime);
                $this->urlList[] = '[' . $formattedDate . '] ' . $url;
                $urlList = '[' . $formattedDate . '] ' . htmlspecialchars($url);

                // Submit for crawling!
                if ($submitCrawlUrls) {
                    // NOTE(review): the queue entry receives $scheduledTime, not the interleaved
                    // $schTime shown in the list — kept as-is, confirm this is intended.
                    $added = $this->addUrl(
                        $pageId,
                        $url,
                        $vv['subCfg'],
                        $scheduledTime,
                        $configurationHash,
                        $skipInnerCheck
                    );
                    if ($added === false) {
                        $urlList .= ' (URL already existed)';
                    }
                } elseif ($downloadCrawlUrls) {
                    $downloadUrls[$url] = $url;
                }
                $urlLog[] = $urlList;
            }
            $duplicateTrack[$uKey] = true;
        }

        return implode('<br>', $urlLog);
    }
524
525
    /**
     * Tells whether at least one incoming processing instruction is contained in the
     * comma-separated list $piString. An empty set of incoming instructions always passes.
     *
     * @param string $piString PI to test
     * @param array $incomingProcInstructions Processing instructions
     * @return boolean
     */
    public function drawURLs_PIfilter($piString, array $incomingProcInstructions)
    {
        if ($incomingProcInstructions === []) {
            return true;
        }

        $isRegistered = false;
        foreach ($incomingProcInstructions as $instruction) {
            if (GeneralUtility::inList($piString, $instruction)) {
                $isRegistered = true;
                break;
            }
        }

        return $isRegistered;
    }
545
546 5
    /**
     * Loads the page TSconfig for a page and lets registered hooks alter it.
     *
     * When $this->MP is set, the TSconfig is read from the page id found after the
     * '-' in that value instead of from $id.
     * NOTE(review): $this->MP is declared bool but used like a "<id>-<id>" string here;
     * the exact format is not visible in this file — confirm with the callers.
     *
     * Hooks registered under
     * $GLOBALS['TYPO3_CONF_VARS']['EXTCONF']['crawler']['getPageTSconfigForId'] receive
     * 'pageId' and a reference to 'pageTSConfig' and may modify the latter.
     *
     * @param int $id Page uid
     * @return array Page TSconfig, possibly modified by hooks
     */
    public function getPageTSconfigForId($id): array
    {
        $tsConfigPageId = $id;

        if ($this->MP) {
            // Cast before exploding: a non-string value would raise a TypeError under strict_types.
            $mountPointParts = explode('-', (string) $this->MP);
            if (isset($mountPointParts[1])) {
                $tsConfigPageId = (int) $mountPointParts[1];
            }
            // Without a second part there is no mount point id to resolve; fall back to $id.
        }

        $pageTSconfig = BackendUtility::getPagesTSconfig($tsConfigPageId);

        // Call a hook to alter configuration
        $hooks = $GLOBALS['TYPO3_CONF_VARS']['EXTCONF']['crawler']['getPageTSconfigForId'] ?? null;
        if (is_array($hooks)) {
            $params = [
                'pageId' => $id,
                'pageTSConfig' => &$pageTSconfig,
            ];
            foreach ($hooks as $userFunc) {
                GeneralUtility::callUserFunction($userFunc, $params, $this);
            }
        }

        return $pageTSconfig;
    }
568
569
    /**
570
     * This methods returns an array of configurations.
571
     * Adds no urls!
572
     */
573 4
    public function getUrlsForPageId(int $pageId): array
574
    {
575
        // Get page TSconfig for page ID
576 4
        $pageTSconfig = $this->getPageTSconfigForId($pageId);
577
578 4
        $res = [];
579
580
        // Fetch Crawler Configuration from pageTSconfig
581 4
        $crawlerCfg = $pageTSconfig['tx_crawler.']['crawlerCfg.']['paramSets.'] ?? [];
582 4
        foreach ($crawlerCfg as $key => $values) {
583 3
            if (! is_array($values)) {
584 3
                continue;
585
            }
586 3
            $key = str_replace('.', '', $key);
587
            // Sub configuration for a single configuration string:
588 3
            $subCfg = (array) $crawlerCfg[$key . '.'];
589 3
            $subCfg['key'] = $key;
590
591 3
            if (strcmp($subCfg['procInstrFilter'] ?? '', '')) {
592 3
                $subCfg['procInstrFilter'] = implode(',', GeneralUtility::trimExplode(',', $subCfg['procInstrFilter']));
593
            }
594 3
            $pidOnlyList = implode(',', GeneralUtility::trimExplode(',', $subCfg['pidsOnly'], true));
595
596
            // process configuration if it is not page-specific or if the specific page is the current page:
597
            // TODO: Check if $pidOnlyList can be kept as Array instead of imploded
598 3
            if (! strcmp((string) $subCfg['pidsOnly'], '') || GeneralUtility::inList($pidOnlyList, strval($pageId))) {
599
600
                // Explode, process etc.:
601 3
                $res[$key] = [];
602 3
                $res[$key]['subCfg'] = $subCfg;
603 3
                $res[$key]['paramParsed'] = GeneralUtility::explodeUrl2Array($crawlerCfg[$key]);
604 3
                $res[$key]['paramExpanded'] = $this->expandParameters($res[$key]['paramParsed'], $pageId);
605 3
                $res[$key]['origin'] = 'pagets';
606
607
                // recognize MP value
608 3
                if (! $this->MP) {
609 3
                    $res[$key]['URLs'] = $this->compileUrls($res[$key]['paramExpanded'], ['?id=' . $pageId]);
610
                } else {
611
                    $res[$key]['URLs'] = $this->compileUrls($res[$key]['paramExpanded'], ['?id=' . $pageId . '&MP=' . $this->MP]);
0 ignored issues
show
Bug introduced by
Are you sure $this->MP of type true can be used in concatenation? ( Ignorable by Annotation )

If this is a false-positive, you can also ignore this issue in your code via the ignore-type  annotation

611
                    $res[$key]['URLs'] = $this->compileUrls($res[$key]['paramExpanded'], ['?id=' . $pageId . '&MP=' . /** @scrutinizer ignore-type */ $this->MP]);
Loading history...
612
                }
613
            }
614
        }
615
616
        // Get configuration from tx_crawler_configuration records up the rootline
617 4
        $crawlerConfigurations = $this->configurationRepository->getCrawlerConfigurationRecordsFromRootLine($pageId);
618 4
        foreach ($crawlerConfigurations as $configurationRecord) {
619
620
            // check access to the configuration record
621 1
            if (empty($configurationRecord['begroups']) || $this->getBackendUser()->isAdmin() || $this->hasGroupAccess($this->getBackendUser()->user['usergroup_cached_list'], $configurationRecord['begroups'])) {
622 1
                $pidOnlyList = implode(',', GeneralUtility::trimExplode(',', $configurationRecord['pidsonly'], true));
623
624
                // process configuration if it is not page-specific or if the specific page is the current page:
625
                // TODO: Check if $pidOnlyList can be kept as Array instead of imploded
626 1
                if (! strcmp($configurationRecord['pidsonly'], '') || GeneralUtility::inList($pidOnlyList, strval($pageId))) {
627 1
                    $key = $configurationRecord['name'];
628
629
                    // don't overwrite previously defined paramSets
630 1
                    if (! isset($res[$key])) {
631
632
                        /* @var $TSparserObject TypoScriptParser */
633 1
                        $TSparserObject = GeneralUtility::makeInstance(TypoScriptParser::class);
634 1
                        $TSparserObject->parse($configurationRecord['processing_instruction_parameters_ts']);
635
636
                        $subCfg = [
637 1
                            'procInstrFilter' => $configurationRecord['processing_instruction_filter'],
638 1
                            'procInstrParams.' => $TSparserObject->setup,
639 1
                            'baseUrl' => $configurationRecord['base_url'],
640 1
                            'force_ssl' => (int) $configurationRecord['force_ssl'],
641 1
                            'userGroups' => $configurationRecord['fegroups'],
642 1
                            'exclude' => $configurationRecord['exclude'],
643 1
                            'key' => $key,
644
                        ];
645
646 1
                        if (! in_array($pageId, $this->expandExcludeString($subCfg['exclude']), true)) {
647 1
                            $res[$key] = [];
648 1
                            $res[$key]['subCfg'] = $subCfg;
649 1
                            $res[$key]['paramParsed'] = GeneralUtility::explodeUrl2Array($configurationRecord['configuration']);
650 1
                            $res[$key]['paramExpanded'] = $this->expandParameters($res[$key]['paramParsed'], $pageId);
651 1
                            $res[$key]['URLs'] = $this->compileUrls($res[$key]['paramExpanded'], ['?id=' . $pageId]);
652 1
                            $res[$key]['origin'] = 'tx_crawler_configuration_' . $configurationRecord['uid'];
653
                        }
654
                    }
655
                }
656
            }
657
        }
658
659 4
        foreach ($GLOBALS['TYPO3_CONF_VARS']['EXTCONF']['crawler']['processUrls'] ?? [] as $func) {
660
            $params = [
661
                'res' => &$res,
662
            ];
663
            GeneralUtility::callUserFunction($func, $params, $this);
664
        }
665 4
        return $res;
666
    }
667
668
    /**
     * Collect the names of all crawler configurations that apply to a page branch.
     *
     * Two sources are combined (duplicates are not removed):
     * - the "paramSets." sub-arrays found in the page TSconfig of the root page
     * - the names of tx_crawler_configuration records stored on a page of the
     *   root line of $rootid or on a page of the tree below $rootid
     *
     * TODO: Write Functional Tests
     *
     * @param int $rootid Page ID the branch starts at
     * @param int $depth Number of tree levels handed to PageTreeView::getTree()
     * @return array List of configuration names
     */
    public function getConfigurationsForBranch(int $rootid, int $depth): array
    {
        $names = [];

        // 1) Configurations defined through page TSconfig
        $tsConfig = $this->getPageTSconfigForId($rootid);
        foreach ($tsConfig['tx_crawler.']['crawlerCfg.']['paramSets.'] ?? [] as $setKey => $setConfiguration) {
            if (is_array($setConfiguration)) {
                // Sub-arrays are keyed "name." - strip that single trailing dot
                $endsWithDot = substr($setKey, -1) === '.';
                $names[] = $endsWithDot ? substr($setKey, 0, -1) : $setKey;
            }
        }

        // 2) Page IDs of the root line ...
        $pageIds = [];
        foreach (BackendUtility::BEgetRootLine($rootid) as $rootLinePage) {
            $pageIds[] = $rootLinePage['uid'];
        }

        // ... and of the tree below the root page (restricted by the backend user's "show" permission)
        /* @var PageTreeView $pageTree */
        $pageTree = GeneralUtility::makeInstance(PageTreeView::class);
        $permissionClause = $this->getBackendUser()->getPagePermsClause(Permission::PAGE_SHOW);
        if (empty($permissionClause)) {
            $pageTree->init('');
        } else {
            $pageTree->init('AND ' . $permissionClause);
        }
        $pageTree->getTree($rootid, $depth, '');
        foreach ($pageTree->tree as $treeEntry) {
            $pageIds[] = $treeEntry['row']['uid'];
        }

        // 3) Configuration records stored on any of the collected pages
        $queryBuilder = $this->getQueryBuilder('tx_crawler_configuration');
        $pidConstraint = $queryBuilder->expr()->in(
            'pid',
            $queryBuilder->createNamedParameter($pageIds, Connection::PARAM_INT_ARRAY)
        );
        $result = $queryBuilder
            ->select('name')
            ->from('tx_crawler_configuration')
            ->where($pidConstraint)
            ->execute();

        while ($record = $result->fetch()) {
            $names[] = $record['name'];
        }

        return $names;
    }
711
712
    /**
     * Check whether a user may access an item, based on group membership.
     * (e.g. get the group list of the current logged in user from $GLOBALS['TSFE']->gr_list)
     *
     * @param string $groupList Comma-separated list of (fe_)group UIDs from a user
     * @param string $accessList Comma-separated list of (fe_)group UIDs of the item to access
     * @return bool TRUE if the access list is empty or at least one of the user's group UIDs is part of it
     * @see \TYPO3\CMS\Frontend\Page\PageRepository::getMultipleGroupsWhereClause()
     */
    public function hasGroupAccess($groupList, $accessList)
    {
        // No restriction configured on the item: everybody has access
        if (empty($accessList)) {
            return true;
        }

        $userGroupUids = GeneralUtility::intExplode(',', $groupList);
        foreach ($userGroupUids as $userGroupUid) {
            if (! GeneralUtility::inList($accessList, $userGroupUid)) {
                continue;
            }

            // First matching group is enough
            return true;
        }

        return false;
    }
733
734
    /**
     * Will expand the parameters configuration to individual values. This follows a certain syntax of the value of each parameter.
     * Syntax of values:
     * - Basically: If the value is wrapped in [...] it will be expanded according to the following syntax, otherwise the value is taken literally
     * - Configuration is split by "|" and the parts are processed individually and finally added together
     * - For each configuration part:
     *         - "[int]-[int]" = Integer range, will be expanded to all values in between, values included, starting from low to high (max. 1000). Example "1-34" or "-40--30"
     *         - "_TABLE:[TCA table name];[_PID:[optional page id, default is current page]];[_ENABLELANG:1]" = Look up of table records from PID, filtering out deleted records. Example "_TABLE:tt_content; _PID:123"
     *           Further optional sub-parts read by this method:
     *             _RECURSIVE:[depth]  also look at pages below the lookup page (page list comes from QueryGenerator::getTreeList())
     *             _PIDFIELD:[field]   field compared against the page id(s), default "pid"
     *             _FIELD:[field]      field whose values are collected, default "uid"; must be "uid" or a column configured in TCA
     *             _WHERE:[sql]        additional where part, passed to the query as given
     *             _ADDTABLE:[sql]     additional "from" part, passed to the query as given
     *             _ENABLELANG:1       picks only original records without their language overlays
     *         - Default: Literal value
     *
     * After every part the hook functions registered in
     * $GLOBALS['TYPO3_CONF_VARS']['SC_OPTIONS']['crawler/class.tx_crawler_lib.php']['expandParameters']
     * are called and may modify the parameter array by reference.
     *
     * @param array $paramArray Array with key (GET var name) and values (value of GET var which is configuration for expansion)
     * @param int $pid Current page ID
     * @return array Array with key (GET var name) and, for each key, the list of expanded values
     *
     * TODO: Write Functional Tests
     */
    public function expandParameters($paramArray, $pid)
    {
        // Traverse parameter names:
        foreach ($paramArray as $p => $v) {
            $v = trim($v);

            // Not encapsulated in square brackets: the literal value is the only value
            if (strpos($v, '[') !== 0 || substr($v, -1) !== ']') {
                $paramArray[$p] = [$v];
                continue;
            }

            // Find the value inside the brackets and reset the paramArray value as an array.
            $v = substr($v, 1, -1);
            $paramArray[$p] = [];

            // Explode parts and traverse them:
            foreach (explode('|', $v) as $pV) {
                // Look for integer range: (fx. 1-34 or -40--30 // reads minus 40 to minus 30)
                if (preg_match('/^(-?[0-9]+)\s*-\s*(-?[0-9]+)$/', trim($pV), $reg)) {
                    $reg = $this->swapIfFirstIsLargerThanSecond($reg);

                    // Traverse range, add values - limited to 1000 values per range
                    $runAwayBrake = 1000;
                    for ($a = $reg[1]; $a <= $reg[2]; $a++) {
                        $paramArray[$p][] = $a;
                        $runAwayBrake--;
                        if ($runAwayBrake <= 0) {
                            break;
                        }
                    }
                } elseif (strpos(trim($pV), '_TABLE:') === 0) {
                    // Parse "key:value" sub-parts separated by ";"
                    $subpartParams = [];
                    foreach (GeneralUtility::trimExplode(';', $pV) as $spV) {
                        // Pad to two elements so a sub-part without ":" yields a null value
                        // instead of triggering an undefined offset
                        [$pKey, $pVal] = array_pad(GeneralUtility::trimExplode(':', $spV), 2, null);
                        $subpartParams[$pKey] = $pVal;
                    }
                    $tableName = (string) ($subpartParams['_TABLE'] ?? '');

                    // Table exists:
                    if (isset($GLOBALS['TCA'][$tableName])) {
                        $lookUpPid = isset($subpartParams['_PID']) ? intval($subpartParams['_PID']) : intval($pid);
                        $recursiveDepth = isset($subpartParams['_RECURSIVE']) ? intval($subpartParams['_RECURSIVE']) : 0;
                        $pidField = isset($subpartParams['_PIDFIELD']) ? trim($subpartParams['_PIDFIELD']) : 'pid';
                        $where = $subpartParams['_WHERE'] ?? '';
                        $addTable = $subpartParams['_ADDTABLE'] ?? '';

                        // All of the following keys are optional, so they must not be read unguarded
                        $fieldName = ! empty($subpartParams['_FIELD']) ? $subpartParams['_FIELD'] : 'uid';
                        if ($fieldName === 'uid' || ! empty($GLOBALS['TCA'][$tableName]['columns'][$fieldName])) {
                            $queryBuilder = $this->getQueryBuilder($tableName);

                            if ($recursiveDepth > 0) {
                                /** @var QueryGenerator $queryGenerator */
                                $queryGenerator = GeneralUtility::makeInstance(QueryGenerator::class);
                                $pidList = $queryGenerator->getTreeList($lookUpPid, $recursiveDepth, 0, 1);
                                $pidArray = GeneralUtility::intExplode(',', $pidList);
                            } else {
                                $pidArray = [(string) $lookUpPid];
                            }

                            // Only the "deleted" restriction is applied to the lookup
                            $queryBuilder->getRestrictions()
                                ->removeAll()
                                ->add(GeneralUtility::makeInstance(DeletedRestriction::class));

                            $queryBuilder
                                ->select($fieldName)
                                ->from($tableName)
                                ->where(
                                    $queryBuilder->expr()->in($pidField, $queryBuilder->createNamedParameter($pidArray, Connection::PARAM_INT_ARRAY)),
                                    $where
                                );

                            if (! empty($addTable)) {
                                // TODO: Check if this works as intended!
                                $queryBuilder->add('from', $addTable);
                            }

                            $transOrigPointerField = $GLOBALS['TCA'][$tableName]['ctrl']['transOrigPointerField'] ?? null;
                            if (! empty($subpartParams['_ENABLELANG']) && $transOrigPointerField) {
                                $queryBuilder->andWhere(
                                    $queryBuilder->expr()->lte(
                                        $transOrigPointerField,
                                        0
                                    )
                                );
                            }

                            $statement = $queryBuilder->execute();

                            // Index by field value so that every value is collected only once
                            $rows = [];
                            while ($row = $statement->fetch()) {
                                $rows[$row[$fieldName]] = $row;
                            }

                            $paramArray[$p] = array_merge($paramArray[$p], array_keys($rows));
                        }
                    }
                } else {
                    // Just add value:
                    $paramArray[$p][] = $pV;
                }

                // Hook for processing own expandParameters place holder (the hook array may be unconfigured)
                $expandHooks = $GLOBALS['TYPO3_CONF_VARS']['SC_OPTIONS']['crawler/class.tx_crawler_lib.php']['expandParameters'] ?? null;
                if (is_array($expandHooks)) {
                    $_params = [
                        'pObj' => &$this,
                        'paramArray' => &$paramArray,
                        'currentKey' => $p,
                        'currentValue' => $pV,
                        'pid' => $pid,
                    ];
                    foreach ($expandHooks as $_funcRef) {
                        GeneralUtility::callUserFunction($_funcRef, $_params, $this);
                    }
                }
            }

            // Make unique set of values and sort array by key:
            $paramArray[$p] = array_unique($paramArray[$p]);
            ksort($paramArray);
        }

        return $paramArray;
    }
881
882
    /**
     * Compiling URLs from parameter array (output of expandParameters())
     * The number of URLs will be the multiplication of the number of parameter values for each key,
     * limited by $this->maximumUrlsToCompile.
     *
     * The method works recursively: each call takes the first parameter off $paramArray, appends
     * "&name=value" for each of its values to each URL in $urls and hands the result to the next call.
     * Values that are an empty string leave the URL unchanged.
     *
     * @param array $paramArray Output of expandParameters(): Array with keys (GET var names) and for each an array of values
     * @param array $urls URLs accumulated in this array (for recursion)
     * @return array
     */
    public function compileUrls($paramArray, array $urls)
    {
        if (empty($paramArray)) {
            return $urls;
        }

        // Take the first parameter off the stack. unset() is used instead of array_shift()
        // because array_shift() renumbers integer keys, which would rename numeric GET var names.
        reset($paramArray);
        $varName = key($paramArray);
        $valueSet = $paramArray[$varName];
        unset($paramArray[$varName]);

        // Array keys may be integers; rawurlencode() requires a string under strict_types
        $encodedVarName = rawurlencode((string) $varName);

        // Traverse value set:
        $newUrls = [];
        foreach ($urls as $url) {
            foreach ($valueSet as $val) {
                $val = (string) $val;
                $newUrls[] = $url . ($val !== '' ? '&' . $encodedVarName . '=' . rawurlencode($val) : '');

                if (count($newUrls) > $this->maximumUrlsToCompile) {
                    // Leave BOTH loops: a plain "break" would only end the inner loop and
                    // every remaining base URL would still add one more URL beyond the limit
                    break 2;
                }
            }
        }

        return $this->compileUrls($paramArray, $newUrls);
    }
913
914
    /************************************
915
     *
916
     * Crawler log
917
     *
918
     ************************************/
919
920
    /**
     * Return array of records from crawler queue for input page ID
     *
     * Entries are ordered by "scheduled", newest first.
     *
     * @param int $id Page ID for which to look up log entries.
     * @param QueueFilter $queueFilter Compared against 'pending' (only entries with exec_time = 0) and
     *                                 'finished' (only entries with exec_time > 0); any other value applies no exec_time filter
     * @param bool $doFlush If TRUE, $this->queueRepository->flushQueue() is called with the filter before the entries are read
     * @param bool $doFullFlush Currently without effect: the flush call is the same with and without it
     * @param int $itemsPerPage Limit the amount of entries per page default is 10; values <= 0 disable the limit
     * @return array
     */
    public function getLogEntriesForPageId($id, QueueFilter $queueFilter, $doFlush = false, $doFullFlush = false, $itemsPerPage = 10)
    {
        $queryBuilder = GeneralUtility::makeInstance(ConnectionPool::class)->getQueryBuilderForTable($this->tableName);
        $queryBuilder
            ->select('*')
            ->from($this->tableName)
            ->where(
                $queryBuilder->expr()->eq('page_id', $queryBuilder->createNamedParameter($id, PDO::PARAM_INT))
            )
            ->orderBy('scheduled', 'DESC');

        switch ($queueFilter) {
            case 'pending':
                $queryBuilder->andWhere($queryBuilder->expr()->eq('exec_time', 0));
                break;
            case 'finished':
                $queryBuilder->andWhere($queryBuilder->expr()->gt('exec_time', 0));
                break;
        }

        if ($doFlush) {
            // NOTE(review): both the "full" and the "normal" flush issued this very same call,
            // so $doFullFlush makes no difference here - confirm whether that is intended.
            $this->queueRepository->flushQueue($queueFilter);
        }

        if ($itemsPerPage > 0) {
            $queryBuilder->setMaxResults((int) $itemsPerPage);
        }

        return $queryBuilder->execute()->fetchAll();
    }
970
971
    /**
     * Return array of records from crawler queue for input set ID
     *
     * @param int $set_id Set ID for which to look up log entries.
     * @param string $filter Filter: "pending" => all that is not yet run, "finished" => all complete ones, anything else => all entries
     * @param bool $doFlush If TRUE, the entries are flushed instead of selected and an empty array is returned
     * @param bool $doFullFlush If TRUE (together with $doFlush), flushQueue() is called with an empty condition instead of the set/filter condition
     * @param int $itemsPerPage Limit the amount of entries per page default is 10
     * @return array
     *
     * @deprecated
     */
    public function getLogEntriesForSetId(int $set_id, string $filter = '', bool $doFlush = false, bool $doFullFlush = false, int $itemsPerPage = 10)
    {
        $queryBuilder = GeneralUtility::makeInstance(ConnectionPool::class)->getQueryBuilderForTable($this->tableName);
        $setIdConstraint = $queryBuilder->expr()->eq('set_id', $queryBuilder->createNamedParameter($set_id, PDO::PARAM_INT));
        $queryBuilder
            ->select('*')
            ->from($this->tableName)
            ->where($setIdConstraint)
            ->orderBy('scheduled', 'DESC');

        // The same constraints are collected a second time as an AND-composite,
        // which serves as the condition for the (optional) flush further down.
        $expressionBuilder = GeneralUtility::makeInstance(ConnectionPool::class)
            ->getConnectionForTable($this->tableName)
            ->getExpressionBuilder();
        $flushConstraints = $expressionBuilder->andX();

        if ($filter === 'pending') {
            $queryBuilder->andWhere($queryBuilder->expr()->eq('exec_time', 0));
            $flushConstraints->add($expressionBuilder->eq('exec_time', 0));
        } elseif ($filter === 'finished') {
            $queryBuilder->andWhere($queryBuilder->expr()->gt('exec_time', 0));
            $flushConstraints->add($expressionBuilder->gt('exec_time', 0));
        }

        if ($doFlush) {
            $flushWhere = $flushConstraints->add($expressionBuilder->eq('set_id', (int) $set_id));
            $this->flushQueue($doFullFlush ? '' : $flushWhere);

            return [];
        }

        if ($itemsPerPage > 0) {
            $queryBuilder->setMaxResults((int) $itemsPerPage);
        }

        return $queryBuilder->execute()->fetchAll();
    }
1023
1024
    /**
     * Adding call back entries to log (called from hooks typically, see indexed search class "class.crawler.php"
     *
     * The entry is written to tx_crawler_queue with exec_time 0 and an empty result_data;
     * the call back reference is stored in the JSON encoded parameters under the key "_CALLBACKOBJ".
     *
     * @param int $setId Set ID
     * @param array $params Parameters to pass to call back function (anything that is not an array is replaced by an empty array)
     * @param string $callBack Call back object reference, eg. 'EXT:indexed_search/class.crawler.php:&tx_indexedsearch_crawler'
     * @param int $page_id Page ID to attach it to
     * @param int $schedule Time at which to activate; 0 means "now" (current time is used)
     */
    public function addQueueEntry_callBack($setId, $params, $callBack, $page_id = 0, $schedule = 0): void
    {
        $callBackParameters = is_array($params) ? $params : [];
        $callBackParameters['_CALLBACKOBJ'] = $callBack;

        $queueConnection = GeneralUtility::makeInstance(ConnectionPool::class)
            ->getConnectionForTable('tx_crawler_queue');

        $scheduledAt = (int) $schedule;
        if (! $scheduledAt) {
            $scheduledAt = $this->getCurrentTime();
        }

        $queueRecord = [
            'page_id' => (int) $page_id,
            'parameters' => json_encode($callBackParameters),
            'scheduled' => $scheduledAt,
            'exec_time' => 0,
            'set_id' => (int) $setId,
            'result_data' => '',
        ];

        $queueConnection->insert('tx_crawler_queue', $queueRecord);
    }
1053
1054
    /************************************
1055
     *
1056
     * URL setting
1057
     *
1058
     ************************************/
1059
1060
    /**
     * Setting a URL for crawling:
     *
     * Builds the queue record for the URL and either registers it internally only
     * ($this->registerQueueEntriesInternallyOnly) or inserts it into tx_crawler_queue.
     * Unless $skipInnerDuplicationCheck is set, nothing is inserted when the queue repository
     * reports an equal entry; a "duplicate" signal is emitted in that case instead.
     *
     * Keys read from $subCfg (all optional): "userGroups", "procInstrFilter", "procInstrParams.", "key".
     *
     * @param int $id Page ID
     * @param string $url Complete URL
     * @param array $subCfg Sub configuration array (from TS config)
     * @param int $tstamp Scheduled-time
     * @param string $configurationHash (optional) configuration hash
     * @param bool $skipInnerDuplicationCheck (optional) skip inner duplication check
     * @return bool TRUE only if a new record was inserted into the queue table
     */
    public function addUrl(
        $id,
        $url,
        array $subCfg,
        $tstamp,
        $configurationHash = '',
        $skipInnerDuplicationCheck = false
    ) {
        $urlAdded = false;
        $rows = [];

        // Creating parameters:
        $parameters = [
            'url' => $url,
        ];

        // fe user group simulation (the sub configuration may come without "userGroups"):
        $uGs = implode(',', array_unique(GeneralUtility::intExplode(',', (string) ($subCfg['userGroups'] ?? ''), true)));
        if ($uGs) {
            $parameters['feUserGroupList'] = $uGs;
        }

        // Setting processing instructions
        $parameters['procInstructions'] = GeneralUtility::trimExplode(',', (string) ($subCfg['procInstrFilter'] ?? ''));
        if (is_array($subCfg['procInstrParams.'] ?? null)) {
            $parameters['procInstrParams'] = $subCfg['procInstrParams.'];
        }

        // Compile value array:
        $parameters_serialized = json_encode($parameters);
        $fieldArray = [
            'page_id' => (int) $id,
            'parameters' => $parameters_serialized,
            'parameters_hash' => GeneralUtility::shortMD5($parameters_serialized),
            'configuration_hash' => $configurationHash,
            'scheduled' => $tstamp,
            'exec_time' => 0,
            'set_id' => (int) $this->setID,
            'result_data' => '',
            'configuration' => $subCfg['key'] ?? null,
        ];

        if ($this->registerQueueEntriesInternallyOnly) {
            // the entries will only be registered and not stored to the database
            $this->queueEntries[] = $fieldArray;

            return $urlAdded;
        }

        if (! $skipInnerDuplicationCheck) {
            // check if there is already an equal entry
            $rows = $this->queueRepository->getDuplicateQueueItemsIfExists(
                (bool) ($this->extensionSettings['enableTimeslot'] ?? false),
                $tstamp,
                $this->getCurrentTime(),
                $fieldArray['page_id'],
                $fieldArray['parameters_hash']
            );
        }

        if (empty($rows)) {
            $connectionForCrawlerQueue = GeneralUtility::makeInstance(ConnectionPool::class)->getConnectionForTable('tx_crawler_queue');
            $connectionForCrawlerQueue->insert(
                'tx_crawler_queue',
                $fieldArray
            );
            $uid = $connectionForCrawlerQueue->lastInsertId('tx_crawler_queue', 'qid');
            $rows[] = $uid;
            $urlAdded = true;

            $signalPayload = ['uid' => $uid, 'fieldArray' => $fieldArray];
            SignalSlotUtility::emitSignal(
                self::class,
                SignalSlotUtility::SIGNAL_URL_ADDED_TO_QUEUE,
                $signalPayload
            );
        } else {
            $signalPayload = ['rows' => $rows, 'fieldArray' => $fieldArray];
            SignalSlotUtility::emitSignal(
                self::class,
                SignalSlotUtility::SIGNAL_DUPLICATE_URL_IN_QUEUE,
                $signalPayload
            );
        }

        return $urlAdded;
    }
1156
1157
    /**
     * Returns the current system time
     *
     * @return int Unix timestamp as reported by time()
     */
    public function getCurrentTime()
    {
        $currentTimestamp = time();

        return $currentTimestamp;
    }
1166
1167
    /************************************
1168
     *
1169
     * URL reading
1170
     *
1171
     ************************************/
1172
1173
    /**
     * Read URL for single queue entry
     *
     * Steps: load the queue record, emit the "preprocess" signal, set exec_time on the record,
     * execute the queue item, evaluate the "pollSuccess" configuration, emit the "postprocess"
     * signal and finally store the JSON encoded result in result_data.
     *
     * @param int $queueId
     * @param bool $force If set, will process even if exec_time has been set!
     * @return int|null 0, or self::CLI_STATUS_POLLABLE_PROCESSED if a configured pollable instruction
     *                  reported success; NULL if no matching queue record was found
     */
    public function readUrl($queueId, $force = false)
    {
        $queryBuilder = GeneralUtility::makeInstance(ConnectionPool::class)->getQueryBuilderForTable($this->tableName);
        $ret = 0;
        $this->logger->debug('crawler-readurl start ' . microtime(true));

        // Get entry:
        $queryBuilder
            ->select('*')
            ->from('tx_crawler_queue')
            ->where(
                $queryBuilder->expr()->eq('qid', $queryBuilder->createNamedParameter($queueId, PDO::PARAM_INT))
            );
        if (! $force) {
            // Only entries that have not run yet and are assigned to a process
            $queryBuilder
                ->andWhere('exec_time = 0')
                ->andWhere('process_scheduled > 0');
        }
        $queueRec = $queryBuilder->execute()->fetch();

        if (! is_array($queueRec)) {
            return;
        }

        SignalSlotUtility::emitSignal(
            self::class,
            SignalSlotUtility::SIGNAL_QUEUEITEM_PREPROCESS,
            [$queueId, &$queueRec]
        );

        // Set exec_time to lock record:
        $field_array = ['exec_time' => $this->getCurrentTime()];

        if (isset($this->processID)) {
            // if multiprocessing is used we need to store the id of the process which has handled this entry
            $field_array['process_id_completed'] = $this->processID;
        }

        GeneralUtility::makeInstance(ConnectionPool::class)->getConnectionForTable('tx_crawler_queue')
            ->update(
                'tx_crawler_queue',
                $field_array,
                ['qid' => (int) $queueId]
            );

        $result = $this->queueExecutor->executeQueueItem($queueRec, $this);

        // Result data stays NULL when the execution delivered no content (error case); it was a
        // plain string before, which was then wrongly accessed like an array further down.
        // NOTE(review): a non-array $result is treated as "no content" as well - confirm against QueueExecutor.
        $resultData = null;
        if (is_array($result) && isset($result['content'])) {
            /** @var JsonCompatibilityConverter $jsonCompatibilityConverter */
            $jsonCompatibilityConverter = GeneralUtility::makeInstance(JsonCompatibilityConverter::class);
            $resultData = $jsonCompatibilityConverter->convert($result['content']);
        }

        // atm there's no need to point to specific pollable extensions
        $pollables = $GLOBALS['TYPO3_CONF_VARS']['EXTCONF']['crawler']['pollSuccess'] ?? null;
        $procInstructions = is_array($resultData) ? ($resultData['parameters']['procInstructions'] ?? null) : null;
        if (is_array($pollables) && is_array($procInstructions)) {
            foreach ($pollables as $pollable) {
                // only check the success value if the instruction is running
                // it is important to name the pollSuccess key same as the procInstructions key
                if (in_array($pollable, $procInstructions, true) && ! empty($resultData['success'][$pollable])) {
                    $ret |= self::CLI_STATUS_POLLABLE_PROCESSED;
                }
            }
        }

        // Set result in log which also denotes the end of the processing of this entry.
        $field_array = ['result_data' => json_encode($result)];

        SignalSlotUtility::emitSignal(
            self::class,
            SignalSlotUtility::SIGNAL_QUEUEITEM_POSTPROCESS,
            [$queueId, &$field_array]
        );

        GeneralUtility::makeInstance(ConnectionPool::class)->getConnectionForTable('tx_crawler_queue')
            ->update(
                'tx_crawler_queue',
                $field_array,
                ['qid' => (int) $queueId]
            );

        $this->logger->debug('crawler-readurl stop ' . microtime(true));
        return $ret;
    }
1270
1271
    /**
     * Read URL for not-yet-inserted log-entry
     *
     * The entry is inserted into the queue table with exec_time set to the current time,
     * executed right away, and its JSON encoded result is written back to result_data
     * after the "postprocess" signal has been emitted.
     *
     * @param array $field_array Queue field array,
     *
     * @return array|bool|mixed|string Whatever the queue executor returned for the entry
     */
    public function readUrlFromArray($field_array)
    {
        $queueConnection = null;

        // Set exec_time to lock record:
        $field_array['exec_time'] = $this->getCurrentTime();

        $queueConnection = GeneralUtility::makeInstance(ConnectionPool::class)
            ->getConnectionForTable($this->tableName);
        $queueConnection->insert($this->tableName, $field_array);
        $insertedQueueId = $queueConnection->lastInsertId($this->tableName, 'qid');

        $executionResult = $this->queueExecutor->executeQueueItem($field_array, $this);

        // Set result in log which also denotes the end of the processing of this entry.
        // Signal slots receive the fields by reference and may alter them before they are stored.
        $resultFields = ['result_data' => json_encode($executionResult)];
        $signalArguments = [$insertedQueueId, &$resultFields];
        SignalSlotUtility::emitSignal(
            self::class,
            SignalSlotUtility::SIGNAL_QUEUEITEM_POSTPROCESS,
            $signalArguments
        );

        $queueConnection->update(
            $this->tableName,
            $resultFields,
            ['qid' => $insertedQueueId]
        );

        return $executionResult;
    }
1307
1308
    /*****************************
1309
     *
1310
     * Compiling URLs to crawl - tools
1311
     *
1312
     *****************************/
1313
1314
    /**
     * Walks the page tree below a root page and renders the URL rows for every page found.
     *
     * The passed settings are stored on the controller, the duplicate tracking and the list of
     * download URLs are reset, and every page of the tree is handed to drawURLs_addRowsForPage().
     * For mount point pages the mounted tree is rendered in addition.
     *
     * @param integer $id Root page id to start from.
     * @param integer $depth Depth of tree, 0=only id-page, 1= on sublevel, 99 = infinite
     * @param integer $scheduledTime Unix Time when the URL is timed to be visited when put in queue
     * @param integer $reqMinute Number of requests per minute (creates the interleave between requests)
     * @param boolean $submitCrawlUrls If set, submits the URLs to queue in database (real crawling)
     * @param boolean $downloadCrawlUrls If set (and submitcrawlUrls is false) will fill $downloadUrls with entries)
     * @param array $incomingProcInstructions Array of processing instructions
     * @param array $configurationSelection Array of configuration keys
     * @return string Concatenated output of drawURLs_addRowsForPage() for all visited pages
     */
    public function getPageTreeAndUrls(
        $id,
        $depth,
        $scheduledTime,
        $reqMinute,
        $submitCrawlUrls,
        $downloadCrawlUrls,
        array $incomingProcInstructions,
        array $configurationSelection
    ) {
        $this->scheduledTime = $scheduledTime;
        $this->reqMinute = $reqMinute;
        $this->submitCrawlUrls = $submitCrawlUrls;
        $this->downloadCrawlUrls = $downloadCrawlUrls;
        $this->incomingProcInstructions = $incomingProcInstructions;
        $this->incomingConfigurationSelection = $configurationSelection;

        $this->duplicateTrack = [];
        $this->downloadUrls = [];

        // Drawing tree:
        /* @var PageTreeView $tree */
        $tree = GeneralUtility::makeInstance(PageTreeView::class);
        $perms_clause = $this->getBackendUser()->getPagePermsClause(Permission::PAGE_SHOW);
        $tree->init('AND ' . $perms_clause);

        $pageInfo = BackendUtility::readPageAccess($id, $perms_clause);
        if (is_array($pageInfo)) {
            // Set root row:
            $tree->tree[] = [
                'row' => $pageInfo,
                'HTML' => $this->iconFactory->getIconForRecord('pages', $pageInfo, Icon::SIZE_SMALL),
            ];
        }

        // Get branch beneath:
        if ($depth) {
            $tree->getTree($id, $depth, '');
        }

        // Traverse page tree:
        $code = '';

        foreach ($tree->tree as $data) {
            $this->MP = false;

            // Recognize mount points. The doktype is cast to int because database rows may
            // deliver integer columns as strings, which a strict comparison against the
            // integer constant would never match.
            if ((int) $data['row']['doktype'] === PageRepository::DOKTYPE_MOUNTPOINT) {
                // NOTE(review): the result of getPage() is indexed with [0] below - confirm that
                // it really returns a list of rows and not a single page row.
                $mountpage = $this->pageRepository->getPage($data['row']['uid']);

                // fetch mounted pages
                $this->MP = $mountpage[0]['mount_pid'] . '-' . $data['row']['uid'];

                $mountTree = GeneralUtility::makeInstance(PageTreeView::class);
                $mountTree->init('AND ' . $perms_clause);
                $mountTree->getTree($mountpage[0]['mount_pid'], $depth);

                foreach ($mountTree->tree as $mountData) {
                    $code .= $this->drawURLs_addRowsForPage(
                        $mountData['row'],
                        $mountData['HTML'] . BackendUtility::getRecordTitle('pages', $mountData['row'], true)
                    );
                }

                // replace page when mount_pid_ol is enabled
                if ($mountpage[0]['mount_pid_ol']) {
                    $data['row']['uid'] = $mountpage[0]['mount_pid'];
                } else {
                    // if the mount_pid_ol is not set the MP must not be used for the mountpoint page
                    $this->MP = false;
                }
            }

            $code .= $this->drawURLs_addRowsForPage(
                $data['row'],
                $data['HTML'] . BackendUtility::getRecordTitle('pages', $data['row'], true)
            );
        }

        return $code;
    }
1406
1407
    /**
     * Expands an exclude string into a list of page ids.
     *
     * The string is a comma separated list of entries in the form "pid" or "pid+depth".
     * An entry without depth only excludes the page itself; an entry containing "+" but
     * no (or a zero) depth is expanded with a depth of 99. Results are cached in static
     * variables for the lifetime of the request.
     *
     * @param string $excludeString Exclude string
     * @return array List of integer page ids (keys may have gaps because duplicates are removed)
     */
    public function expandExcludeString($excludeString)
    {
        // internal static caches;
        static $expandedExcludeStringCache;
        static $treeCache;

        // Callers may hand over null for a missing configuration value; use a plain string as cache key.
        $excludeString = (string) $excludeString;

        if (empty($expandedExcludeStringCache[$excludeString])) {
            $pidList = [];

            if (! empty($excludeString)) {
                /** @var PageTreeView $tree */
                $tree = GeneralUtility::makeInstance(PageTreeView::class);
                $tree->init('AND ' . $this->getBackendUser()->getPagePermsClause(Permission::PAGE_SHOW));

                $excludeParts = GeneralUtility::trimExplode(',', $excludeString);

                foreach ($excludeParts as $excludePart) {
                    // Pad to two elements so that entries without "+" do not read an undefined offset.
                    [$pid, $depth] = array_pad(GeneralUtility::trimExplode('+', $excludePart), 2, '');

                    // default is "page only" = "depth=0"
                    if (empty($depth)) {
                        $depth = strpos($excludePart, '+') !== false ? 99 : 0;
                    }

                    // Work with integers from here on: they are used as cache keys and tree arguments.
                    $pid = (int) $pid;
                    $depth = (int) $depth;

                    $pidList[] = $pid;

                    if ($depth > 0) {
                        if (empty($treeCache[$pid][$depth])) {
                            $tree->reset();
                            $tree->getTree($pid, $depth);
                            $treeCache[$pid][$depth] = $tree->tree;
                        }

                        foreach ($treeCache[$pid][$depth] as $data) {
                            $pidList[] = (int) $data['row']['uid'];
                        }
                    }
                }
            }

            $expandedExcludeStringCache[$excludeString] = array_unique($pidList);
        }

        return $expandedExcludeStringCache[$excludeString];
    }
1458
1459
    /**
     * Create the rows for display of the page tree
     * For each page a number of rows are shown displaying GET variable configuration
     *
     * Configurations that are not part of the incoming configuration selection are dropped.
     * Pages listed in a configuration's exclude setting get a "No entries" row for that
     * configuration instead of a URL list.
     *
     * @param array $pageRow Page record the rows are rendered for
     * @param string $pageTitle Markup shown in the title column; inserted into the output as is
     * @return string HTML table rows
     */
    public function drawURLs_addRowsForPage(array $pageRow, string $pageTitle): string
    {
        $skipMessage = '';

        // Get list of configurations
        $configurations = $this->getUrlsForPageRow($pageRow, $skipMessage);

        if (! empty($this->incomingConfigurationSelection)) {
            // remove configuration that does not match the current selection
            foreach ($configurations as $confKey => $confArray) {
                if (! in_array($confKey, $this->incomingConfigurationSelection, true)) {
                    unset($configurations[$confKey]);
                }
            }
        }

        // Traverse parameter combinations:
        $c = 0;
        $content = '';
        if (! empty($configurations)) {
            foreach ($configurations as $confKey => $confArray) {

                // Title column: only the first row carries it, spanning all configuration rows.
                $titleClm = $c === 0
                    ? '<td rowspan="' . count($configurations) . '">' . $pageTitle . '</td>'
                    : '';

                // The expanded exclude list holds integers, so the uid has to be an integer as
                // well, otherwise the strict comparison never matches a uid delivered as string.
                $excludedPageIds = $this->expandExcludeString($confArray['subCfg']['exclude'] ?? '');

                if (! in_array((int) $pageRow['uid'], $excludedPageIds, true)) {

                    // URL list:
                    $urlList = $this->urlListFromUrlArray(
                        $confArray,
                        $pageRow,
                        $this->scheduledTime,
                        $this->reqMinute,
                        $this->submitCrawlUrls,
                        $this->downloadCrawlUrls,
                        $this->duplicateTrack,
                        $this->downloadUrls,
                        // if empty the urls won't be filtered by processing instructions
                        $this->incomingProcInstructions
                    );

                    // Expanded parameters:
                    $paramExpanded = '';
                    $calcAccu = [];
                    $calcRes = 1;
                    foreach ($confArray['paramExpanded'] as $gVar => $gVal) {
                        $valueCount = count($gVal);
                        $paramExpanded .= '
                            <tr>
                                <td>' . htmlspecialchars('&' . $gVar . '=') . '<br/>' .
                            '(' . $valueCount . ')' .
                            '</td>
                                <td nowrap="nowrap">' . nl2br(htmlspecialchars(implode(chr(10), $gVal))) . '</td>
                            </tr>
                        ';
                        $calcRes *= $valueCount;
                        $calcAccu[] = $valueCount;
                    }
                    $paramExpanded = '<table>' . $paramExpanded . '</table>';
                    $paramExpanded .= 'Comb: ' . implode('*', $calcAccu) . '=' . $calcRes;

                    // Options (both settings are optional, hence the guarded access)
                    $optionValues = '';
                    if (! empty($confArray['subCfg']['userGroups'])) {
                        $optionValues .= 'User Groups: ' . $confArray['subCfg']['userGroups'] . '<br/>';
                    }
                    if (! empty($confArray['subCfg']['procInstrFilter'])) {
                        $optionValues .= 'ProcInstr: ' . $confArray['subCfg']['procInstrFilter'] . '<br/>';
                    }

                    // Compile row:
                    $content .= '
                        <tr>
                            ' . $titleClm . '
                            <td>' . htmlspecialchars($confKey) . '</td>
                            <td>' . nl2br(htmlspecialchars(rawurldecode(trim(str_replace('&', chr(10) . '&', GeneralUtility::implodeArrayForUrl('', $confArray['paramParsed'])))))) . '</td>
                            <td>' . $paramExpanded . '</td>
                            <td nowrap="nowrap">' . $urlList . '</td>
                            <td nowrap="nowrap">' . $optionValues . '</td>
                            <td nowrap="nowrap">' . DebugUtility::viewArray($confArray['subCfg']['procInstrParams.'] ?? null) . '</td>
                        </tr>';
                } else {
                    $content .= '<tr>
                            ' . $titleClm . '
                            <td>' . htmlspecialchars($confKey) . '</td>
                            <td colspan="5"><em>No entries</em> (Page is excluded in this configuration)</td>
                        </tr>';
                }

                $c++;
            }
        } else {
            $message = ! empty($skipMessage) ? ' (' . $skipMessage . ')' : '';

            // Compile row:
            $content .= '
                <tr>
                    <td>' . $pageTitle . '</td>
                    <td colspan="6"><em>No entries</em>' . $message . '</td>
                </tr>';
        }

        return $content;
    }
1570
1571
    /*****************************
1572
     *
1573
     * CLI functions
1574
     *
1575
     *****************************/
1576
1577
    /**
     * Running the functionality of the CLI (crawling URLs from queue)
     *
     * Runs the CLI hooks, cleans up the queue, assigns up to $countInARun queue entries to the
     * current process and reads them one by one.
     *
     * @param int $countInARun Maximum number of queue entries fetched for this run
     * @param int $sleepTime Value passed to usleep() after each processed entry
     * @param int $sleepAfterFinish Value passed to sleep() once the entries have been handled
     * @return int Bitmask built from the readUrl() results and the CLI_STATUS_* constants
     */
    public function CLI_run(int $countInARun, int $sleepTime, int $sleepAfterFinish): int
    {
        $status = 0;
        $processedEntries = 0;

        // First, run hooks:
        $this->CLI_runHooks();

        // Clean up the queue
        $this->queueRepository->cleanupQueue();

        // Select entries:
        $queueRows = $this->queueRepository->fetchRecordsToBeCrawled($countInARun);

        if (empty($queueRows)) {
            $this->CLI_debug('Nothing within queue which needs to be processed (' . $this->CLI_buildProcessId() . ')');

            return $status;
        }

        $queueIds = [];
        foreach ($queueRows as $queueRow) {
            $queueIds[] = $queueRow['qid'];
        }

        $processId = $this->CLI_buildProcessId();

        // Save the number of assigned queue entries to determine how many have been processed later
        $assignedEntries = $this->queueRepository->updateProcessIdAndSchedulerForQueueIds($queueIds, $processId);
        $this->processRepository->updateProcessAssignItemsCount($assignedEntries, $processId);

        if ($assignedEntries !== count($queueIds)) {
            $this->CLI_debug('Nothing processed due to multi-process collision (' . $this->CLI_buildProcessId() . ')');

            return $status | self::CLI_STATUS_ABORTED;
        }

        foreach ($queueRows as $queueRow) {
            $status |= $this->readUrl($queueRow['qid']);
            $processedEntries++;

            // Just to relax the system
            usleep($sleepTime);

            // If the crawler has been disabled since this run started, return right away
            // and do NOT mark the process as ended.
            if ($this->crawler->isDisabled()) {
                return $status | self::CLI_STATUS_ABORTED;
            }

            if (! $this->processRepository->isProcessActive($this->CLI_buildProcessId())) {
                $this->CLI_debug('conflict / timeout (' . $this->CLI_buildProcessId() . ')');
                $status |= self::CLI_STATUS_ABORTED;
                // possible timeout
                break;
            }
        }

        sleep($sleepAfterFinish);

        $this->CLI_debug('Rows: ' . $processedEntries . ' (' . $this->CLI_buildProcessId() . ')');

        if ($processedEntries > 0) {
            $status |= self::CLI_STATUS_PROCESSED;
        }

        return $status;
    }
1647
1648
    /**
     * Activate hooks
     *
     * Instantiates every class registered under
     * $GLOBALS['TYPO3_CONF_VARS']['EXTCONF']['crawler']['cli_hooks'] and calls its
     * crawler_init() method with this controller.
     */
    public function CLI_runHooks(): void
    {
        $registeredHooks = $GLOBALS['TYPO3_CONF_VARS']['EXTCONF']['crawler']['cli_hooks'] ?? [];

        foreach ($registeredHooks as $hookClassName) {
            $hookInstance = GeneralUtility::makeInstance($hookClassName);
            if (! is_object($hookInstance)) {
                continue;
            }

            $hookInstance->crawler_init($this);
        }
    }
1660
1661
    /**
     * Try to acquire a new process with the given id
     * also performs some auto-cleanup for orphan processes
     *
     * Active processes whose ttl lies in the past count as orphans and are released; all
     * others count against the configured process limit.
     *
     * @param string $id identification string for the process
     * @return boolean true if a process record was added, false if the system process id could
     *                 not be determined or the process limit is reached
     * @todo preemption might not be the most elegant way to clean up
     */
    public function CLI_checkAndAcquireNewProcess($id)
    {
        $systemProcessId = getmypid();
        if (! $systemProcessId) {
            return false;
        }

        $runningProcesses = 0;
        $orphanedProcessIds = [];

        $activeProcesses = $this->processRepository->findAllActive();
        $now = $this->getCurrentTime();

        /** @var Process $activeProcess */
        foreach ($activeProcesses as $activeProcess) {
            if ($activeProcess->getTtl() < $now) {
                $orphanedProcessIds[] = $activeProcess->getProcessId();
                continue;
            }

            $runningProcesses++;
        }

        $processLimit = (int) $this->extensionSettings['processLimit'];

        // if there are less than allowed active processes then add a new one
        $acquired = $runningProcesses < $processLimit;

        if ($acquired) {
            $this->CLI_debug('add process ' . $this->CLI_buildProcessId() . ' (' . ($runningProcesses + 1) . '/' . $processLimit . ')');

            $processRecord = [
                'process_id' => $id,
                'active' => 1,
                'ttl' => $now + (int) $this->extensionSettings['processMaxRunTime'],
                'system_process_id' => $systemProcessId,
            ];
            GeneralUtility::makeInstance(ConnectionPool::class)
                ->getConnectionForTable('tx_crawler_process')
                ->insert('tx_crawler_process', $processRecord);
        } else {
            $this->CLI_debug('Processlimit reached (' . ($runningProcesses) . '/' . $processLimit . ')');
        }

        $this->processRepository->deleteProcessesMarkedAsDeleted();
        $this->CLI_releaseProcesses($orphanedProcessIds);

        return $acquired;
    }
1715
1716
    /**
     * Release a process and the required resources
     *
     * @param mixed $releaseIds string with a single process-id or array with multiple process-ids
     * @return boolean false if there was nothing to release, true otherwise
     */
    public function CLI_releaseProcesses($releaseIds)
    {
        $queryBuilder = GeneralUtility::makeInstance(ConnectionPool::class)->getQueryBuilderForTable($this->tableName);

        $processIds = is_array($releaseIds) ? $releaseIds : [$releaseIds];

        if ($processIds === []) {
            // nothing to release
            return false;
        }

        // some kind of 2nd chance algo - this way you need at least 2 processes to have a real cleanup
        // this ensures that a single process can't mess up the entire process table

        // Detach all queue entries that are still assigned to a process which is no longer active.
        $queryBuilder->update($this->tableName, 'q');
        $queryBuilder->where(
            'q.process_id IN(SELECT p.process_id FROM tx_crawler_process as p WHERE p.active = 0)'
        );
        $queryBuilder->set('q.process_scheduled', 0);
        $queryBuilder->set('q.process_id', '');
        $queryBuilder->execute();

        // FIXME: Not entirely sure that this is equivalent to the previous version
        $queryBuilder->resetQueryPart('set');

        // Reset the system process id of inactive processes that are referenced by queue
        // entries with exec_time = 0.
        $queryBuilder->update('tx_crawler_process');
        $queryBuilder->where(
            $queryBuilder->expr()->eq('active', 0),
            'process_id IN(SELECT q.process_id FROM tx_crawler_queue as q WHERE q.exec_time = 0)'
        );
        $queryBuilder->set('system_process_id', 0);
        $queryBuilder->execute();

        $this->processRepository->markRequestedProcessesAsNotActive($processIds);
        $this->queueRepository->unsetProcessScheduledAndProcessIdForQueueEntries($processIds);

        return true;
    }
1766
1767
    /**
     * Create a unique Id for the current process
     *
     * The id is derived from the current microtime on first use and then kept in
     * $this->processID, so repeated calls return the same value.
     *
     * @return string the ID
     */
    public function CLI_buildProcessId()
    {
        if ($this->processID) {
            return $this->processID;
        }

        $this->processID = GeneralUtility::shortMD5(microtime(true));

        return $this->processID;
    }
1779
1780
    /**
     * Prints a message to the stdout (only if debug-mode is enabled)
     *
     * Debug-mode is taken from the "processDebug" extension setting.
     *
     * @param string $msg the message
     * @deprecated
     * @codeCoverageIgnore
     */
    public function CLI_debug($msg): void
    {
        $debugEnabled = (int) $this->extensionSettings['processDebug'] !== 0;
        if (! $debugEnabled) {
            return;
        }

        echo $msg . "\n";
        flush();
    }
1794
1795
    /**
     * Cleans up entries that stayed for too long in the queue. These are:
     * - processed entries older than the "cleanUpProcessedAge" extension setting (in days)
     * - scheduled entries older than the "cleanUpScheduledAge" extension setting (in days)
     *
     * @deprecated
     */
    public function cleanUpOldQueueEntries(): void
    {
        // 24*60*60 Seconds in 24 hours
        $secondsPerDay = 86400;

        $maxProcessedAge = $this->extensionSettings['cleanUpProcessedAge'] * $secondsPerDay;
        $maxScheduledAge = $this->extensionSettings['cleanUpScheduledAge'] * $secondsPerDay;

        $currentTimestamp = time();
        $processedBefore = $currentTimestamp - $maxProcessedAge;
        $scheduledBefore = $currentTimestamp - $maxScheduledAge;

        $this->flushQueue(
            '(exec_time<>0 AND exec_time<' . $processedBefore . ') OR scheduled<=' . $scheduledBefore
        );
    }
1812
1813
    /**
     * Removes queue entries
     *
     * Before the entries are deleted, a SIGNAL_QUEUE_ENTRY_FLUSH signal is emitted once per
     * distinct set_id with the matching entries (qid, set_id) as "subSet" payload.
     *
     * @param string $where SQL related filter for the entries which should be removed;
     *                      an empty value removes all entries
     *
     * @deprecated
     */
    protected function flushQueue($where = ''): void
    {
        $condition = (string) $where !== '' ? $where : '1=1';

        $queryBuilder = $this->getQueryBuilder($this->tableName);

        $queryBuilder->selectLiteral('DISTINCT set_id');
        $queryBuilder->from($this->tableName);
        $queryBuilder->where($condition);
        $setRows = $queryBuilder->execute()->fetchAll();

        if (is_array($setRows)) {
            foreach ($setRows as $setRow) {
                // The same query builder instance is reused for every set on purpose.
                $queryBuilder->select('qid', 'set_id');
                $queryBuilder->from($this->tableName);
                $queryBuilder->where(
                    $condition,
                    $queryBuilder->expr()->eq('set_id', $setRow['set_id'])
                );
                $entriesOfSet = $queryBuilder->execute()->fetchAll();

                SignalSlotUtility::emitSignal(
                    self::class,
                    SignalSlotUtility::SIGNAL_QUEUE_ENTRY_FLUSH,
                    ['subSet' => $entriesOfSet]
                );
            }
        }

        $queryBuilder->delete($this->tableName);
        $queryBuilder->where($condition);
        $queryBuilder->execute();
    }
1858
1859
    /**
     * This method determines duplicates for a queue entry with the same parameters and this timestamp.
     * If the timestamp is in the past, it will check if there is any unprocessed queue entry in the past.
     * If the timestamp is in the future it will check, if the queued entry has exactly the same timestamp
     *
     * Only entries without exec_time and without process_id are considered; they are matched
     * on page_id and parameters_hash taken from $fieldArray.
     *
     * @param int $tstamp
     * @param array $fieldArray
     *
     * @return array List of qid values of the duplicate entries
     * @deprecated
     */
    protected function getDuplicateRowsIfExist($tstamp, $fieldArray)
    {
        $now = $this->getCurrentTime();

        $queryBuilder = GeneralUtility::makeInstance(ConnectionPool::class)->getQueryBuilderForTable($this->tableName);
        $queryBuilder->select('qid');
        $queryBuilder->from('tx_crawler_queue');

        if ($tstamp <= $now) {
            // this entry is scheduled with "now"
            if ($this->extensionSettings['enableTimeslot']) {
                $slotStart = $now - 100;
                $slotEnd = $now + 100;
                $queryBuilder->where('scheduled BETWEEN ' . $slotStart . ' AND ' . $slotEnd . '');
                $queryBuilder->orWhere($queryBuilder->expr()->lte('scheduled', $now));
            } else {
                $queryBuilder->where($queryBuilder->expr()->lte('scheduled', $now));
            }
        } elseif ($tstamp > $now) {
            // entry with a timestamp in the future need to have the same schedule time
            $queryBuilder->where($queryBuilder->expr()->eq('scheduled', $tstamp));
        }

        $queryBuilder->andWhere('NOT exec_time');
        $queryBuilder->andWhere('NOT process_id');
        $queryBuilder->andWhere(
            $queryBuilder->expr()->eq(
                'page_id',
                $queryBuilder->createNamedParameter($fieldArray['page_id'], PDO::PARAM_INT)
            )
        );
        $queryBuilder->andWhere(
            $queryBuilder->expr()->eq(
                'parameters_hash',
                $queryBuilder->createNamedParameter($fieldArray['parameters_hash'], PDO::PARAM_STR)
            )
        );

        $statement = $queryBuilder->execute();

        $duplicateQueueIds = [];
        while ($duplicate = $statement->fetch()) {
            $duplicateQueueIds[] = $duplicate['qid'];
        }

        return $duplicateQueueIds;
    }
1920
1921
    /**
     * Returns a md5 hash generated from a serialized configuration array.
     *
     * The "paramExpanded" and "URLs" entries are removed before hashing, so they do not
     * influence the result.
     *
     * @return string
     */
    protected function getConfigurationHash(array $configuration)
    {
        foreach (['paramExpanded', 'URLs'] as $ignoredKey) {
            unset($configuration[$ignoredKey]);
        }

        $serializedConfiguration = serialize($configuration);

        return md5($serializedConfiguration);
    }
1932
1933
    /**
     * Build a URL from a Page and the Query String. If the page has a Site configuration, it can be built by using
     * the Site instance.
     *
     * This method only forwards its arguments to UrlService::getUrlFromPageAndQueryParameters().
     *
     * @param int $httpsOrHttp see tx_crawler_configuration.force_ssl
     * @throws SiteNotFoundException
     * @throws InvalidRouteArgumentsException
     *
     * @deprecated Using CrawlerController::getUrlFromPageAndQueryParameters() is deprecated since 9.1.1 and will be removed in v11.x, please use UrlService->getUrlFromPageAndQueryParameters() instead.
     * @codeCoverageIgnore
     */
    protected function getUrlFromPageAndQueryParameters(int $pageId, string $queryString, ?string $alternativeBaseUrl, int $httpsOrHttp): UriInterface
    {
        return (new UrlService())->getUrlFromPageAndQueryParameters(
            $pageId,
            $queryString,
            $alternativeBaseUrl,
            $httpsOrHttp
        );
    }
1949
1950 1
    /**
     * Makes sure the value at index 1 is not larger than the value at index 2.
     *
     * @param array $reg Array whose elements at index 1 and 2 are compared
     * @return array The array with the elements at index 1 and 2 exchanged if the first was
     *               larger, otherwise unchanged
     */
    protected function swapIfFirstIsLargerThanSecond(array $reg): array
    {
        if ($reg[1] > $reg[2]) {
            // Exchange both values in one step instead of using a temporary variable.
            [$reg[1], $reg[2]] = [$reg[2], $reg[1]];
        }

        return $reg;
    }
1961
1962
    /**
     * Returns the backend user, taking it from $GLOBALS['BE_USER'] on first use.
     *
     * Backend authentication is initialized on every call before the user is looked up.
     *
     * @return BackendUserAuthentication
     */
    private function getBackendUser()
    {
        // Make sure the _cli_ user is loaded
        Bootstrap::initializeBackendAuthentication();

        if ($this->backendUser !== null) {
            return $this->backendUser;
        }

        $this->backendUser = $GLOBALS['BE_USER'];

        return $this->backendUser;
    }
1974
1975
    /**
     * Get querybuilder for given table
     *
     * @param string $table Name of the table the query builder is requested for
     * @return QueryBuilder
     */
    private function getQueryBuilder(string $table)
    {
        $connectionPool = GeneralUtility::makeInstance(ConnectionPool::class);

        return $connectionPool->getQueryBuilderForTable($table);
    }
1984
}
1985