Passed
Pull Request — master (#672)
by Tomas Norre
07:41 queued 04:28
created

CrawlerController::expandParameters()   F

Complexity

Conditions 25
Paths 831

Size

Total Lines 129
Code Lines 74

Duplication

Lines 0
Ratio 0 %

Code Coverage

Tests 57
CRAP Score 28.2868

Importance

Changes 1
Bugs 0 Features 0
Metric Value
cc 25
eloc 74
c 1
b 0
f 0
nc 831
nop 2
dl 0
loc 129
ccs 57
cts 69
cp 0.8261
crap 28.2868
rs 0.2347

How to fix   Long Method    Complexity   

Long Method

Small methods make your code easier to understand, in particular if combined with a good name. Besides, if your method is small, finding a good name is usually much easier.

For example, if you find yourself adding comments to a method's body, this is usually a good sign to extract the commented part to a new method, and use the comment as a starting point when coming up with a good name for this new method.

Commonly applied refactorings include:

1
<?php
2
3
declare(strict_types=1);
4
5
namespace AOE\Crawler\Controller;
6
7
/***************************************************************
8
 *  Copyright notice
9
 *
10
 *  (c) 2020 AOE GmbH <[email protected]>
11
 *
12
 *  All rights reserved
13
 *
14
 *  This script is part of the TYPO3 project. The TYPO3 project is
15
 *  free software; you can redistribute it and/or modify
16
 *  it under the terms of the GNU General Public License as published by
17
 *  the Free Software Foundation; either version 3 of the License, or
18
 *  (at your option) any later version.
19
 *
20
 *  The GNU General Public License can be found at
21
 *  http://www.gnu.org/copyleft/gpl.html.
22
 *
23
 *  This script is distributed in the hope that it will be useful,
24
 *  but WITHOUT ANY WARRANTY; without even the implied warranty of
25
 *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
26
 *  GNU General Public License for more details.
27
 *
28
 *  This copyright notice MUST APPEAR in all copies of the script!
29
 ***************************************************************/
30
31
use AOE\Crawler\Configuration\ExtensionConfigurationProvider;
32
use AOE\Crawler\Converter\JsonCompatibilityConverter;
33
use AOE\Crawler\Crawler;
34
use AOE\Crawler\CrawlStrategy\CrawlStrategyFactory;
35
use AOE\Crawler\Domain\Model\Process;
36
use AOE\Crawler\Domain\Repository\ConfigurationRepository;
37
use AOE\Crawler\Domain\Repository\ProcessRepository;
38
use AOE\Crawler\Domain\Repository\QueueRepository;
39
use AOE\Crawler\QueueExecutor;
40
use AOE\Crawler\Service\UrlService;
41
use AOE\Crawler\Utility\SignalSlotUtility;
42
use AOE\Crawler\Value\QueueFilter;
43
use PDO;
44
use Psr\Http\Message\UriInterface;
45
use Psr\Log\LoggerAwareInterface;
46
use Psr\Log\LoggerAwareTrait;
47
use TYPO3\CMS\Backend\Tree\View\PageTreeView;
48
use TYPO3\CMS\Backend\Utility\BackendUtility;
49
use TYPO3\CMS\Core\Authentication\BackendUserAuthentication;
50
use TYPO3\CMS\Core\Compatibility\PublicMethodDeprecationTrait;
51
use TYPO3\CMS\Core\Compatibility\PublicPropertyDeprecationTrait;
52
use TYPO3\CMS\Core\Core\Bootstrap;
53
use TYPO3\CMS\Core\Core\Environment;
54
use TYPO3\CMS\Core\Database\Connection;
55
use TYPO3\CMS\Core\Database\ConnectionPool;
56
use TYPO3\CMS\Core\Database\Query\QueryBuilder;
57
use TYPO3\CMS\Core\Database\Query\Restriction\DeletedRestriction;
58
use TYPO3\CMS\Core\Database\QueryGenerator;
59
use TYPO3\CMS\Core\Exception\SiteNotFoundException;
60
use TYPO3\CMS\Core\Imaging\Icon;
61
use TYPO3\CMS\Core\Imaging\IconFactory;
62
use TYPO3\CMS\Core\Routing\InvalidRouteArgumentsException;
63
use TYPO3\CMS\Core\Site\Entity\Site;
64
use TYPO3\CMS\Core\Type\Bitmask\Permission;
65
use TYPO3\CMS\Core\TypoScript\Parser\TypoScriptParser;
66
use TYPO3\CMS\Core\Utility\DebugUtility;
67
use TYPO3\CMS\Core\Utility\GeneralUtility;
68
use TYPO3\CMS\Core\Utility\MathUtility;
69
use TYPO3\CMS\Extbase\Object\ObjectManager;
70
use TYPO3\CMS\Frontend\Page\PageRepository;
71
72
/**
73
 * Class CrawlerController
74
 *
75
 * @package AOE\Crawler\Controller
76
 */
77
class CrawlerController implements LoggerAwareInterface
78
{
79
    use LoggerAwareTrait;
80
    use PublicMethodDeprecationTrait;
81
    use PublicPropertyDeprecationTrait;
82
83
    public const CLI_STATUS_NOTHING_PROCCESSED = 0;
84
85
    //queue not empty
86
    public const CLI_STATUS_REMAIN = 1;
87
88
    // (some) queue items were processed
89
    public const CLI_STATUS_PROCESSED = 2;
90
91
    //instance didn't finish
92
    public const CLI_STATUS_ABORTED = 4;
93
94
    public const CLI_STATUS_POLLABLE_PROCESSED = 8;
95
96
    /**
97
     * @var integer
98
     */
99
    public $setID = 0;
100
101
    /**
102
     * @var string
103
     */
104
    public $processID = '';
105
106
    /**
107
     * @var array
108
     */
109
    public $duplicateTrack = [];
110
111
    /**
112
     * @var array
113
     */
114
    public $downloadUrls = [];
115
116
    /**
117
     * @var array
118
     */
119
    public $incomingProcInstructions = [];
120
121
    /**
122
     * @var array
123
     */
124
    public $incomingConfigurationSelection = [];
125
126
    /**
127
     * @var bool
128
     */
129
    public $registerQueueEntriesInternallyOnly = false;
130
131
    /**
132
     * @var array
133
     */
134
    public $queueEntries = [];
135
136
    /**
137
     * @var array
138
     */
139
    public $urlList = [];
140
141
    /**
142
     * @var array
143
     */
144
    public $extensionSettings = [];
145
146
    /**
147
     * Mount Point
148
     *
149
     * @var bool
150
     * Todo: Check what this is used for and adjust the type hint or code, as bool doesn't match the current code.
151
     */
152
    public $MP = false;
153
154
    /**
155
     * @var string
156
     * @deprecated
157
     */
158
    protected $processFilename;
159
160
    /**
161
     * Holds the internal access mode can be 'gui','cli' or 'cli_im'
162
     *
163
     * @var string
164
     * @deprecated
165
     */
166
    protected $accessMode;
167
168
    /**
169
     * @var QueueRepository
170
     */
171
    protected $queueRepository;
172
173
    /**
174
     * @var ProcessRepository
175
     */
176
    protected $processRepository;
177
178
    /**
179
     * @var ConfigurationRepository
180
     */
181
    protected $configurationRepository;
182
183
    /**
184
     * @var string
185
     */
186
    protected $tableName = 'tx_crawler_queue';
187
188
    /**
189
     * @var QueueExecutor
190
     */
191
    protected $queueExecutor;
192
193
    /**
194
     * @var int
195
     */
196
    protected $maximumUrlsToCompile = 10000;
197
198
    /**
199
     * @var IconFactory
200
     */
201
    protected $iconFactory;
202
203
    /**
204
     * @var string[]
205
     */
206
    private $deprecatedPublicMethods = [
0 ignored issues
show
introduced by
The private property $deprecatedPublicMethods is not used, and could be removed.
Loading history...
207
        'cleanUpOldQueueEntries' => 'Using CrawlerController::cleanUpOldQueueEntries() is deprecated since 9.0.1 and will be removed in v11.x, please use QueueRepository->cleanUpOldQueueEntries() instead.',
208
        'CLI_debug' => 'Using CrawlerController->CLI_debug() is deprecated since 9.1.3 and will be removed in v11.x',
209
        'getAccessMode' => 'Using CrawlerController->getAccessMode() is deprecated since 9.1.3 and will be removed in v11.x',
210
        'getLogEntriesForSetId' => 'Using crawlerController::getLogEntriesForSetId() is deprecated since 9.0.1 and will be removed in v11.x',
211
        'flushQueue' => 'Using CrawlerController::flushQueue() is deprecated since 9.0.1 and will be removed in v11.x, please use QueueRepository->flushQueue() instead.',
212
        'setAccessMode' => 'Using CrawlerController->setAccessMode() is deprecated since 9.1.3 and will be removed in v11.x',
213
        'getDisabled' => 'Using CrawlerController->getDisabled() is deprecated since 9.1.3 and will be removed in v11.x, please use Crawler->isDisabled() instead',
214
        'setDisabled' => 'Using CrawlerController->setDisabled() is deprecated since 9.1.3 and will be removed in v11.x, please use Crawler->setDisabled() instead',
215
        'getProcessFilename' => 'Using CrawlerController->getProcessFilename() is deprecated since 9.1.3 and will be removed in v11.x',
216
        'setProcessFilename' => 'Using CrawlerController->setProcessFilename() is deprecated since 9.1.3 and will be removed in v11.x',
217
        'getDuplicateRowsIfExist' => 'Using CrawlerController->getDuplicateRowsIfExist() is deprecated since 9.1.4 and will be remove in v11.x, please use QueueRepository->getDuplicateQueueItemsIfExists() instead',
218
219
    ];
220
221
    /**
222
     * @var string[]
223
     */
224
    private $deprecatedPublicProperties = [
        // NOTE(review): presumably consumed by PublicPropertyDeprecationTrait (used by this class),
        // which would explain why static analysis reports the property as unused - confirm.
        // Maps a property name to the deprecation message emitted for it.
        'accessMode' => 'Using CrawlerController->accessMode is deprecated since 9.1.3 and will be removed in v11.x',
        // The message previously named "accessMode" here (copy/paste slip); it now names the right property.
        'processFilename' => 'Using CrawlerController->processFilename is deprecated since 9.1.3 and will be removed in v11.x',
    ];
228
229
    /**
230
     * @var BackendUserAuthentication|null
231
     */
232
    private $backendUser;
233
234
    /**
235
     * @var integer
236
     */
237
    private $scheduledTime = 0;
238
239
    /**
240
     * @var integer
241
     */
242
    private $reqMinute = 0;
243
244
    /**
245
     * @var bool
246
     */
247
    private $submitCrawlUrls = false;
248
249
    /**
250
     * @var bool
251
     */
252
    private $downloadCrawlUrls = false;
253
254
    /**
255
     * @var PageRepository
256
     */
257
    private $pageRepository;
258
259
    /**
260
     * @var Crawler
261
     */
262
    private $crawler;
263
264
    /************************************
265
     *
266
     * Getting URLs based on Page TSconfig
267
     *
268
     ************************************/
269
270 36
    /**
     * Instantiates the repositories and helper services used by the controller
     * and loads the extension settings, applying defaults for missing values.
     */
    public function __construct()
    {
        $objectManager = GeneralUtility::makeInstance(ObjectManager::class);
        $crawlStrategyFactory = GeneralUtility::makeInstance(CrawlStrategyFactory::class);
        $this->queueRepository = $objectManager->get(QueueRepository::class);
        $this->processRepository = $objectManager->get(ProcessRepository::class);
        $this->configurationRepository = $objectManager->get(ConfigurationRepository::class);
        $this->pageRepository = $objectManager->get(PageRepository::class);
        $this->queueExecutor = GeneralUtility::makeInstance(QueueExecutor::class, $crawlStrategyFactory);
        $this->iconFactory = GeneralUtility::makeInstance(IconFactory::class);
        $this->crawler = GeneralUtility::makeInstance(Crawler::class);

        // Default location of the (deprecated) marker file checked by getDisabled().
        $this->processFilename = Environment::getVarPath() . '/lock/tx_crawler.proc';

        /** @var ExtensionConfigurationProvider $configurationProvider */
        $configurationProvider = GeneralUtility::makeInstance(ExtensionConfigurationProvider::class);
        $settings = $configurationProvider->getExtensionConfiguration();
        $this->extensionSettings = is_array($settings) ? $settings : [];

        // Set defaults. Every setting is read through "??" because the array may be
        // empty (see the fallback above) or incomplete; a missing key would otherwise
        // raise an "undefined array key" notice/warning before the default is applied.
        if (MathUtility::convertToPositiveInteger($this->extensionSettings['countInARun'] ?? 0) === 0) {
            $this->extensionSettings['countInARun'] = 100;
        }

        $this->extensionSettings['processLimit'] = MathUtility::forceIntegerInRange(
            $this->extensionSettings['processLimit'] ?? 0,
            1,
            99,
            1
        );
        $this->maximumUrlsToCompile = MathUtility::forceIntegerInRange(
            $this->extensionSettings['maxCompileUrls'] ?? 0,
            1,
            1000000000,
            10000
        );
    }
297
298
    /**
299
     * Method to set the accessMode can be gui, cli or cli_im
300
     *
301
     * @return string
302
     * @deprecated
303
     */
304 1
    public function getAccessMode()
    {
        // Plain read accessor for the deprecated $accessMode property.
        $currentMode = $this->accessMode;

        return $currentMode;
    }
308
309
    /**
310
     * @param string $accessMode
311
     * @deprecated
312
     */
313 1
    public function setAccessMode($accessMode): void
    {
        // Stored as given; the value is not validated here.
        $this->accessMode = $accessMode;
    }
317
318
    /**
319
     * Set disabled status to prevent processes from being processed
320
     *
321
     * @param bool $disabled (optional, defaults to true)
322
     * @deprecated
323
     */
324 2
    public function setDisabled($disabled = true): void
    {
        // Disabling: write an empty marker file at the configured location.
        if ($disabled) {
            GeneralUtility::writeFile($this->processFilename, '');

            return;
        }

        // Enabling: remove the marker file, but only when one is actually present.
        if (is_file($this->processFilename)) {
            unlink($this->processFilename);
        }
    }
334
335
    /**
336
     * Get disable status
337
     *
338
     * @return bool true if disabled
339
     * @deprecated
340
     */
341 2
    public function getDisabled()
    {
        // The disabled state is represented by the existence of the marker file
        // that setDisabled() writes and removes.
        $markerFile = $this->processFilename;

        return is_file($markerFile);
    }
345
346
    /**
347
     * @param string $filenameWithPath
348
     * @deprecated
349
     */
350 3
    public function setProcessFilename($filenameWithPath): void
    {
        // Overrides the marker file location that the constructor initialised.
        $this->processFilename = $filenameWithPath;
    }
354
355
    /**
356
     * @return string
357
     * @deprecated
358
     */
359 1
    public function getProcessFilename()
    {
        // Plain read accessor for the deprecated $processFilename property.
        $markerFile = $this->processFilename;

        return $markerFile;
    }
363
364
    /**
365
     * Sets the extensions settings (unserialized pendant of $TYPO3_CONF_VARS['EXT']['extConf']['crawler']).
366
     */
367 14
    public function setExtensionSettings(array $extensionSettings): void
    {
        // Replaces the settings wholesale; unlike the constructor, no defaults are applied here.
        $this->extensionSettings = $extensionSettings;
    }
371
372
    /**
373
     * Check if the given page should be crawled
374
     *
375
     * @return false|string false if the page should be crawled (not excluded), true / skipMessage if it should be skipped
376
     */
377 12
    public function checkIfPageShouldBeSkipped(array $pageRow)
    {
        // Hidden pages are skipped unless the "crawlHiddenPages" setting is truthy.
        // empty() keeps the original truthiness test but tolerates missing keys.
        if (empty($this->extensionSettings['crawlHiddenPages']) && ! empty($pageRow['hidden'])) {
            return 'Because page is hidden';
        }

        // Normalised once: inList() works on strings and the row may carry an int or be incomplete.
        $doktype = (string) ($pageRow['doktype'] ?? '');

        // Doktypes that are never crawled, independent of any configuration.
        if (GeneralUtility::inList('3,4,199,254,255', $doktype)) {
            return 'Because doktype is not allowed';
        }

        // Doktypes excluded through the "excludeDoktype" registry; the first match wins.
        foreach ($GLOBALS['TYPO3_CONF_VARS']['EXTCONF']['crawler']['excludeDoktype'] ?? [] as $key => $doktypeList) {
            if (GeneralUtility::inList($doktypeList, $doktype)) {
                return 'Doktype was excluded by "' . $key . '"';
            }
        }

        // Veto hooks: each hook is expected to return false if the page is fine, or
        // true / a skip message if the page must not be crawled. The first veto wins,
        // so later hooks are not executed.
        foreach ($GLOBALS['TYPO3_CONF_VARS']['EXTCONF']['crawler']['pageVeto'] ?? [] as $key => $func) {
            $params = [
                'pageRow' => $pageRow,
            ];
            $veto = GeneralUtility::callUserFunction($func, $params, $this);
            if ($veto === false) {
                continue;
            }

            if (is_string($veto)) {
                return $veto;
            }

            // Hooks may be registered under numeric keys; with strict_types enabled
            // htmlspecialchars() rejects an int, hence the explicit cast.
            return 'Veto from hook "' . htmlspecialchars((string) $key) . '"';
        }

        return false;
    }
418
419
    /**
420
     * Wrapper method for getUrlsForPageId()
421
     * It returns an array of configurations and no urls!
422
     *
423
     * @param array $pageRow Page record with at least dok-type and uid columns.
424
     * @param string $skipMessage
425
     * @return array
426
     * @see getUrlsForPageId()
427
     */
428 6
    public function getUrlsForPageRow(array $pageRow, &$skipMessage = '')
    {
        $message = $this->checkIfPageShouldBeSkipped($pageRow);

        // Anything other than false is the reason the page is skipped; it is handed
        // back through the by-reference $skipMessage and no configurations are returned.
        if ($message !== false) {
            $skipMessage = $message;

            return [];
        }

        // getUrlsForPageId() declares an int parameter and this file uses strict_types,
        // so a uid delivered as a string would raise a TypeError without the cast
        // (urlListFromUrlArray() casts the uid the same way).
        $configurations = $this->getUrlsForPageId((int) $pageRow['uid']);
        $skipMessage = '';

        return $configurations;
    }
441
442
    /**
443
     * Creates a list of URLs from input array (and submits them to queue if asked for)
444
     * See Web > Info module script + "indexed_search"'s crawler hook-client using this!
445
     *
446
     * @param array $vv Information about URLs from pageRow to crawl.
447
     * @param array $pageRow Page row
448
     * @param int $scheduledTime Unix time to schedule indexing to, typically time()
449
     * @param int $reqMinute Number of requests per minute (creates the interleave between requests)
450
     * @param bool $submitCrawlUrls If set, submits the URLs to queue
451
     * @param bool $downloadCrawlUrls If set (and submitcrawlUrls is false) will fill $downloadUrls with entries)
452
     * @param array $duplicateTrack Array which is passed by reference and contains the an id per url to secure we will not crawl duplicates
453
     * @param array $downloadUrls Array which will be filled with URLS for download if flag is set.
454
     * @param array $incomingProcInstructions Array of processing instructions
455
     * @return string List of URLs (meant for display in backend module)
456
     */
457 4
    public function urlListFromUrlArray(
        array $vv,
        array $pageRow,
        $scheduledTime,
        $reqMinute,
        $submitCrawlUrls,
        $downloadCrawlUrls,
        array &$duplicateTrack,
        array &$downloadUrls,
        array $incomingProcInstructions
    ) {
        if (! is_array($vv['URLs'] ?? null)) {
            return 'ERROR - no URL generated';
        }

        // Optional sub-configuration values are read once, with fallbacks, so that a
        // configuration lacking e.g. "userGroups" does not raise undefined-key notices.
        $subCfg = $vv['subCfg'] ?? [];
        $procInstrFilter = $subCfg['procInstrFilter'] ?? '';
        $userGroups = $subCfg['userGroups'] ?? '';

        $pageId = (int) $pageRow['uid'];
        $configurationHash = $this->getConfigurationHash($vv);
        $skipInnerCheck = $this->queueRepository->noUnprocessedQueueEntriesForPageWithConfigurationHashExist($pageId, $configurationHash);

        // The processing-instruction filter does not depend on the individual URL:
        // if it rejects this configuration, every URL would be skipped and the log stays empty.
        if (! $this->drawURLs_PIfilter($procInstrFilter, $incomingProcInstructions)) {
            return '';
        }

        // Seconds between two requests. Guarded because a $reqMinute of 0 would be a division by zero.
        $secondsPerRequest = $reqMinute > 0 ? 60 / $reqMinute : 0.0;

        $urlService = new UrlService();
        $urlLog = [];

        foreach ($vv['URLs'] as $urlQuery) {
            $url = (string) $urlService->getUrlFromPageAndQueryParameters(
                $pageId,
                $urlQuery,
                $subCfg['baseUrl'] ?? null,
                $subCfg['force_ssl'] ?? 0
            );

            // Create key by which to determine unique-ness:
            $uKey = $url . '|' . $userGroups . '|' . $procInstrFilter;

            if (isset($duplicateTrack[$uKey])) {
                // If the url key is already registered, just display it and do not resubmit it.
                $urlLog[] = '<em><span class="text-muted">' . htmlspecialchars($url) . '</span></em>';
            } else {
                // Scheduled time: offset by the number of URLs tracked so far, floored to a full minute.
                $schTime = $scheduledTime + round(count($duplicateTrack) * $secondsPerRequest);
                $schTime = intval($schTime / 60) * 60;
                $formattedDate = BackendUtility::datetime($schTime);
                $this->urlList[] = '[' . $formattedDate . '] ' . $url;
                $urlList = '[' . $formattedDate . '] ' . htmlspecialchars($url);

                // Submit for crawling!
                if ($submitCrawlUrls) {
                    $added = $this->addUrl(
                        $pageId,
                        $url,
                        $subCfg,
                        $scheduledTime,
                        $configurationHash,
                        $skipInnerCheck
                    );
                    if ($added === false) {
                        $urlList .= ' (URL already existed)';
                    }
                } elseif ($downloadCrawlUrls) {
                    $downloadUrls[$url] = $url;
                }
                $urlLog[] = $urlList;
            }
            $duplicateTrack[$uKey] = true;
        }

        return implode('<br>', $urlLog);
    }
526
527
    /**
528
     * Returns true if input processing instruction is among registered ones.
529
     *
530
     * @param string $piString PI to test
531
     * @param array $incomingProcInstructions Processing instructions
532
     * @return boolean
533
     */
534 5
    public function drawURLs_PIfilter($piString, array $incomingProcInstructions)
    {
        // Without incoming instructions there is nothing to filter on: accept.
        if ($incomingProcInstructions === []) {
            return true;
        }

        // Accept as soon as one incoming instruction is an item of the comma list $piString.
        $matched = false;
        foreach ($incomingProcInstructions as $instruction) {
            if (GeneralUtility::inList($piString, $instruction)) {
                $matched = true;
                break;
            }
        }

        return $matched;
    }
547
548 5
    /**
     * Returns the page TSconfig for the given page id, or for the mount point page
     * when $this->MP is set, after passing it through the registered
     * "getPageTSconfigForId" hooks.
     *
     * @param int $id Page id the TSconfig is requested for
     * @return array Page TSconfig, possibly modified by hooks
     */
    public function getPageTSconfigForId($id): array
    {
        if (! $this->MP) {
            $pageTSconfig = BackendUtility::getPagesTSconfig($id);
        } else {
            // NOTE(review): $MP is documented as bool, but the code treats it as a
            // "<x>-<mountPointId>" string - confirm the intended type. The casts keep
            // explode() from failing on a non-string under strict_types and hand
            // getPagesTSconfig() an integer; a value without "-" falls back to 0
            // instead of triggering an undefined offset.
            $mountPointParts = explode('-', (string) $this->MP);
            $mountPointId = (int) ($mountPointParts[1] ?? 0);
            $pageTSconfig = BackendUtility::getPagesTSconfig($mountPointId);
        }

        // Call hooks to alter the configuration. Read through "??" because the
        // registry key does not exist when no hook is registered.
        $hooks = $GLOBALS['TYPO3_CONF_VARS']['EXTCONF']['crawler']['getPageTSconfigForId'] ?? null;
        if (is_array($hooks)) {
            $params = [
                'pageId' => $id,
                // Passed by reference so a hook can modify the configuration in place.
                'pageTSConfig' => &$pageTSconfig,
            ];
            foreach ($hooks as $userFunc) {
                GeneralUtility::callUserFunction($userFunc, $params, $this);
            }
        }

        return $pageTSconfig;
    }
570
571
    /**
572
     * This methods returns an array of configurations.
573
     * Adds no urls!
574
     */
575 4
    public function getUrlsForPageId(int $pageId): array
576
    {
577
        // Get page TSconfig for page ID
578 4
        $pageTSconfig = $this->getPageTSconfigForId($pageId);
579
580 4
        $res = [];
581
582
        // Fetch Crawler Configuration from pageTSconfig
583 4
        $crawlerCfg = $pageTSconfig['tx_crawler.']['crawlerCfg.']['paramSets.'] ?? [];
584 4
        foreach ($crawlerCfg as $key => $values) {
585 3
            if (! is_array($values)) {
586 3
                continue;
587
            }
588 3
            $key = str_replace('.', '', $key);
589
            // Sub configuration for a single configuration string:
590 3
            $subCfg = (array) $crawlerCfg[$key . '.'];
591 3
            $subCfg['key'] = $key;
592
593 3
            if (strcmp($subCfg['procInstrFilter'] ?? '', '')) {
594 3
                $subCfg['procInstrFilter'] = implode(',', GeneralUtility::trimExplode(',', $subCfg['procInstrFilter']));
595
            }
596 3
            $pidOnlyList = implode(',', GeneralUtility::trimExplode(',', $subCfg['pidsOnly'], true));
597
598
            // process configuration if it is not page-specific or if the specific page is the current page:
599
            // TODO: Check if $pidOnlyList can be kept as Array instead of imploded
600 3
            if (! strcmp((string) $subCfg['pidsOnly'], '') || GeneralUtility::inList($pidOnlyList, strval($pageId))) {
601
602
                // Explode, process etc.:
603 3
                $res[$key] = [];
604 3
                $res[$key]['subCfg'] = $subCfg;
605 3
                $res[$key]['paramParsed'] = GeneralUtility::explodeUrl2Array($crawlerCfg[$key]);
606 3
                $res[$key]['paramExpanded'] = $this->expandParameters($res[$key]['paramParsed'], $pageId);
607 3
                $res[$key]['origin'] = 'pagets';
608
609
                // recognize MP value
610 3
                if (! $this->MP) {
611 3
                    $res[$key]['URLs'] = $this->compileUrls($res[$key]['paramExpanded'], ['?id=' . $pageId]);
612
                } else {
613
                    $res[$key]['URLs'] = $this->compileUrls($res[$key]['paramExpanded'], ['?id=' . $pageId . '&MP=' . $this->MP]);
0 ignored issues
show
Bug introduced by
Are you sure $this->MP of type true can be used in concatenation? ( Ignorable by Annotation )

If this is a false-positive, you can also ignore this issue in your code via the ignore-type  annotation

613
                    $res[$key]['URLs'] = $this->compileUrls($res[$key]['paramExpanded'], ['?id=' . $pageId . '&MP=' . /** @scrutinizer ignore-type */ $this->MP]);
Loading history...
614
                }
615
            }
616
        }
617
618
        // Get configuration from tx_crawler_configuration records up the rootline
619 4
        $crawlerConfigurations = $this->configurationRepository->getCrawlerConfigurationRecordsFromRootLine($pageId);
620 4
        foreach ($crawlerConfigurations as $configurationRecord) {
621
622
            // check access to the configuration record
623 1
            if (empty($configurationRecord['begroups']) || $this->getBackendUser()->isAdmin() || $this->hasGroupAccess($this->getBackendUser()->user['usergroup_cached_list'], $configurationRecord['begroups'])) {
624 1
                $pidOnlyList = implode(',', GeneralUtility::trimExplode(',', $configurationRecord['pidsonly'], true));
625
626
                // process configuration if it is not page-specific or if the specific page is the current page:
627
                // TODO: Check if $pidOnlyList can be kept as Array instead of imploded
628 1
                if (! strcmp($configurationRecord['pidsonly'], '') || GeneralUtility::inList($pidOnlyList, strval($pageId))) {
629 1
                    $key = $configurationRecord['name'];
630
631
                    // don't overwrite previously defined paramSets
632 1
                    if (! isset($res[$key])) {
633
634
                        /* @var $TSparserObject TypoScriptParser */
635 1
                        $TSparserObject = GeneralUtility::makeInstance(TypoScriptParser::class);
636 1
                        $TSparserObject->parse($configurationRecord['processing_instruction_parameters_ts']);
637
638
                        $subCfg = [
639 1
                            'procInstrFilter' => $configurationRecord['processing_instruction_filter'],
640 1
                            'procInstrParams.' => $TSparserObject->setup,
641 1
                            'baseUrl' => $configurationRecord['base_url'],
642 1
                            'force_ssl' => (int) $configurationRecord['force_ssl'],
643 1
                            'userGroups' => $configurationRecord['fegroups'],
644 1
                            'exclude' => $configurationRecord['exclude'],
645 1
                            'key' => $key,
646
                        ];
647
648 1
                        if (! in_array($pageId, $this->expandExcludeString($subCfg['exclude']), true)) {
649 1
                            $res[$key] = [];
650 1
                            $res[$key]['subCfg'] = $subCfg;
651 1
                            $res[$key]['paramParsed'] = GeneralUtility::explodeUrl2Array($configurationRecord['configuration']);
652 1
                            $res[$key]['paramExpanded'] = $this->expandParameters($res[$key]['paramParsed'], $pageId);
653 1
                            $res[$key]['URLs'] = $this->compileUrls($res[$key]['paramExpanded'], ['?id=' . $pageId]);
654 1
                            $res[$key]['origin'] = 'tx_crawler_configuration_' . $configurationRecord['uid'];
655
                        }
656
                    }
657
                }
658
            }
659
        }
660
661 4
        foreach ($GLOBALS['TYPO3_CONF_VARS']['EXTCONF']['crawler']['processUrls'] ?? [] as $func) {
662
            $params = [
663
                'res' => &$res,
664
            ];
665
            GeneralUtility::callUserFunction($func, $params, $this);
666
        }
667 4
        return $res;
668
    }
669
670
    /**
     * Collects the names of all crawler configurations that apply to a page branch.
     *
     * Two sources are combined, in this order:
     * 1. the "tx_crawler.crawlerCfg.paramSets." entries of the page TSconfig of the root page
     *    (only entries whose value is an array, with a trailing dot removed from the key), and
     * 2. the "name" of every tx_crawler_configuration record stored on a page of the
     *    root line of $rootid or on a page of the page tree below $rootid.
     *
     * The returned list is not de-duplicated.
     *
     * TODO: Write Functional Tests
     *
     * @param int $rootid Page uid the branch starts at
     * @param int $depth Number of page tree levels below $rootid to include
     * @return array List of configuration names
     */
    public function getConfigurationsForBranch(int $rootid, int $depth): array
    {
        $names = [];

        // Source 1: parameter sets declared in page TSconfig.
        $tsConfig = $this->getPageTSconfigForId($rootid);
        $paramSets = $tsConfig['tx_crawler.']['crawlerCfg.']['paramSets.'] ?? [];
        foreach ($paramSets as $setKey => $setValue) {
            if (is_array($setValue)) {
                $endsWithDot = substr($setKey, -1) === '.';
                $names[] = $endsWithDot ? substr($setKey, 0, -1) : $setKey;
            }
        }

        // Page uids of the root line ...
        $pageUids = [];
        foreach (BackendUtility::BEgetRootLine($rootid) as $rootLinePage) {
            $pageUids[] = $rootLinePage['uid'];
        }

        // ... plus the page uids of the tree below the root page, limited by the user's "show" permission.
        /* @var PageTreeView $pageTree */
        $pageTree = GeneralUtility::makeInstance(PageTreeView::class);
        $permissionClause = $this->getBackendUser()->getPagePermsClause(Permission::PAGE_SHOW);
        if (empty($permissionClause)) {
            $pageTree->init('');
        } else {
            $pageTree->init('AND ' . $permissionClause);
        }
        $pageTree->getTree($rootid, $depth, '');
        foreach ($pageTree->tree as $treeEntry) {
            $pageUids[] = $treeEntry['row']['uid'];
        }

        // Source 2: configuration records stored on any of the collected pages.
        $queryBuilder = $this->getQueryBuilder('tx_crawler_configuration');
        $pidConstraint = $queryBuilder->expr()->in(
            'pid',
            $queryBuilder->createNamedParameter($pageUids, Connection::PARAM_INT_ARRAY)
        );
        $result = $queryBuilder
            ->select('name')
            ->from('tx_crawler_configuration')
            ->where($pidConstraint)
            ->execute();

        while ($record = $result->fetch()) {
            $names[] = $record['name'];
        }

        return $names;
    }
713
714
    /**
     * Checks whether a user may access an item, based on group membership.
     * (e.g. get the group list of the current logged in user from $GLOBALS['TSFE']->gr_list)
     *
     * @param string $groupList Comma-separated list of (fe_)group UIDs from a user
     * @param string $accessList Comma-separated list of (fe_)group UIDs of the item to access
     * @return bool TRUE if the access list is empty or at least one of the user's group UIDs is in the access list
     * @see \TYPO3\CMS\Frontend\Page\PageRepository::getMultipleGroupsWhereClause()
     */
    public function hasGroupAccess($groupList, $accessList)
    {
        // An item without any access restriction is open to every user.
        if (empty($accessList)) {
            return true;
        }

        $accessGranted = false;
        $userGroupUids = GeneralUtility::intExplode(',', $groupList);
        foreach ($userGroupUids as $userGroupUid) {
            if (GeneralUtility::inList($accessList, $userGroupUid)) {
                // One matching group is sufficient.
                $accessGranted = true;
                break;
            }
        }

        return $accessGranted;
    }
735
736
    /**
     * Expands the parameter configuration into the individual values of each parameter.
     *
     * Syntax of values:
     * - A value wrapped in [...] is expanded according to the rules below, any other value is taken literally.
     * - The bracket content is split by "|"; the parts are processed individually and their results are added together.
     * - For each part:
     *         - "[int]-[int]" = Integer range, expanded to all values in between, values included, from low to
     *           high (max. 1000 values). Example "1-34" or "-40--30"
     *         - "_TABLE:[TCA table name];[_PID:[optional page id, default is current page]];[_ENABLELANG:1]" =
     *           Look-up of table records from PID, filtering out deleted records. Example "_TABLE:tt_content; _PID:123"
     *           Further options read by this method: _RECURSIVE, _PIDFIELD, _WHERE, _ADDTABLE and _FIELD (default "uid").
     *           Each option is split at its first ":" only, so an option value may itself contain ":".
     *           _ENABLELANG:1 picks only original records without their language overlays
     *         - Default: Literal value
     * - After each part the functions registered in
     *   $GLOBALS['TYPO3_CONF_VARS']['SC_OPTIONS']['crawler/class.tx_crawler_lib.php']['expandParameters'] are called.
     *
     * @param array $paramArray Array with key (GET var name) and values (value of GET var which is configuration for expansion)
     * @param integer $pid Current page ID
     * @return array Same keys as the input, each mapped to an array of values
     *
     * TODO: Write Functional Tests
     */
    public function expandParameters($paramArray, $pid)
    {
        // Traverse parameter names:
        foreach ($paramArray as $p => $v) {
            $v = trim($v);

            // Only a value encapsulated in square brackets describes ranges/look-ups, anything else is literal.
            if (strpos($v, '[') !== 0 || substr($v, -1) !== ']') {
                // Set the literal value as only value in array:
                $paramArray[$p] = [$v];
                continue;
            }

            // Take the value inside the brackets and reset the paramArray value to an (empty) list.
            $v = substr($v, 1, -1);
            $paramArray[$p] = [];

            // Explode parts and traverse them:
            foreach (explode('|', $v) as $pV) {
                // Look for integer range: (fx. 1-34 or -40--30 // reads minus 40 to minus 30)
                if (preg_match('/^(-?[0-9]+)\s*-\s*(-?[0-9]+)$/', trim($pV), $reg)) {
                    $reg = $this->swapIfFirstIsLargerThanSecond($reg);

                    // Traverse range, add values; the brake limits the size of the range.
                    $runAwayBrake = 1000;
                    for ($a = $reg[1]; $a <= $reg[2]; $a++) {
                        $paramArray[$p][] = $a;
                        $runAwayBrake--;
                        if ($runAwayBrake <= 0) {
                            break;
                        }
                    }
                } elseif (strpos(trim($pV), '_TABLE:') === 0) {
                    // Parse the ";"-separated "KEY:value" options. Splitting at the first ":" only keeps
                    // values containing ":" intact; padding avoids an undefined offset for options without ":".
                    $subpartParams = [];
                    foreach (GeneralUtility::trimExplode(';', $pV) as $spV) {
                        [$pKey, $pVal] = array_pad(GeneralUtility::trimExplode(':', $spV, false, 2), 2, null);
                        $subpartParams[$pKey] = $pVal;
                    }
                    $tableName = $subpartParams['_TABLE'] ?? '';

                    // Table exists:
                    if (isset($GLOBALS['TCA'][$tableName])) {
                        $lookUpPid = isset($subpartParams['_PID']) ? (int) $subpartParams['_PID'] : (int) $pid;
                        $recursiveDepth = isset($subpartParams['_RECURSIVE']) ? (int) $subpartParams['_RECURSIVE'] : 0;
                        $pidField = isset($subpartParams['_PIDFIELD']) ? trim($subpartParams['_PIDFIELD']) : 'pid';
                        $where = $subpartParams['_WHERE'] ?? '';
                        $addTable = $subpartParams['_ADDTABLE'] ?? '';
                        $fieldName = empty($subpartParams['_FIELD']) ? 'uid' : $subpartParams['_FIELD'];

                        // Only "uid" or a column configured in TCA may be selected.
                        if ($fieldName === 'uid' || ! empty($GLOBALS['TCA'][$tableName]['columns'][$fieldName])) {
                            $queryBuilder = $this->getQueryBuilder($tableName);

                            if ($recursiveDepth > 0) {
                                /** @var QueryGenerator $queryGenerator */
                                $queryGenerator = GeneralUtility::makeInstance(QueryGenerator::class);
                                $pidList = $queryGenerator->getTreeList($lookUpPid, $recursiveDepth, 0, 1);
                                $pidArray = GeneralUtility::intExplode(',', $pidList);
                            } else {
                                $pidArray = [(string) $lookUpPid];
                            }

                            // Drop all default restrictions except the one for deleted records.
                            $queryBuilder->getRestrictions()
                                ->removeAll()
                                ->add(GeneralUtility::makeInstance(DeletedRestriction::class));

                            // NOTE(review): $where comes from the configuration and is passed on as a raw predicate.
                            $queryBuilder
                                ->select($fieldName)
                                ->from($tableName)
                                ->where(
                                    $queryBuilder->expr()->in($pidField, $queryBuilder->createNamedParameter($pidArray, Connection::PARAM_INT_ARRAY)),
                                    $where
                                );

                            if (! empty($addTable)) {
                                // TODO: Check if this works as intended!
                                $queryBuilder->add('from', $addTable);
                            }

                            $transOrigPointerField = $GLOBALS['TCA'][$tableName]['ctrl']['transOrigPointerField'] ?? null;
                            if (! empty($subpartParams['_ENABLELANG']) && $transOrigPointerField) {
                                $queryBuilder->andWhere(
                                    $queryBuilder->expr()->lte(
                                        $transOrigPointerField,
                                        0
                                    )
                                );
                            }

                            $statement = $queryBuilder->execute();

                            // Index by field value so that each value is added only once.
                            $rows = [];
                            while ($row = $statement->fetch()) {
                                $rows[$row[$fieldName]] = $row;
                            }

                            $paramArray[$p] = array_merge($paramArray[$p], array_keys($rows));
                        }
                    }
                } else {
                    // Just add value:
                    $paramArray[$p][] = $pV;
                }

                // Hook for processing own expandParameters place holder
                $expandParametersHooks = $GLOBALS['TYPO3_CONF_VARS']['SC_OPTIONS']['crawler/class.tx_crawler_lib.php']['expandParameters'] ?? null;
                if (is_array($expandParametersHooks)) {
                    $_params = [
                        'pObj' => &$this,
                        'paramArray' => &$paramArray,
                        'currentKey' => $p,
                        'currentValue' => $pV,
                        'pid' => $pid,
                    ];
                    foreach ($expandParametersHooks as $_funcRef) {
                        GeneralUtility::callUserFunction($_funcRef, $_params, $this);
                    }
                }
            }

            // Make unique set of values and sort the parameter array by key (parameter name).
            // Sorting here does not disturb the running foreach, which iterates over the array by value.
            $paramArray[$p] = array_unique($paramArray[$p]);
            ksort($paramArray);
        }

        return $paramArray;
    }
883
884
    /**
     * Compiling URLs from parameter array (output of expandParameters())
     * The number of URLs will be the multiplication of the number of parameter values for each key,
     * capped at $this->maximumUrlsToCompile per recursion step.
     *
     * An empty value adds nothing to the URL, i.e. the URL is carried over unchanged for that value.
     *
     * @param array $paramArray Output of expandParameters(): Array with keys (GET var names) and for each an array of values
     * @param array $urls URLs accumulated in this array (for recursion)
     * @return array
     */
    public function compileUrls($paramArray, array $urls)
    {
        if (empty($paramArray)) {
            return $urls;
        }

        // shift first off stack:
        reset($paramArray);
        // Numeric GET var names arrive as integer keys; rawurlencode() needs a string under strict types.
        $varName = (string) key($paramArray);
        $valueSet = array_shift($paramArray);

        // Traverse value set:
        $newUrls = [];
        foreach ($urls as $url) {
            foreach ($valueSet as $val) {
                // Stop compiling altogether once the limit is reached. Leaving only the inner loop
                // would let every remaining base URL add further entries beyond the limit.
                if (count($newUrls) >= $this->maximumUrlsToCompile) {
                    break 2;
                }

                $val = (string) $val;
                $newUrls[] = $val === ''
                    ? $url
                    : $url . '&' . rawurlencode($varName) . '=' . rawurlencode($val);
            }
        }

        return $this->compileUrls($paramArray, $newUrls);
    }
915
916
    /************************************
917
     *
918
     * Crawler log
919
     *
920
     ************************************/
921
922
    /**
     * Return array of records from crawler queue for input page ID
     *
     * The records are ordered by "scheduled", newest first. With the filter "pending" only records with
     * exec_time = 0 are returned, with "finished" only records with exec_time > 0.
     *
     * When $doFlush is set, $this->queueRepository->flushQueue($queueFilter) is called before the
     * records are selected. $doFullFlush makes no difference here: both cases issue the same call.
     * NOTE(review): the flush call receives only the filter, not the page ID - confirm the intended scope.
     *
     * @param integer $id Page ID for which to look up log entries.
     * @param QueueFilter $queueFilter Filter compared against "pending" and "finished"
     * @param boolean $doFlush If TRUE, the queue is flushed for the given filter before selecting
     * @param boolean $doFullFlush Has no effect beyond $doFlush (kept for backwards compatibility)
     * @param integer $itemsPerPage Limit the amount of entries per page default is 10, values <= 0 disable the limit
     * @return array
     */
    public function getLogEntriesForPageId($id, QueueFilter $queueFilter, $doFlush = false, $doFullFlush = false, $itemsPerPage = 10)
    {
        $queryBuilder = GeneralUtility::makeInstance(ConnectionPool::class)->getQueryBuilderForTable($this->tableName);
        $queryBuilder
            ->select('*')
            ->from($this->tableName)
            ->where(
                $queryBuilder->expr()->eq('page_id', $queryBuilder->createNamedParameter($id, PDO::PARAM_INT))
            )
            ->orderBy('scheduled', 'DESC');

        // Loose comparison on purpose: $queueFilter is an object that is compared with plain strings.
        switch ($queueFilter) {
            case 'pending':
                $queryBuilder->andWhere($queryBuilder->expr()->eq('exec_time', 0));
                break;
            case 'finished':
                $queryBuilder->andWhere($queryBuilder->expr()->gt('exec_time', 0));
                break;
        }

        if ($doFlush) {
            $this->queueRepository->flushQueue($queueFilter);
        }

        if ($itemsPerPage > 0) {
            $queryBuilder
                ->setMaxResults((int) $itemsPerPage);
        }

        return $queryBuilder->execute()->fetchAll();
    }
972
973
    /**
     * Return array of records from crawler queue for input set ID
     *
     * Without $doFlush the matching records are selected (ordered by "scheduled", newest first).
     * With $doFlush the deprecated flushQueue() is called instead and an empty array is returned:
     * for a full flush with an empty condition, otherwise with a condition built from the
     * filter and the set ID.
     *
     * @param int $set_id Set ID for which to look up log entries.
     * @param string $filter Filter: "all" => all entries, "pending" => all that is not yet run, "finished" => all complete ones
     * @param bool $doFlush If TRUE, entries are flushed instead of selected and an empty array is returned
     * @param bool $doFullFlush If TRUE (together with $doFlush), flushQueue() is called with an empty condition
     * @param int $itemsPerPage Limit the amount of entries per page default is 10
     * @return array
     *
     * @deprecated
     */
    public function getLogEntriesForSetId(int $set_id, string $filter = '', bool $doFlush = false, bool $doFullFlush = false, int $itemsPerPage = 10)
    {
        $connectionPool = GeneralUtility::makeInstance(ConnectionPool::class);
        $queryBuilder = $connectionPool->getQueryBuilderForTable($this->tableName);
        $setIdConstraint = $queryBuilder->expr()->eq('set_id', $queryBuilder->createNamedParameter($set_id, PDO::PARAM_INT));
        $queryBuilder
            ->select('*')
            ->from($this->tableName)
            ->where($setIdConstraint)
            ->orderBy('scheduled', 'DESC');

        // $flushConditions collects, AND-combined, the same constraints for a possible flush.
        $expressionBuilder = GeneralUtility::makeInstance(ConnectionPool::class)
            ->getConnectionForTable($this->tableName)
            ->getExpressionBuilder();
        $flushConditions = $expressionBuilder->andX();

        if ($filter === 'pending') {
            $queryBuilder->andWhere($queryBuilder->expr()->eq('exec_time', 0));
            $flushConditions->add($expressionBuilder->eq('exec_time', 0));
        } elseif ($filter === 'finished') {
            $queryBuilder->andWhere($queryBuilder->expr()->gt('exec_time', 0));
            $flushConditions->add($expressionBuilder->gt('exec_time', 0));
        }

        if ($doFlush) {
            $flushWhere = $flushConditions->add($expressionBuilder->eq('set_id', $set_id));
            if ($doFullFlush) {
                $flushWhere = '';
            }
            $this->flushQueue($flushWhere);

            return [];
        }

        if ($itemsPerPage > 0) {
            $queryBuilder->setMaxResults($itemsPerPage);
        }

        $result = $queryBuilder->execute();

        return $result->fetchAll();
    }
1025
1026
    /**
     * Adding call back entries to log (called from hooks typically, see indexed search class "class.crawler.php"
     *
     * Inserts one row into tx_crawler_queue whose JSON-encoded parameters carry the call back
     * reference under the key "_CALLBACKOBJ".
     *
     * @param integer $setId Set ID
     * @param array $params Parameters to pass to call back function; anything that is not an array is replaced by an empty array
     * @param string $callBack Call back object reference, eg. 'EXT:indexed_search/class.crawler.php:&tx_indexedsearch_crawler'
     * @param integer $page_id Page ID to attach it to
     * @param integer $schedule Time at which to activate; 0 (or any empty value) means "now"
     */
    public function addQueueEntry_callBack($setId, $params, $callBack, $page_id = 0, $schedule = 0): void
    {
        $callBackParameters = is_array($params) ? $params : [];
        $callBackParameters['_CALLBACKOBJ'] = $callBack;

        $queueConnection = GeneralUtility::makeInstance(ConnectionPool::class)
            ->getConnectionForTable('tx_crawler_queue');

        // Fall back to the current time when no schedule time is given.
        $scheduledTime = (int) $schedule;
        if (! $scheduledTime) {
            $scheduledTime = $this->getCurrentTime();
        }

        $queueRow = [
            'page_id' => (int) $page_id,
            'parameters' => json_encode($callBackParameters),
            'scheduled' => $scheduledTime,
            'exec_time' => 0,
            'set_id' => (int) $setId,
            'result_data' => '',
        ];

        $queueConnection->insert('tx_crawler_queue', $queueRow);
    }
1055
1056
    /************************************
1057
     *
1058
     * URL setting
1059
     *
1060
     ************************************/
1061
1062
    /**
     * Setting a URL for crawling:
     *
     * Builds the queue row for the URL and either registers it internally only
     * ($this->registerQueueEntriesInternallyOnly) or inserts it into tx_crawler_queue. Unless the
     * inner duplication check is skipped, nothing is inserted when the queue repository reports an
     * equal entry. A signal is emitted for both outcomes (URL added / duplicate found).
     *
     * Missing keys in $subCfg ("userGroups", "procInstrFilter", "procInstrParams.", "key") are
     * treated as not set instead of raising undefined index notices.
     *
     * @param integer $id Page ID
     * @param string $url Complete URL
     * @param array $subCfg Sub configuration array (from TS config)
     * @param integer $tstamp Scheduled-time
     * @param string $configurationHash (optional) configuration hash
     * @param bool $skipInnerDuplicationCheck (optional) skip inner duplication check
     * @return bool TRUE only if a new row was inserted into the queue table
     */
    public function addUrl(
        $id,
        $url,
        array $subCfg,
        $tstamp,
        $configurationHash = '',
        $skipInnerDuplicationCheck = false
    ) {
        $urlAdded = false;
        $rows = [];

        // Creating parameters:
        $parameters = [
            'url' => $url,
        ];

        // fe user group simulation:
        $uGs = implode(',', array_unique(GeneralUtility::intExplode(',', (string) ($subCfg['userGroups'] ?? ''), true)));
        if ($uGs) {
            $parameters['feUserGroupList'] = $uGs;
        }

        // Setting processing instructions
        $parameters['procInstructions'] = GeneralUtility::trimExplode(',', (string) ($subCfg['procInstrFilter'] ?? ''));
        if (is_array($subCfg['procInstrParams.'] ?? null)) {
            $parameters['procInstrParams'] = $subCfg['procInstrParams.'];
        }

        // Compile value array:
        $parameters_serialized = json_encode($parameters);
        $fieldArray = [
            'page_id' => (int) $id,
            'parameters' => $parameters_serialized,
            'parameters_hash' => GeneralUtility::shortMD5($parameters_serialized),
            'configuration_hash' => $configurationHash,
            'scheduled' => $tstamp,
            'exec_time' => 0,
            'set_id' => (int) $this->setID,
            'result_data' => '',
            'configuration' => $subCfg['key'] ?? null,
        ];

        if ($this->registerQueueEntriesInternallyOnly) {
            //the entries will only be registered and not stored to the database
            $this->queueEntries[] = $fieldArray;

            return $urlAdded;
        }

        if (! $skipInnerDuplicationCheck) {
            // check if there is already an equal entry
            $rows = $this->queueRepository->getDuplicateQueueItemsIfExists(
                (bool) ($this->extensionSettings['enableTimeslot'] ?? false),
                $tstamp,
                $this->getCurrentTime(),
                $fieldArray['page_id'],
                $fieldArray['parameters_hash']
            );
        }

        if (empty($rows)) {
            $connectionForCrawlerQueue = GeneralUtility::makeInstance(ConnectionPool::class)->getConnectionForTable('tx_crawler_queue');
            $connectionForCrawlerQueue->insert(
                'tx_crawler_queue',
                $fieldArray
            );
            $uid = $connectionForCrawlerQueue->lastInsertId('tx_crawler_queue', 'qid');
            $rows[] = $uid;
            $urlAdded = true;

            $signalPayload = ['uid' => $uid, 'fieldArray' => $fieldArray];
            SignalSlotUtility::emitSignal(
                self::class,
                SignalSlotUtility::SIGNAL_URL_ADDED_TO_QUEUE,
                $signalPayload
            );
        } else {
            $signalPayload = ['rows' => $rows, 'fieldArray' => $fieldArray];
            SignalSlotUtility::emitSignal(
                self::class,
                SignalSlotUtility::SIGNAL_DUPLICATE_URL_IN_QUEUE,
                $signalPayload
            );
        }

        return $urlAdded;
    }
1158
1159
    /**
     * Returns the current system time as reported by time().
     *
     * @return int Unix timestamp
     */
    public function getCurrentTime()
    {
        $currentTimestamp = time();

        return $currentTimestamp;
    }
1168
1169
    /************************************
1170
     *
1171
     * URL reading
1172
     *
1173
     ************************************/
1174
1175
    /**
     * Read URL for single queue entry
     *
     * Loads the queue record, sets its exec_time (which locks the record), executes it through the
     * queue executor and stores the JSON-encoded result in "result_data". Signals are emitted before
     * the execution and before the result is written.
     *
     * @param integer $queueId
     * @param boolean $force If set, will process even if exec_time has been set!
     * @return integer|null Bit mask: self::CLI_STATUS_POLLABLE_PROCESSED is set when a processing instruction
     *                      registered in "pollSuccess" reported success, otherwise 0. NULL (bare return) when
     *                      no matching queue record was found - kept as is for backwards compatibility.
     */
    public function readUrl($queueId, $force = false)
    {
        $queryBuilder = GeneralUtility::makeInstance(ConnectionPool::class)->getQueryBuilderForTable($this->tableName);
        $ret = 0;
        $this->logger->debug('crawler-readurl start ' . microtime(true));

        // Get entry:
        $queryBuilder
            ->select('*')
            ->from('tx_crawler_queue')
            ->where(
                $queryBuilder->expr()->eq('qid', $queryBuilder->createNamedParameter($queueId, PDO::PARAM_INT))
            );
        if (! $force) {
            $queryBuilder
                ->andWhere('exec_time = 0')
                ->andWhere('process_scheduled > 0');
        }
        $queueRec = $queryBuilder->execute()->fetch();

        if (! is_array($queueRec)) {
            return;
        }

        SignalSlotUtility::emitSignal(
            self::class,
            SignalSlotUtility::SIGNAL_QUEUEITEM_PREPROCESS,
            [$queueId, &$queueRec]
        );

        // Set exec_time to lock record:
        $field_array = ['exec_time' => $this->getCurrentTime()];

        if (isset($this->processID)) {
            //if multiprocessing is used we need to store the id of the process which has handled this entry
            $field_array['process_id_completed'] = $this->processID;
        }

        GeneralUtility::makeInstance(ConnectionPool::class)->getConnectionForTable('tx_crawler_queue')
            ->update(
                'tx_crawler_queue',
                $field_array,
                ['qid' => (int) $queueId]
            );

        $result = $this->queueExecutor->executeQueueItem($queueRec, $this);
        if (($result['content'] ?? null) === null) {
            $resultData = 'An errors happened';
        } else {
            /** @var JsonCompatibilityConverter $jsonCompatibilityConverter */
            $jsonCompatibilityConverter = GeneralUtility::makeInstance(JsonCompatibilityConverter::class);
            $resultData = $jsonCompatibilityConverter->convert($result['content']);
        }

        //atm there's no need to point to specific pollable extensions
        $pollables = $GLOBALS['TYPO3_CONF_VARS']['EXTCONF']['crawler']['pollSuccess'] ?? null;
        // In the error case $resultData is a plain string and must not be accessed like an array.
        $procInstructions = is_array($resultData) ? ($resultData['parameters']['procInstructions'] ?? null) : null;
        if (is_array($pollables) && is_array($procInstructions)) {
            foreach ($pollables as $pollable) {
                // only check the success value if the instruction is running
                // it is important to name the pollSuccess key same as the procInstructions key
                if (in_array($pollable, $procInstructions, true) && ! empty($resultData['success'][$pollable])) {
                    $ret |= self::CLI_STATUS_POLLABLE_PROCESSED;
                }
            }
        }

        // Set result in log which also denotes the end of the processing of this entry.
        $field_array = ['result_data' => json_encode($result)];

        SignalSlotUtility::emitSignal(
            self::class,
            SignalSlotUtility::SIGNAL_QUEUEITEM_POSTPROCESS,
            [$queueId, &$field_array]
        );

        GeneralUtility::makeInstance(ConnectionPool::class)->getConnectionForTable('tx_crawler_queue')
            ->update(
                'tx_crawler_queue',
                $field_array,
                ['qid' => (int) $queueId]
            );

        $this->logger->debug('crawler-readurl stop ' . microtime(true));
        return $ret;
    }
1272
1273
    /**
     * Read URL for not-yet-inserted log-entry
     *
     * The entry is inserted into the queue table with exec_time set to the current time, executed
     * through the queue executor, and finally updated with the JSON-encoded result.
     *
     * @param array $field_array Queue field array,
     *
     * @return array|bool|mixed|string Whatever the queue executor returned for the entry
     */
    public function readUrlFromArray($field_array)
    {
        $queueTable = $this->tableName;

        // Set exec_time to lock record:
        $field_array['exec_time'] = $this->getCurrentTime();

        $queueConnection = GeneralUtility::makeInstance(ConnectionPool::class)->getConnectionForTable($queueTable);
        $queueConnection->insert($queueTable, $field_array);
        $insertedQueueId = $queueConnection->lastInsertId($queueTable, 'qid');

        $executionResult = $this->queueExecutor->executeQueueItem($field_array, $this);

        // Set result in log which also denotes the end of the processing of this entry.
        // The array is handed to the signal by reference, so slots may adjust what gets stored.
        $resultFields = ['result_data' => json_encode($executionResult)];
        $signalArguments = [$insertedQueueId, &$resultFields];
        SignalSlotUtility::emitSignal(
            self::class,
            SignalSlotUtility::SIGNAL_QUEUEITEM_POSTPROCESS,
            $signalArguments
        );

        $queueConnection->update($queueTable, $resultFields, ['qid' => $insertedQueueId]);

        return $executionResult;
    }
1309
1310
    /*****************************
1311
     *
1312
     * Compiling URLs to crawl - tools
1313
     *
1314
     *****************************/
1315
1316
    /**
     * Walks the page tree below a root page and compiles the URL rows for every page found.
     *
     * The crawl settings are stored on the controller first, because drawURLs_addRowsForPage()
     * reads them from there while the rows are built.
     *
     * @param integer $id Root page id to start from.
     * @param integer $depth Depth of tree, 0=only id-page, 1= on sublevel, 99 = infinite
     * @param integer $scheduledTime Unix Time when the URL is timed to be visited when put in queue
     * @param integer $reqMinute Number of requests per minute (creates the interleave between requests)
     * @param boolean $submitCrawlUrls If set, submits the URLs to queue in database (real crawling)
     * @param boolean $downloadCrawlUrls If set (and submitcrawlUrls is false) will fill $downloadUrls with entries)
     * @param array $incomingProcInstructions Array of processing instructions
     * @param array $configurationSelection Array of configuration keys
     * @return string Concatenated output of drawURLs_addRowsForPage() for all visited pages
     */
    public function getPageTreeAndUrls(
        $id,
        $depth,
        $scheduledTime,
        $reqMinute,
        $submitCrawlUrls,
        $downloadCrawlUrls,
        array $incomingProcInstructions,
        array $configurationSelection
    ) {
        // Start every run with empty tracking state
        $this->duplicateTrack = [];
        $this->downloadUrls = [];

        $this->scheduledTime = $scheduledTime;
        $this->reqMinute = $reqMinute;
        $this->submitCrawlUrls = $submitCrawlUrls;
        $this->downloadCrawlUrls = $downloadCrawlUrls;
        $this->incomingProcInstructions = $incomingProcInstructions;
        $this->incomingConfigurationSelection = $configurationSelection;

        // Only pages the backend user is allowed to see end up in the tree
        /* @var PageTreeView $pageTree */
        $pageTree = GeneralUtility::makeInstance(PageTreeView::class);
        $permsClause = $this->getBackendUser()->getPagePermsClause(Permission::PAGE_SHOW);
        $pageTree->init('AND ' . $permsClause);

        // The root page itself becomes the first tree entry when it is accessible
        $rootPage = BackendUtility::readPageAccess($id, $permsClause);
        if (is_array($rootPage)) {
            $pageTree->tree[] = [
                'row' => $rootPage,
                'HTML' => $this->iconFactory->getIconForRecord('pages', $rootPage, Icon::SIZE_SMALL),
            ];
        }

        // Branch beneath the root page
        if ($depth) {
            $pageTree->getTree($id, $depth, '');
        }

        $output = '';

        foreach ($pageTree->tree as $treeItem) {
            $pageRow = $treeItem['row'];
            $this->MP = false;

            if ($pageRow['doktype'] === PageRepository::DOKTYPE_MOUNTPOINT) {
                $mountPage = $this->pageRepository->getPage($pageRow['uid']);
                $mountPid = $mountPage[0]['mount_pid'];

                // The MP value is active while the rows of the mounted pages are built
                $this->MP = $mountPid . '-' . $pageRow['uid'];

                $mountTree = GeneralUtility::makeInstance(PageTreeView::class);
                $mountTree->init('AND ' . $permsClause);
                $mountTree->getTree($mountPid, $depth);

                foreach ($mountTree->tree as $mountItem) {
                    $output .= $this->drawURLs_addRowsForPage(
                        $mountItem['row'],
                        $mountItem['HTML'] . BackendUtility::getRecordTitle('pages', $mountItem['row'], true)
                    );
                }

                if ($mountPage[0]['mount_pid_ol']) {
                    // With mount_pid_ol enabled the mounted page replaces the mount point page
                    $pageRow['uid'] = $mountPid;
                } else {
                    // Without mount_pid_ol the MP must not be used for the mount point page itself
                    $this->MP = false;
                }
            }

            $output .= $this->drawURLs_addRowsForPage(
                $pageRow,
                $treeItem['HTML'] . BackendUtility::getRecordTitle('pages', $pageRow, true)
            );
        }

        return $output;
    }
1408
1409
    /**
     * Expands an exclude string into a list of page ids.
     *
     * The string is a comma separated list of "pid" or "pid+depth" parts. A plain "pid" excludes only
     * that page. A "+" followed by a positive number also excludes the pages returned by the page tree
     * for that depth; a "+" without a usable number (empty or "0") is treated as depth 99.
     * Results are memoised per exclude string for the lifetime of the PHP process.
     *
     * @param string $excludeString Exclude string
     * @return array List of integer page ids (as produced by array_unique(), keys are not re-indexed)
     */
    public function expandExcludeString($excludeString)
    {
        // internal static caches; initialised as arrays so isset() can be used for the lookups
        static $expandedExcludeStringCache = [];
        static $treeCache = [];

        $cacheKey = (string) $excludeString;

        // isset() instead of empty(): an empty result is a valid cache entry as well
        if (isset($expandedExcludeStringCache[$cacheKey])) {
            return $expandedExcludeStringCache[$cacheKey];
        }

        $pidList = [];

        if (! empty($excludeString)) {
            /** @var PageTreeView $tree */
            $tree = GeneralUtility::makeInstance(PageTreeView::class);
            $tree->init('AND ' . $this->getBackendUser()->getPagePermsClause(Permission::PAGE_SHOW));

            $excludeParts = GeneralUtility::trimExplode(',', $excludeString);

            foreach ($excludeParts as $excludePart) {
                // Pad to two elements so a part without "+" does not read an undefined offset
                [$pidPart, $depthPart] = array_pad(GeneralUtility::trimExplode('+', $excludePart), 2, '');

                // The tree API and the returned list work with integer uids
                $pid = (int) $pidPart;

                if (empty($depthPart)) {
                    // default is "page only" = "depth=0"; a "+" without a usable depth selects depth 99
                    $depth = strpos($excludePart, '+') !== false ? 99 : 0;
                } else {
                    $depth = (int) $depthPart;
                }

                $pidList[] = $pid;

                if ($depth > 0) {
                    if (! isset($treeCache[$pid][$depth])) {
                        $tree->reset();
                        $tree->getTree($pid, $depth);
                        $treeCache[$pid][$depth] = $tree->tree;
                    }

                    foreach ($treeCache[$pid][$depth] as $data) {
                        $pidList[] = (int) $data['row']['uid'];
                    }
                }
            }
        }

        $expandedExcludeStringCache[$cacheKey] = array_unique($pidList);

        return $expandedExcludeStringCache[$cacheKey];
    }
1460
1461
    /**
     * Create the rows for display of the page tree
     * For each page a number of rows are shown displaying GET variable configuration
     *
     * @param array $pageRow Page record the rows are built for
     * @param string $pageTitle Ready-made HTML for the title cell; it is inserted as-is and not escaped here
     * @return string HTML table rows for this page
     */
    public function drawURLs_addRowsForPage(array $pageRow, string $pageTitle): string
    {
        $skipMessage = '';

        // Get list of configurations
        // NOTE(review): $skipMessage is presumably filled by reference with the reason for skipping — confirm
        $configurations = $this->getUrlsForPageRow($pageRow, $skipMessage);

        if (! empty($this->incomingConfigurationSelection)) {
            // remove configuration that does not match the current selection
            foreach ($configurations as $confKey => $confArray) {
                if (! in_array($confKey, $this->incomingConfigurationSelection, true)) {
                    unset($configurations[$confKey]);
                }
            }
        }

        // Traverse parameter combinations:
        $c = 0;
        $content = '';
        if (! empty($configurations)) {
            foreach ($configurations as $confKey => $confArray) {
                $subCfg = $confArray['subCfg'] ?? [];

                // Numeric configuration keys arrive as integers; htmlspecialchars() needs a string under strict_types
                $confKeyHtml = htmlspecialchars((string) $confKey);

                // Title column: only the first row of a page carries the (row-spanning) title cell
                if (! $c) {
                    $titleClm = '<td rowspan="' . count($configurations) . '">' . $pageTitle . '</td>';
                } else {
                    $titleClm = '';
                }

                // expandExcludeString() returns integer ids, so the uid must be an integer for the strict comparison
                $excludedPageIds = $this->expandExcludeString((string) ($subCfg['exclude'] ?? ''));

                if (! in_array((int) $pageRow['uid'], $excludedPageIds, true)) {

                    // URL list:
                    $urlList = $this->urlListFromUrlArray(
                        $confArray,
                        $pageRow,
                        $this->scheduledTime,
                        $this->reqMinute,
                        $this->submitCrawlUrls,
                        $this->downloadCrawlUrls,
                        $this->duplicateTrack,
                        $this->downloadUrls,
                        // if empty the urls won't be filtered by processing instructions
                        $this->incomingProcInstructions
                    );

                    // Expanded parameters:
                    $paramExpanded = '';
                    $calcAccu = [];
                    $calcRes = 1;
                    foreach ($confArray['paramExpanded'] ?? [] as $gVar => $gVal) {
                        $paramExpanded .= '
                            <tr>
                                <td>' . htmlspecialchars('&' . $gVar . '=') . '<br/>' .
                            '(' . count($gVal) . ')' .
                            '</td>
                                <td nowrap="nowrap">' . nl2br(htmlspecialchars(implode(chr(10), $gVal))) . '</td>
                            </tr>
                        ';
                        $calcRes *= count($gVal);
                        $calcAccu[] = count($gVal);
                    }
                    $paramExpanded = '<table>' . $paramExpanded . '</table>';
                    $paramExpanded .= 'Comb: ' . implode('*', $calcAccu) . '=' . $calcRes;

                    // Options: configuration values are plain text and therefore escaped for the HTML output
                    $optionValues = '';
                    if (! empty($subCfg['userGroups'])) {
                        $optionValues .= 'User Groups: ' . htmlspecialchars((string) $subCfg['userGroups']) . '<br/>';
                    }
                    if (! empty($subCfg['procInstrFilter'])) {
                        $optionValues .= 'ProcInstr: ' . htmlspecialchars((string) $subCfg['procInstrFilter']) . '<br/>';
                    }

                    // Compile row:
                    $content .= '
                        <tr>
                            ' . $titleClm . '
                            <td>' . $confKeyHtml . '</td>
                            <td>' . nl2br(htmlspecialchars(rawurldecode(trim(str_replace('&', chr(10) . '&', GeneralUtility::implodeArrayForUrl('', $confArray['paramParsed'])))))) . '</td>
                            <td>' . $paramExpanded . '</td>
                            <td nowrap="nowrap">' . $urlList . '</td>
                            <td nowrap="nowrap">' . $optionValues . '</td>
                            <td nowrap="nowrap">' . DebugUtility::viewArray($subCfg['procInstrParams.'] ?? null) . '</td>
                        </tr>';
                } else {
                    $content .= '<tr>
                            ' . $titleClm . '
                            <td>' . $confKeyHtml . '</td>
                            <td colspan="5"><em>No entries</em> (Page is excluded in this configuration)</td>
                        </tr>';
                }

                $c++;
            }
        } else {
            $message = ! empty($skipMessage) ? ' (' . $skipMessage . ')' : '';

            // Compile row:
            $content .= '
                <tr>
                    <td>' . $pageTitle . '</td>
                    <td colspan="6"><em>No entries</em>' . $message . '</td>
                </tr>';
        }

        return $content;
    }
1572
1573
    /*****************************
1574
     *
1575
     * CLI functions
1576
     *
1577
     *****************************/
1578
1579
    /**
     * Running the functionality of the CLI (crawling URLs from queue)
     *
     * @param int $countInARun Maximum number of queue records requested for this run
     * @param int $sleepTime Pause after each processed entry, passed to usleep()
     * @param int $sleepAfterFinish Pause after the batch, passed to sleep()
     * @return int Bitwise combination of the self::CLI_STATUS_* flags collected during the run
     */
    public function CLI_run(int $countInARun, int $sleepTime, int $sleepAfterFinish): int
    {
        $status = 0;
        $processedEntries = 0;

        // Hooks first, then queue clean-up, then the selection of the entries for this run
        $this->CLI_runHooks();
        $this->queueRepository->cleanupQueue();
        $queueRows = $this->queueRepository->fetchRecordsToBeCrawled($countInARun);

        if (empty($queueRows)) {
            $this->CLI_debug('Nothing within queue which needs to be processed (' . $this->CLI_buildProcessId() . ')');

            return $status;
        }

        $queueIds = [];
        foreach ($queueRows as $queueRow) {
            $queueIds[] = $queueRow['qid'];
        }

        $processId = $this->CLI_buildProcessId();

        // The number of assigned queue entries is stored to determine later how many have been processed
        $assignedCount = $this->queueRepository->updateProcessIdAndSchedulerForQueueIds($queueIds, $processId);
        $this->processRepository->updateProcessAssignItemsCount($assignedCount, $processId);

        // Fewer assigned rows than requested means another process got hold of some of them
        if ($assignedCount !== count($queueIds)) {
            $this->CLI_debug('Nothing processed due to multi-process collision (' . $this->CLI_buildProcessId() . ')');

            return $status | self::CLI_STATUS_ABORTED;
        }

        foreach ($queueRows as $queueRow) {
            $status |= $this->readUrl($queueRow['qid']);
            $processedEntries++;

            // Just to relax the system
            usleep($sleepTime);

            // If the crawler got disabled in the meantime, leave right away;
            // the process is NOT marked as ended in that case.
            if ($this->crawler->isDisabled()) {
                return $status | self::CLI_STATUS_ABORTED;
            }

            if (! $this->processRepository->isProcessActive($this->CLI_buildProcessId())) {
                // possible timeout
                $this->CLI_debug('conflict / timeout (' . $this->CLI_buildProcessId() . ')');
                $status |= self::CLI_STATUS_ABORTED;
                break;
            }
        }

        sleep($sleepAfterFinish);

        $this->CLI_debug('Rows: ' . $processedEntries . ' (' . $this->CLI_buildProcessId() . ')');

        if ($processedEntries > 0) {
            $status |= self::CLI_STATUS_PROCESSED;
        }

        return $status;
    }
1649
1650
    /**
     * Instantiates every class registered under the crawler "cli_hooks" configuration
     * and calls its crawler_init() method with this controller.
     */
    public function CLI_runHooks(): void
    {
        $hookClasses = $GLOBALS['TYPO3_CONF_VARS']['EXTCONF']['crawler']['cli_hooks'] ?? [];

        foreach ($hookClasses as $hookClass) {
            $hook = GeneralUtility::makeInstance($hookClass);
            if (! is_object($hook)) {
                continue;
            }

            $hook->crawler_init($this);
        }
    }
1662
1663
    /**
     * Try to acquire a new process with the given id
     * also performs some auto-cleanup for orphan processes
     *
     * Active processes whose ttl lies in the past are collected as orphans and released at the end;
     * the remaining ones count against the configured "processLimit".
     *
     * @param string $id identification string for the process
     * @return boolean true when a process record was inserted, false when the limit is reached
     *                 or the system process id cannot be determined
     * @todo preemption might not be the most elegant way to clean up
     */
    public function CLI_checkAndAcquireNewProcess($id)
    {
        $systemProcessId = getmypid();
        if (! $systemProcessId) {
            return false;
        }

        $activeProcesses = $this->processRepository->findAllActive();
        $now = $this->getCurrentTime();

        $runningCount = 0;
        $expiredProcessIds = [];

        /** @var Process $process */
        foreach ($activeProcesses as $process) {
            if ($process->getTtl() < $now) {
                $expiredProcessIds[] = $process->getProcessId();
                continue;
            }

            $runningCount++;
        }

        $processLimit = (int) $this->extensionSettings['processLimit'];

        // A new process is only registered while fewer than the allowed number are running
        $acquired = $runningCount < $processLimit;

        if ($acquired) {
            $this->CLI_debug('add process ' . $this->CLI_buildProcessId() . ' (' . ($runningCount + 1) . '/' . $processLimit . ')');

            GeneralUtility::makeInstance(ConnectionPool::class)
                ->getConnectionForTable('tx_crawler_process')
                ->insert(
                    'tx_crawler_process',
                    [
                        'process_id' => $id,
                        'active' => 1,
                        'ttl' => $now + (int) $this->extensionSettings['processMaxRunTime'],
                        'system_process_id' => $systemProcessId,
                    ]
                );
        } else {
            $this->CLI_debug('Processlimit reached (' . $runningCount . '/' . $processLimit . ')');
        }

        $this->processRepository->deleteProcessesMarkedAsDeleted();
        $this->CLI_releaseProcesses($expiredProcessIds);

        return $acquired;
    }
1717
1718
    /**
     * Release a process and the required resources
     *
     * @param mixed $releaseIds string with a single process-id or array with multiple process-ids
     * @return boolean false when there is nothing to release, true otherwise
     */
    public function CLI_releaseProcesses($releaseIds)
    {
        $processIds = is_array($releaseIds) ? $releaseIds : [$releaseIds];

        if ($processIds === []) {
            // nothing to release
            return false;
        }

        $queryBuilder = GeneralUtility::makeInstance(ConnectionPool::class)->getQueryBuilderForTable($this->tableName);

        // some kind of 2nd chance algo - this way you need at least 2 processes to have a real cleanup
        // this ensures that a single process can't mess up the entire process table

        // Step 1: detach queue entries from processes that are no longer active
        $queryBuilder
            ->update($this->tableName, 'q')
            ->where(
                'q.process_id IN(SELECT p.process_id FROM tx_crawler_process as p WHERE p.active = 0)'
            )
            ->set('q.process_scheduled', 0)
            ->set('q.process_id', '')
            ->execute();

        // FIXME: Not entirely sure that this is equivalent to the previous version
        // The builder is reused for the next statement, so the "set" part of step 1 is dropped first
        $queryBuilder->resetQueryPart('set');

        // Step 2: clear the system process id of inactive processes that are still referenced
        // by queue entries with exec_time = 0
        $queryBuilder
            ->update('tx_crawler_process')
            ->where(
                $queryBuilder->expr()->eq('active', 0),
                'process_id IN(SELECT q.process_id FROM tx_crawler_queue as q WHERE q.exec_time = 0)'
            )
            ->set('system_process_id', 0)
            ->execute();

        // Step 3: hand the requested ids to the repositories
        $this->processRepository->markRequestedProcessesAsNotActive($processIds);
        $this->queueRepository->unsetProcessScheduledAndProcessIdForQueueEntries($processIds);

        return true;
    }
1768
1769
    /**
     * Returns the id of the current process; it is generated from the current microtime
     * on first use and reused for every later call.
     *
     * @return string the ID
     */
    public function CLI_buildProcessId()
    {
        if ($this->processID) {
            return $this->processID;
        }

        $this->processID = GeneralUtility::shortMD5(microtime(true));

        return $this->processID;
    }
1781
1782
    /**
     * Echoes a message followed by a newline, but only when the "processDebug" extension setting is enabled
     *
     * @param string $msg the message
     * @deprecated
     * @codeCoverageIgnore
     */
    public function CLI_debug($msg): void
    {
        $debugEnabled = (bool) (int) $this->extensionSettings['processDebug'];
        if (! $debugEnabled) {
            return;
        }

        echo $msg . "\n";
        flush();
    }
1796
1797
    /**
     * Cleans up entries that stayed for too long in the queue. These are:
     * - processed entries older than the "cleanUpProcessedAge" setting (in days)
     * - scheduled entries older than the "cleanUpScheduledAge" setting (in days)
     *
     * NOTE(review): the previous description named 1.5 and 7 days, presumably the default settings — confirm.
     *
     * @deprecated
     */
    public function cleanUpOldQueueEntries(): void
    {
        // 24*60*60 Seconds in 24 hours
        $secondsPerDay = 86400;
        $maxProcessedAge = $this->extensionSettings['cleanUpProcessedAge'] * $secondsPerDay;
        $maxScheduledAge = $this->extensionSettings['cleanUpScheduledAge'] * $secondsPerDay;

        $now = time();
        $processedBefore = $now - $maxProcessedAge;
        $scheduledBefore = $now - $maxScheduledAge;

        $this->flushQueue(
            '(exec_time<>0 AND exec_time<' . $processedBefore . ') OR scheduled<=' . $scheduledBefore
        );
    }
1814
1815
    /**
     * Removes queue entries
     *
     * Before the rows are deleted, one SIGNAL_QUEUE_ENTRY_FLUSH signal is emitted per distinct set_id,
     * carrying the qid/set_id pairs of the affected entries.
     *
     * @param string $where SQL related filter for the entries which should be removed; it is used as raw SQL.
     *                      An empty filter removes everything.
     *
     * @deprecated
     */
    protected function flushQueue($where = ''): void
    {
        $condition = (string) $where !== '' ? $where : '1=1';

        // One builder instance is reused for all statements of this method
        $queryBuilder = $this->getQueryBuilder($this->tableName);

        $setIdRows = $queryBuilder
            ->selectLiteral('DISTINCT set_id')
            ->from($this->tableName)
            ->where($condition)
            ->execute()
            ->fetchAll();

        if (is_array($setIdRows)) {
            foreach ($setIdRows as $setIdRow) {
                $affectedEntries = $queryBuilder
                    ->select('qid', 'set_id')
                    ->from($this->tableName)
                    ->where(
                        $condition,
                        $queryBuilder->expr()->eq('set_id', $setIdRow['set_id'])
                    )
                    ->execute()
                    ->fetchAll();

                SignalSlotUtility::emitSignal(
                    self::class,
                    SignalSlotUtility::SIGNAL_QUEUE_ENTRY_FLUSH,
                    ['subSet' => $affectedEntries]
                );
            }
        }

        $queryBuilder
            ->delete($this->tableName)
            ->where($condition)
            ->execute();
    }
1860
1861
    /**
     * Looks up queue entries that duplicate the given one (same page_id and parameters_hash)
     * and are neither executed nor assigned to a process.
     *
     * If the timestamp is not in the future, every entry scheduled up to now counts
     * (with "enableTimeslot" additionally those within +/- 100 seconds of now).
     * If the timestamp is in the future, the entry must have exactly the same schedule time.
     *
     * @param int $tstamp
     * @param array $fieldArray Must provide "page_id" and "parameters_hash"
     *
     * @return array List of qid values of the duplicates
     * @deprecated
     */
    protected function getDuplicateRowsIfExist($tstamp, $fieldArray)
    {
        $duplicateIds = [];

        $now = $this->getCurrentTime();

        $queryBuilder = GeneralUtility::makeInstance(ConnectionPool::class)->getQueryBuilderForTable($this->tableName);
        $queryBuilder
            ->select('qid')
            ->from('tx_crawler_queue');

        if ($tstamp <= $now) {
            // this entry is scheduled with "now"
            if ($this->extensionSettings['enableTimeslot']) {
                $slotStart = $now - 100;
                $slotEnd = $now + 100;
                $queryBuilder
                    ->where('scheduled BETWEEN ' . $slotStart . ' AND ' . $slotEnd)
                    ->orWhere(
                        $queryBuilder->expr()->lte('scheduled', $now)
                    );
            } else {
                $queryBuilder->where(
                    $queryBuilder->expr()->lte('scheduled', $now)
                );
            }
        } elseif ($tstamp > $now) {
            // entries with a timestamp in the future need to have the same schedule time
            $queryBuilder->where(
                $queryBuilder->expr()->eq('scheduled', $tstamp)
            );
        }

        $pageIdParameter = $queryBuilder->createNamedParameter($fieldArray['page_id'], PDO::PARAM_INT);
        $hashParameter = $queryBuilder->createNamedParameter($fieldArray['parameters_hash'], PDO::PARAM_STR);

        $queryBuilder
            ->andWhere('NOT exec_time')
            ->andWhere('NOT process_id')
            ->andWhere($queryBuilder->expr()->eq('page_id', $pageIdParameter))
            ->andWhere($queryBuilder->expr()->eq('parameters_hash', $hashParameter));

        $statement = $queryBuilder->execute();

        while ($row = $statement->fetch()) {
            $duplicateIds[] = $row['qid'];
        }

        return $duplicateIds;
    }
1922
1923
    /**
     * Returns a md5 hash generated from a serialized configuration array.
     * The "paramExpanded" and "URLs" entries are left out of the hash.
     *
     * @return string
     */
    protected function getConfigurationHash(array $configuration)
    {
        $ignoredKeys = ['paramExpanded' => true, 'URLs' => true];
        $hashRelevant = array_diff_key($configuration, $ignoredKeys);

        return md5(serialize($hashRelevant));
    }
1934
1935
    /**
     * Build a URL from a Page and the Query String. This is a thin wrapper that forwards all arguments
     * to UrlService::getUrlFromPageAndQueryParameters().
     *
     * @param int $httpsOrHttp see tx_crawler_configuration.force_ssl
     * @throws SiteNotFoundException
     * @throws InvalidRouteArgumentsException
     *
     * @deprecated Using CrawlerController::getUrlFromPageAndQueryParameters() is deprecated since 9.1.1 and will be removed in v11.x, please use UrlService->getUrlFromPageAndQueryParameters() instead.
     * @codeCoverageIgnore
     */
    protected function getUrlFromPageAndQueryParameters(int $pageId, string $queryString, ?string $alternativeBaseUrl, int $httpsOrHttp): UriInterface
    {
        return (new UrlService())->getUrlFromPageAndQueryParameters(
            $pageId,
            $queryString,
            $alternativeBaseUrl,
            $httpsOrHttp
        );
    }
1951
1952 1
    /**
     * Returns $reg with the values at index 1 and 2 exchanged when the value at index 1 is the larger one.
     */
    protected function swapIfFirstIsLargerThanSecond(array $reg): array
    {
        if ($reg[1] <= $reg[2]) {
            return $reg;
        }

        // The right-hand side is evaluated first, so this exchanges both values in place
        [$reg[1], $reg[2]] = [$reg[2], $reg[1]];

        return $reg;
    }
1963
1964
    /**
     * Returns the backend user, taking it from $GLOBALS['BE_USER'] on first use.
     *
     * @return BackendUserAuthentication
     */
    private function getBackendUser()
    {
        // Make sure the _cli_ user is loaded
        Bootstrap::initializeBackendAuthentication();

        if ($this->backendUser !== null) {
            return $this->backendUser;
        }

        $this->backendUser = $GLOBALS['BE_USER'];

        return $this->backendUser;
    }
1976
1977
    /**
     * Asks the connection pool for a query builder for the given table
     *
     * @return QueryBuilder
     */
    private function getQueryBuilder(string $table)
    {
        $connectionPool = GeneralUtility::makeInstance(ConnectionPool::class);

        return $connectionPool->getQueryBuilderForTable($table);
    }
1986
}
1987