Passed
Pull Request — master (#677)
by Tomas Norre
09:30 queued 06:06
created

CrawlerController::getUrlsForPageRow()   A

Complexity

Conditions 3
Paths 3

Size

Total Lines 17
Code Lines 11

Duplication

Lines 0
Ratio 0 %

Code Coverage

Tests 8
CRAP Score 3

Importance

Changes 0
Metric Value
cc 3
eloc 11
c 0
b 0
f 0
nc 3
nop 2
dl 0
loc 17
ccs 8
cts 8
cp 1
crap 3
rs 9.9
1
<?php
2
3
declare(strict_types=1);
4
5
namespace AOE\Crawler\Controller;
6
7
/***************************************************************
8
 *  Copyright notice
9
 *
10
 *  (c) 2020 AOE GmbH <[email protected]>
11
 *
12
 *  All rights reserved
13
 *
14
 *  This script is part of the TYPO3 project. The TYPO3 project is
15
 *  free software; you can redistribute it and/or modify
16
 *  it under the terms of the GNU General Public License as published by
17
 *  the Free Software Foundation; either version 3 of the License, or
18
 *  (at your option) any later version.
19
 *
20
 *  The GNU General Public License can be found at
21
 *  http://www.gnu.org/copyleft/gpl.html.
22
 *
23
 *  This script is distributed in the hope that it will be useful,
24
 *  but WITHOUT ANY WARRANTY; without even the implied warranty of
25
 *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
26
 *  GNU General Public License for more details.
27
 *
28
 *  This copyright notice MUST APPEAR in all copies of the script!
29
 ***************************************************************/
30
31
use AOE\Crawler\Configuration\ExtensionConfigurationProvider;
32
use AOE\Crawler\Converter\JsonCompatibilityConverter;
33
use AOE\Crawler\Crawler;
34
use AOE\Crawler\CrawlStrategy\CrawlStrategyFactory;
35
use AOE\Crawler\Domain\Model\Process;
36
use AOE\Crawler\Domain\Repository\ConfigurationRepository;
37
use AOE\Crawler\Domain\Repository\ProcessRepository;
38
use AOE\Crawler\Domain\Repository\QueueRepository;
39
use AOE\Crawler\QueueExecutor;
40
use AOE\Crawler\Service\UrlService;
41
use AOE\Crawler\Utility\SignalSlotUtility;
42
use AOE\Crawler\Value\QueueFilter;
43
use PDO;
44
use Psr\Http\Message\UriInterface;
45
use Psr\Log\LoggerAwareInterface;
46
use Psr\Log\LoggerAwareTrait;
47
use TYPO3\CMS\Backend\Tree\View\PageTreeView;
48
use TYPO3\CMS\Backend\Utility\BackendUtility;
49
use TYPO3\CMS\Core\Authentication\BackendUserAuthentication;
50
use TYPO3\CMS\Core\Compatibility\PublicMethodDeprecationTrait;
51
use TYPO3\CMS\Core\Compatibility\PublicPropertyDeprecationTrait;
52
use TYPO3\CMS\Core\Core\Bootstrap;
53
use TYPO3\CMS\Core\Core\Environment;
54
use TYPO3\CMS\Core\Database\Connection;
55
use TYPO3\CMS\Core\Database\ConnectionPool;
56
use TYPO3\CMS\Core\Database\Query\QueryBuilder;
57
use TYPO3\CMS\Core\Database\Query\Restriction\DeletedRestriction;
58
use TYPO3\CMS\Core\Database\QueryGenerator;
59
use TYPO3\CMS\Core\Exception\SiteNotFoundException;
60
use TYPO3\CMS\Core\Imaging\Icon;
61
use TYPO3\CMS\Core\Imaging\IconFactory;
62
use TYPO3\CMS\Core\Routing\InvalidRouteArgumentsException;
63
use TYPO3\CMS\Core\Site\Entity\Site;
64
use TYPO3\CMS\Core\Type\Bitmask\Permission;
65
use TYPO3\CMS\Core\TypoScript\Parser\TypoScriptParser;
66
use TYPO3\CMS\Core\Utility\DebugUtility;
67
use TYPO3\CMS\Core\Utility\GeneralUtility;
68
use TYPO3\CMS\Core\Utility\MathUtility;
69
use TYPO3\CMS\Extbase\Object\ObjectManager;
70
use TYPO3\CMS\Frontend\Page\PageRepository;
71
72
/**
73
 * Class CrawlerController
74
 *
75
 * @package AOE\Crawler\Controller
76
 */
77
class CrawlerController implements LoggerAwareInterface
78
{
79
    use LoggerAwareTrait;
80
    use PublicMethodDeprecationTrait;
81
    use PublicPropertyDeprecationTrait;
82
83
    public const CLI_STATUS_NOTHING_PROCCESSED = 0;
84
85
    //queue not empty
86
    public const CLI_STATUS_REMAIN = 1;
87
88
    //(some) queue items where processed
89
    public const CLI_STATUS_PROCESSED = 2;
90
91
    //instance didn't finish
92
    public const CLI_STATUS_ABORTED = 4;
93
94
    public const CLI_STATUS_POLLABLE_PROCESSED = 8;
95
96
    /**
97
     * @var integer
98
     */
99
    public $setID = 0;
100
101
    /**
102
     * @var string
103
     */
104
    public $processID = '';
105
106
    /**
107
     * @var array
108
     */
109
    public $duplicateTrack = [];
110
111
    /**
112
     * @var array
113
     */
114
    public $downloadUrls = [];
115
116
    /**
117
     * @var array
118
     */
119
    public $incomingProcInstructions = [];
120
121
    /**
122
     * @var array
123
     */
124
    public $incomingConfigurationSelection = [];
125
126
    /**
127
     * @var bool
128
     */
129
    public $registerQueueEntriesInternallyOnly = false;
130
131
    /**
132
     * @var array
133
     */
134
    public $queueEntries = [];
135
136
    /**
137
     * @var array
138
     */
139
    public $urlList = [];
140
141
    /**
142
     * @var array
143
     */
144
    public $extensionSettings = [];
145
146
    /**
147
     * Mount Point
148
     *
149
     * @var bool
150
     * Todo: Check what this is used for and adjust the type hint or code, as bool doesn't match the current code.
151
     */
152
    public $MP = false;
153
154
    /**
155
     * @var string
156
     * @deprecated
157
     */
158
    protected $processFilename;
159
160
    /**
161
     * Holds the internal access mode can be 'gui','cli' or 'cli_im'
162
     *
163
     * @var string
164
     * @deprecated
165
     */
166
    protected $accessMode;
167
168
    /**
169
     * @var QueueRepository
170
     */
171
    protected $queueRepository;
172
173
    /**
174
     * @var ProcessRepository
175
     */
176
    protected $processRepository;
177
178
    /**
179
     * @var ConfigurationRepository
180
     */
181
    protected $configurationRepository;
182
183
    /**
184
     * @var string
185
     */
186
    protected $tableName = 'tx_crawler_queue';
187
188
    /**
189
     * @var QueueExecutor
190
     */
191
    protected $queueExecutor;
192
193
    /**
194
     * @var int
195
     */
196
    protected $maximumUrlsToCompile = 10000;
197
198
    /**
199
     * @var IconFactory
200
     */
201
    protected $iconFactory;
202
203
    /**
204
     * @var string[]
205
     */
206
    private $deprecatedPublicMethods = [
0 ignored issues
show
introduced by
The private property $deprecatedPublicMethods is not used, and could be removed.
Loading history...
207
        'cleanUpOldQueueEntries' => 'Using CrawlerController::cleanUpOldQueueEntries() is deprecated since 9.0.1 and will be removed in v11.x, please use QueueRepository->cleanUpOldQueueEntries() instead.',
208
        'CLI_debug' => 'Using CrawlerController->CLI_debug() is deprecated since 9.1.3 and will be removed in v11.x',
209
        'getAccessMode' => 'Using CrawlerController->getAccessMode() is deprecated since 9.1.3 and will be removed in v11.x',
210
        'getLogEntriesForPageId' => 'Using CrawlerController->getLogEntriesForPageId() is deprecated since 9.1.5 and will be remove in v11.x',
211
        'getLogEntriesForSetId' => 'Using crawlerController::getLogEntriesForSetId() is deprecated since 9.0.1 and will be removed in v11.x',
212
        'flushQueue' => 'Using CrawlerController::flushQueue() is deprecated since 9.0.1 and will be removed in v11.x, please use QueueRepository->flushQueue() instead.',
213
        'setAccessMode' => 'Using CrawlerController->setAccessMode() is deprecated since 9.1.3 and will be removed in v11.x',
214
        'getDisabled' => 'Using CrawlerController->getDisabled() is deprecated since 9.1.3 and will be removed in v11.x, please use Crawler->isDisabled() instead',
215
        'setDisabled' => 'Using CrawlerController->setDisabled() is deprecated since 9.1.3 and will be removed in v11.x, please use Crawler->setDisabled() instead',
216
        'getProcessFilename' => 'Using CrawlerController->getProcessFilename() is deprecated since 9.1.3 and will be removed in v11.x',
217
        'setProcessFilename' => 'Using CrawlerController->setProcessFilename() is deprecated since 9.1.3 and will be removed in v11.x',
218
        'getDuplicateRowsIfExist' => 'Using CrawlerController->getDuplicateRowsIfExist() is deprecated since 9.1.4 and will be remove in v11.x, please use QueueRepository->getDuplicateQueueItemsIfExists() instead',
219
    ];
220
221
    /**
     * Deprecation messages for properties that used to be public, keyed by
     * property name.
     *
     * NOTE(review): presumably read by PublicPropertyDeprecationTrait, which
     * this class uses; static analysis reports the property as unused.
     *
     * @var string[]
     */
    private $deprecatedPublicProperties = [
        'accessMode' => 'Using CrawlerController->accessMode is deprecated since 9.1.3 and will be removed in v11.x',
        // The message previously named ->accessMode here (copy-paste error).
        'processFilename' => 'Using CrawlerController->processFilename is deprecated since 9.1.3 and will be removed in v11.x',
    ];
228
229
    /**
230
     * @var BackendUserAuthentication|null
231
     */
232
    private $backendUser;
233
234
    /**
235
     * @var integer
236
     */
237
    private $scheduledTime = 0;
238
239
    /**
240
     * @var integer
241
     */
242
    private $reqMinute = 0;
243
244
    /**
245
     * @var bool
246
     */
247
    private $submitCrawlUrls = false;
248
249
    /**
250
     * @var bool
251
     */
252
    private $downloadCrawlUrls = false;
253
254
    /**
255
     * @var PageRepository
256
     */
257
    private $pageRepository;
258
259
    /**
260
     * @var Crawler
261
     */
262
    private $crawler;
263
264
    /************************************
265
     *
266
     * Getting URLs based on Page TSconfig
267
     *
268
     ************************************/
269
270 36
    /**
     * Wires up repositories and services and loads the extension settings.
     *
     * Settings missing from the extension configuration fall back to their
     * defaults (countInARun = 100, processLimit = 1, maxCompileUrls = 10000)
     * without raising undefined-index notices.
     */
    public function __construct()
    {
        $objectManager = GeneralUtility::makeInstance(ObjectManager::class);
        $crawlStrategyFactory = GeneralUtility::makeInstance(CrawlStrategyFactory::class);
        $this->queueRepository = $objectManager->get(QueueRepository::class);
        $this->processRepository = $objectManager->get(ProcessRepository::class);
        $this->configurationRepository = $objectManager->get(ConfigurationRepository::class);
        $this->pageRepository = $objectManager->get(PageRepository::class);
        $this->queueExecutor = GeneralUtility::makeInstance(QueueExecutor::class, $crawlStrategyFactory);
        $this->iconFactory = GeneralUtility::makeInstance(IconFactory::class);
        $this->crawler = GeneralUtility::makeInstance(Crawler::class);

        $this->processFilename = Environment::getVarPath() . '/lock/tx_crawler.proc';

        /** @var ExtensionConfigurationProvider $configurationProvider */
        $configurationProvider = GeneralUtility::makeInstance(ExtensionConfigurationProvider::class);
        $settings = $configurationProvider->getExtensionConfiguration();
        $this->extensionSettings = is_array($settings) ? $settings : [];

        // Set defaults. Missing keys are read as 0 so that the default handling
        // below kicks in exactly as it does for an explicit zero/empty value.
        if (MathUtility::convertToPositiveInteger($this->extensionSettings['countInARun'] ?? 0) === 0) {
            $this->extensionSettings['countInARun'] = 100;
        }

        $this->extensionSettings['processLimit'] = MathUtility::forceIntegerInRange(
            $this->extensionSettings['processLimit'] ?? 0,
            1,
            99,
            1
        );
        $this->maximumUrlsToCompile = MathUtility::forceIntegerInRange(
            $this->extensionSettings['maxCompileUrls'] ?? 0,
            1,
            1000000000,
            10000
        );
    }
297
298
    /**
     * Returns the internal access mode.
     *
     * According to the property documentation the value is one of 'gui',
     * 'cli' or 'cli_im'; nothing is returned until setAccessMode() was called.
     *
     * @return string
     * @deprecated
     */
    public function getAccessMode()
    {
        return $this->accessMode;
    }
308
309
    /**
     * Stores the internal access mode as given; the value is not validated.
     *
     * @param string $accessMode Access mode to remember
     * @deprecated
     */
    public function setAccessMode($accessMode): void
    {
        $this->accessMode = $accessMode;
    }
317
318
    /**
     * Toggles the "disabled" marker file that prevents processes from being processed.
     *
     * Disabling writes an empty file to the process filename; enabling removes
     * that file if it exists.
     *
     * @param bool $disabled (optional, defaults to true)
     * @deprecated
     */
    public function setDisabled($disabled = true): void
    {
        if ($disabled) {
            GeneralUtility::writeFile($this->processFilename, '');
            return;
        }

        // Re-enable: drop the marker file, if there is one.
        if (is_file($this->processFilename)) {
            unlink($this->processFilename);
        }
    }
334
335
    /**
     * Reports the disabled status, i.e. whether the marker file at the
     * process filename exists.
     *
     * @return bool true if disabled
     * @deprecated
     */
    public function getDisabled()
    {
        return is_file($this->processFilename);
    }
345
346
    /**
     * Overrides the path of the file used as "disabled" marker.
     *
     * @param string $filenameWithPath File name including its path
     * @deprecated
     */
    public function setProcessFilename($filenameWithPath): void
    {
        $this->processFilename = $filenameWithPath;
    }
354
355
    /**
     * Returns the path of the file used as "disabled" marker.
     *
     * @return string
     * @deprecated
     */
    public function getProcessFilename()
    {
        return $this->processFilename;
    }
363
364
    /**
     * Replaces the extension settings (unserialized pendant of
     * $TYPO3_CONF_VARS['EXT']['extConf']['crawler']) with the given array.
     *
     * The array is stored as is; the defaults applied in the constructor are
     * not re-applied here.
     */
    public function setExtensionSettings(array $extensionSettings): void
    {
        $this->extensionSettings = $extensionSettings;
    }
371
372
    /**
     * Check if the given page should be crawled.
     *
     * A page is skipped when it is hidden (unless crawlHiddenPages is enabled),
     * when its doktype is one of 3, 4, 199, 254, 255, when its doktype is listed
     * in an "excludeDoktype" registration, or when a "pageVeto" hook objects.
     *
     * @param array $pageRow Page record; "hidden" and "doktype" are read if present
     * @return false|string false if the page should be crawled (not excluded), otherwise the skip message
     */
    public function checkIfPageShouldBeSkipped(array $pageRow)
    {
        // Hidden pages are only crawled when explicitly enabled. Both keys are
        // optional: a missing setting or column counts as "not set".
        if (empty($this->extensionSettings['crawlHiddenPages']) && ! empty($pageRow['hidden'])) {
            return 'Because page is hidden';
        }

        $doktype = (string) ($pageRow['doktype'] ?? '');

        if (GeneralUtility::inList('3,4,199,254,255', $doktype)) {
            return 'Because doktype is not allowed';
        }

        foreach ($GLOBALS['TYPO3_CONF_VARS']['EXTCONF']['crawler']['excludeDoktype'] ?? [] as $key => $doktypeList) {
            if (GeneralUtility::inList($doktypeList, $doktype)) {
                return 'Doktype was excluded by "' . $key . '"';
            }
        }

        // veto hook
        foreach ($GLOBALS['TYPO3_CONF_VARS']['EXTCONF']['crawler']['pageVeto'] ?? [] as $key => $func) {
            $params = [
                'pageRow' => $pageRow,
            ];
            // expects "false" if page is ok and "true" or a skipMessage if this page should _not_ be crawled
            $veto = GeneralUtility::callUserFunction($func, $params, $this);
            if ($veto !== false) {
                if (is_string($veto)) {
                    return $veto;
                }
                // Hooks may be registered under integer keys; with strict_types
                // htmlspecialchars() rejects anything but a string.
                return 'Veto from hook "' . htmlspecialchars((string) $key) . '"';
            }
        }

        return false;
    }
411
412
    /**
     * Wrapper method for getUrlsForPageId()
     * It returns an array of configurations and no urls!
     *
     * The page uid must be a real integer; otherwise nothing is resolved and
     * $skipMessage explains why. The same happens when
     * checkIfPageShouldBeSkipped() reports a reason to skip the page.
     *
     * @param array $pageRow Page record with at least dok-type and uid columns.
     * @param string $skipMessage Receives the reason for skipping, or '' when the page was processed
     * @return array
     * @see getUrlsForPageId()
     */
    public function getUrlsForPageRow(array $pageRow, &$skipMessage = '')
    {
        $pageUid = $pageRow['uid'];

        if (! is_int($pageUid)) {
            $skipMessage = 'PageUid ' . $pageUid . ' was not an integer';
            return [];
        }

        $vetoMessage = $this->checkIfPageShouldBeSkipped($pageRow);
        if ($vetoMessage !== false) {
            $skipMessage = $vetoMessage;
            return [];
        }

        // Only clear the message once the configurations were resolved.
        $configurations = $this->getUrlsForPageId($pageUid);
        $skipMessage = '';

        return $configurations;
    }
439
440
    /**
     * Creates a list of URLs from input array (and submits them to queue if asked for)
     * See Web > Info module script + "indexed_search"'s crawler hook-client using this!
     *
     * @param array $vv Information about URLs from pageRow to crawl.
     * @param array $pageRow Page row
     * @param int $scheduledTime Unix time to schedule indexing to, typically time()
     * @param int $reqMinute Number of requests per minute (creates the interleave between requests); values <= 0 disable the interleave
     * @param bool $submitCrawlUrls If set, submits the URLs to queue
     * @param bool $downloadCrawlUrls If set (and submitcrawlUrls is false) will fill $downloadUrls with entries)
     * @param array $duplicateTrack Array which is passed by reference and contains the an id per url to secure we will not crawl duplicates
     * @param array $downloadUrls Array which will be filled with URLS for download if flag is set.
     * @param array $incomingProcInstructions Array of processing instructions
     * @return string List of URLs (meant for display in backend module)
     */
    public function urlListFromUrlArray(
        array $vv,
        array $pageRow,
        $scheduledTime,
        $reqMinute,
        $submitCrawlUrls,
        $downloadCrawlUrls,
        array &$duplicateTrack,
        array &$downloadUrls,
        array $incomingProcInstructions
    ) {
        if (! is_array($vv['URLs'] ?? null)) {
            return 'ERROR - no URL generated';
        }
        $urlLog = [];
        $pageId = (int) $pageRow['uid'];
        $configurationHash = $this->getConfigurationHash($vv);
        $skipInnerCheck = $this->queueRepository->noUnprocessedQueueEntriesForPageWithConfigurationHashExist($pageId, $configurationHash);

        // Configurations do not necessarily define these keys; treat them as empty.
        $procInstrFilter = $vv['subCfg']['procInstrFilter'] ?? '';
        $userGroups = $vv['subCfg']['userGroups'] ?? '';

        // The processing-instruction filter does not depend on the individual
        // URL, so it is evaluated once instead of once per URL.
        if (! $this->drawURLs_PIfilter($procInstrFilter, $incomingProcInstructions)) {
            return '';
        }

        // Seconds between two requests; guards against a division by zero.
        $secondsPerRequest = $reqMinute > 0 ? 60 / $reqMinute : 0;

        $urlService = new UrlService();

        foreach ($vv['URLs'] as $urlQuery) {
            $url = (string) $urlService->getUrlFromPageAndQueryParameters(
                $pageId,
                $urlQuery,
                $vv['subCfg']['baseUrl'] ?? null,
                $vv['subCfg']['force_ssl'] ?? 0
            );

            // Create key by which to determine unique-ness:
            $uKey = $url . '|' . $userGroups . '|' . $procInstrFilter;

            if (isset($duplicateTrack[$uKey])) {
                // if the url key is registered just display it and do not resubmit it
                $urlLog[] = '<em><span class="text-muted">' . htmlspecialchars($url) . '</span></em>';
            } else {
                // Scheduled time, rounded down to the full minute:
                $schTime = $scheduledTime + round(count($duplicateTrack) * $secondsPerRequest);
                $schTime = intval($schTime / 60) * 60;
                $formattedDate = BackendUtility::datetime($schTime);
                $this->urlList[] = '[' . $formattedDate . '] ' . $url;
                $urlList = '[' . $formattedDate . '] ' . htmlspecialchars($url);

                // Submit for crawling!
                if ($submitCrawlUrls) {
                    $added = $this->addUrl(
                        $pageId,
                        $url,
                        $vv['subCfg'],
                        $scheduledTime,
                        $configurationHash,
                        $skipInnerCheck
                    );
                    if ($added === false) {
                        $urlList .= ' (URL already existed)';
                    }
                } elseif ($downloadCrawlUrls) {
                    $downloadUrls[$url] = $url;
                }
                $urlLog[] = $urlList;
            }
            $duplicateTrack[$uKey] = true;
        }

        return implode('<br>', $urlLog);
    }
524
525
    /**
     * Tells whether the given processing instruction list matches one of the
     * incoming processing instructions.
     *
     * Without any incoming processing instructions everything passes.
     *
     * @param string $piString PI to test
     * @param array $incomingProcInstructions Processing instructions
     * @return bool
     */
    public function drawURLs_PIfilter($piString, array $incomingProcInstructions)
    {
        // No filter requested: accept.
        if ($incomingProcInstructions === []) {
            return true;
        }

        foreach ($incomingProcInstructions as $instruction) {
            if (GeneralUtility::inList($piString, $instruction)) {
                return true;
            }
        }

        return false;
    }
545
546
    /**
     * Returns the page TSconfig for the given page id, or for the mount point
     * page when $this->MP is set, after passing it through the registered
     * "getPageTSconfigForId" hooks.
     *
     * @param int $id Page id
     * @return array Page TSconfig, possibly altered by hooks (they receive it by reference)
     */
    public function getPageTSconfigForId($id): array
    {
        if (! $this->MP) {
            $pageTSconfig = BackendUtility::getPagesTSconfig($id);
        } else {
            // $this->MP is documented as bool but used as a "<pid>-<mountPointPid>"
            // string. Cast explicitly (explode() rejects non-strings under
            // strict_types) and fall back to 0 when the second part is missing.
            // TODO: clarify the intended type of $this->MP.
            $mountPointParts = explode('-', (string) $this->MP);
            $mountPointId = (int) ($mountPointParts[1] ?? 0);
            $pageTSconfig = BackendUtility::getPagesTSconfig($mountPointId);
        }

        // Call a hook to alter configuration
        $hooks = $GLOBALS['TYPO3_CONF_VARS']['EXTCONF']['crawler']['getPageTSconfigForId'] ?? [];
        if (is_array($hooks)) {
            $params = [
                'pageId' => $id,
                'pageTSConfig' => &$pageTSconfig,
            ];
            foreach ($hooks as $userFunc) {
                GeneralUtility::callUserFunction($userFunc, $params, $this);
            }
        }
        return $pageTSconfig;
    }
568 4
569
    /**
     * This methods returns an array of configurations.
     * Adds no urls!
     *
     * Configurations are collected from the page TSconfig paramSets first and
     * then from the crawler configuration records found in the rootline; a
     * record never overwrites a paramSet of the same name. Finally the
     * "processUrls" hooks may alter the result by reference.
     *
     * @param int $pageId Page to resolve the configurations for
     * @return array Configurations keyed by configuration name
     */
    public function getUrlsForPageId(int $pageId): array
    {
        // Get page TSconfig for page ID
        $pageTSconfig = $this->getPageTSconfigForId($pageId);

        $res = [];

        // Fetch Crawler Configuration from pageTSconfig
        $crawlerCfg = $pageTSconfig['tx_crawler.']['crawlerCfg.']['paramSets.'] ?? [];
        foreach ($crawlerCfg as $key => $values) {
            if (! is_array($values)) {
                continue;
            }
            $key = str_replace('.', '', (string) $key);
            // Sub configuration for a single configuration string:
            $subCfg = (array) ($crawlerCfg[$key . '.'] ?? []);
            $subCfg['key'] = $key;

            if (strcmp($subCfg['procInstrFilter'] ?? '', '')) {
                $subCfg['procInstrFilter'] = implode(',', GeneralUtility::trimExplode(',', $subCfg['procInstrFilter']));
            }
            // "pidsOnly" is optional in TSconfig
            $pidsOnly = (string) ($subCfg['pidsOnly'] ?? '');
            $pidOnlyList = implode(',', GeneralUtility::trimExplode(',', $pidsOnly, true));

            // process configuration if it is not page-specific or if the specific page is the current page:
            // TODO: Check if $pidOnlyList can be kept as Array instead of imploded
            if ($pidsOnly === '' || GeneralUtility::inList($pidOnlyList, strval($pageId))) {

                // Explode, process etc.:
                $res[$key] = [];
                $res[$key]['subCfg'] = $subCfg;
                // A paramSet may consist of the sub configuration only
                $res[$key]['paramParsed'] = GeneralUtility::explodeUrl2Array($crawlerCfg[$key] ?? '');
                $res[$key]['paramExpanded'] = $this->expandParameters($res[$key]['paramParsed'], $pageId);
                $res[$key]['origin'] = 'pagets';

                // recognize MP value
                if (! $this->MP) {
                    $res[$key]['URLs'] = $this->compileUrls($res[$key]['paramExpanded'], ['?id=' . $pageId]);
                } else {
                    $res[$key]['URLs'] = $this->compileUrls($res[$key]['paramExpanded'], ['?id=' . $pageId . '&MP=' . $this->MP]);
                }
            }
        }

        // Get configuration from tx_crawler_configuration records up the rootline
        $crawlerConfigurations = $this->configurationRepository->getCrawlerConfigurationRecordsFromRootLine($pageId);
        foreach ($crawlerConfigurations as $configurationRecord) {

            // check access to the configuration record
            if (empty($configurationRecord['begroups']) || $this->getBackendUser()->isAdmin() || $this->hasGroupAccess($this->getBackendUser()->user['usergroup_cached_list'], $configurationRecord['begroups'])) {
                // The column may be NULL; strcmp() would throw a TypeError under strict_types
                $pidsOnly = (string) ($configurationRecord['pidsonly'] ?? '');
                $pidOnlyList = implode(',', GeneralUtility::trimExplode(',', $pidsOnly, true));

                // process configuration if it is not page-specific or if the specific page is the current page:
                // TODO: Check if $pidOnlyList can be kept as Array instead of imploded
                if ($pidsOnly === '' || GeneralUtility::inList($pidOnlyList, strval($pageId))) {
                    $key = $configurationRecord['name'];

                    // don't overwrite previously defined paramSets
                    if (! isset($res[$key])) {

                        /* @var $TSparserObject TypoScriptParser */
                        $TSparserObject = GeneralUtility::makeInstance(TypoScriptParser::class);
                        $TSparserObject->parse($configurationRecord['processing_instruction_parameters_ts']);

                        $subCfg = [
                            'procInstrFilter' => $configurationRecord['processing_instruction_filter'],
                            'procInstrParams.' => $TSparserObject->setup,
                            'baseUrl' => $configurationRecord['base_url'],
                            'force_ssl' => (int) $configurationRecord['force_ssl'],
                            'userGroups' => $configurationRecord['fegroups'],
                            'exclude' => $configurationRecord['exclude'],
                            'key' => $key,
                        ];

                        if (! in_array($pageId, $this->expandExcludeString($subCfg['exclude']), true)) {
                            $res[$key] = [];
                            $res[$key]['subCfg'] = $subCfg;
                            $res[$key]['paramParsed'] = GeneralUtility::explodeUrl2Array($configurationRecord['configuration']);
                            $res[$key]['paramExpanded'] = $this->expandParameters($res[$key]['paramParsed'], $pageId);
                            $res[$key]['URLs'] = $this->compileUrls($res[$key]['paramExpanded'], ['?id=' . $pageId]);
                            $res[$key]['origin'] = 'tx_crawler_configuration_' . $configurationRecord['uid'];
                        }
                    }
                }
            }
        }

        foreach ($GLOBALS['TYPO3_CONF_VARS']['EXTCONF']['crawler']['processUrls'] ?? [] as $func) {
            $params = [
                'res' => &$res,
            ];
            GeneralUtility::callUserFunction($func, $params, $this);
        }
        return $res;
    }
667 1
668
    /**
     * Collects the names of the crawler configurations that are relevant for a page branch.
     *
     * Two sources are combined, in this order:
     * - the keys below "tx_crawler.crawlerCfg.paramSets." of the page TSconfig of $rootid
     *   (only entries whose value is an array; a trailing dot of the key is removed)
     * - the "name" of all tx_crawler_configuration records whose pid is a page of the
     *   rootline of $rootid or a page of the tree fetched below $rootid
     *
     * TODO: Write Functional Tests
     *
     * @param int $rootid Uid of the page the branch starts at
     * @param int $depth Depth handed to PageTreeView::getTree() when collecting sub pages
     * @return array List of configuration names, not de-duplicated
     */
    public function getConfigurationsForBranch(int $rootid, int $depth): array
    {
        $names = [];

        // 1) Configurations declared in page TSconfig
        $tsConfig = $this->getPageTSconfigForId($rootid);
        $paramSets = $tsConfig['tx_crawler.']['crawlerCfg.']['paramSets.'] ?? [];
        foreach ($paramSets as $setKey => $setValue) {
            if (is_array($setValue)) {
                // Only array values describe a set; drop the trailing dot of the key if present
                $names[] = substr($setKey, -1) === '.' ? substr($setKey, 0, -1) : $setKey;
            }
        }

        // 2) Page uids of the rootline ...
        $pageUids = [];
        foreach (BackendUtility::BEgetRootLine($rootid) as $rootLinePage) {
            $pageUids[] = $rootLinePage['uid'];
        }

        // ... and of the page tree below the root page, limited by the SHOW permission of the backend user
        /** @var PageTreeView $pageTree */
        $pageTree = GeneralUtility::makeInstance(PageTreeView::class);
        $permsClause = $this->getBackendUser()->getPagePermsClause(Permission::PAGE_SHOW);
        if (empty($permsClause)) {
            $pageTree->init('');
        } else {
            $pageTree->init('AND ' . $permsClause);
        }
        $pageTree->getTree($rootid, $depth, '');
        foreach ($pageTree->tree as $treeEntry) {
            $pageUids[] = $treeEntry['row']['uid'];
        }

        // 3) Names of the configuration records stored on any of the collected pages
        $queryBuilder = $this->getQueryBuilder('tx_crawler_configuration');
        $pidConstraint = $queryBuilder->expr()->in(
            'pid',
            $queryBuilder->createNamedParameter($pageUids, Connection::PARAM_INT_ARRAY)
        );
        $result = $queryBuilder
            ->select('name')
            ->from('tx_crawler_configuration')
            ->where($pidConstraint)
            ->execute();

        while ($record = $result->fetch()) {
            $names[] = $record['name'];
        }

        return $names;
    }
711
712
    /**
     * Tells whether a user may access an item, based on group membership
     * (e.g. pass the group list of the current logged in user from $GLOBALS['TSFE']->gr_list)
     *
     * @param string $groupList Comma-separated list of (fe_)group UIDs from a user
     * @param string $accessList Comma-separated list of (fe_)group UIDs of the item to access
     * @return bool TRUE if the access list is empty or at least one of the user's group UIDs is part of it
     * @see \TYPO3\CMS\Frontend\Page\PageRepository::getMultipleGroupsWhereClause()
     */
    public function hasGroupAccess($groupList, $accessList)
    {
        // An item without access restriction is accessible for everybody
        if (empty($accessList)) {
            return true;
        }

        $accessGranted = false;
        $userGroupUids = GeneralUtility::intExplode(',', $groupList);
        foreach ($userGroupUids as $userGroupUid) {
            if (GeneralUtility::inList($accessList, $userGroupUid)) {
                // The first matching group is sufficient
                $accessGranted = true;
                break;
            }
        }

        return $accessGranted;
    }
733
734
    /**
     * Will expand the parameters configuration to individual values. This follows a certain syntax of the value of each parameter.
     * Syntax of values:
     * - Basically: If the value is wrapped in [...] it will be expanded according to the following syntax, otherwise the value is taken literally
     * - Configuration is split by "|" and the parts are processed individually and finally added together
     * - For each configuration part:
     *         - "[int]-[int]" = Integer range, will be expanded to all values in between, values included, starting from low to high (max. 1000). Example "1-34" or "-40--30"
     *         - "_TABLE:[TCA table name];[_PID:[optional page id, default is current page]];[_ENABLELANG:1]" = Look up of table records from PID, filtering out deleted records. Example "_TABLE:tt_content; _PID:123"
     *           _ENABLELANG:1 picks only original records without their language overlays
     *           Further optional sub parameters read by the code: _RECURSIVE (depth of the page tree below the PID),
     *           _PIDFIELD (field compared with the page ids, default "pid"), _FIELD (field whose values are used, default "uid"),
     *           _WHERE (additional condition) and _ADDTABLE (additional "from" part)
     *         - Default: Literal value
     * After each part the functions registered in
     * $GLOBALS['TYPO3_CONF_VARS']['SC_OPTIONS']['crawler/class.tx_crawler_lib.php']['expandParameters'] are called.
     *
     * @param array $paramArray Array with key (GET var name) and values (value of GET var which is configuration for expansion)
     * @param integer $pid Current page ID
     * @return array The input array where every value has been replaced by an array of values
     *
     * TODO: Write Functional Tests
     */
    public function expandParameters($paramArray, $pid)
    {
        // Traverse parameter names:
        foreach ($paramArray as $p => $v) {
            $v = trim($v);

            // If value is encapsulated in square brackets it means there are some ranges of values to find, otherwise the value is literal
            if (strpos($v, '[') === 0 && substr($v, -1) === ']') {
                // So, find the value inside brackets and reset the paramArray value as an array.
                $v = substr($v, 1, -1);
                $paramArray[$p] = [];

                // Explode parts and traverse them:
                $parts = explode('|', $v);
                foreach ($parts as $pV) {
                    // Look for integer range: (fx. 1-34 or -40--30 // reads minus 40 to minus 30)
                    if (preg_match('/^(-?[0-9]+)\s*-\s*(-?[0-9]+)$/', trim($pV), $reg)) {
                        $reg = $this->swapIfFirstIsLargerThanSecond($reg);

                        // Traverse range, add values:
                        // Limit to size of range!
                        $runAwayBrake = 1000;
                        for ($a = $reg[1]; $a <= $reg[2]; $a++) {
                            $paramArray[$p][] = $a;
                            $runAwayBrake--;
                            if ($runAwayBrake <= 0) {
                                break;
                            }
                        }
                    } elseif (strpos(trim($pV), '_TABLE:') === 0) {
                        // Parse the ";"-separated "_KEY:value" sub parameters:
                        $subpartParams = [];
                        foreach (GeneralUtility::trimExplode(';', $pV) as $spV) {
                            $pair = GeneralUtility::trimExplode(':', $spV);
                            // A sub parameter without ":" has no value: store null instead of reading a missing offset
                            $subpartParams[$pair[0]] = $pair[1] ?? null;
                        }
                        $table = $subpartParams['_TABLE'] ?? '';

                        // Table exists:
                        if (isset($GLOBALS['TCA'][$table])) {
                            $lookUpPid = isset($subpartParams['_PID']) ? (int) $subpartParams['_PID'] : (int) $pid;
                            $recursiveDepth = isset($subpartParams['_RECURSIVE']) ? (int) $subpartParams['_RECURSIVE'] : 0;
                            $pidField = isset($subpartParams['_PIDFIELD']) ? trim($subpartParams['_PIDFIELD']) : 'pid';
                            // NOTE(review): _WHERE and _ADDTABLE are handed to the query as given in the
                            // configuration string - confirm that only trusted users can edit that configuration.
                            $where = $subpartParams['_WHERE'] ?? '';
                            $addTable = $subpartParams['_ADDTABLE'] ?? '';

                            // _FIELD is optional; a missing or empty value falls back to "uid"
                            $fieldName = ! empty($subpartParams['_FIELD']) ? $subpartParams['_FIELD'] : 'uid';
                            // Any field other than "uid" has to be configured in the TCA columns of the table
                            if ($fieldName === 'uid' || ! empty($GLOBALS['TCA'][$table]['columns'][$fieldName])) {
                                $queryBuilder = $this->getQueryBuilder($table);

                                if ($recursiveDepth > 0) {
                                    /** @var QueryGenerator $queryGenerator */
                                    $queryGenerator = GeneralUtility::makeInstance(QueryGenerator::class);
                                    $pidList = $queryGenerator->getTreeList($lookUpPid, $recursiveDepth, 0, 1);
                                    $pidArray = GeneralUtility::intExplode(',', $pidList);
                                } else {
                                    $pidArray = [(string) $lookUpPid];
                                }

                                // Only the "deleted" restriction is applied to the lookup
                                $queryBuilder->getRestrictions()
                                    ->removeAll()
                                    ->add(GeneralUtility::makeInstance(DeletedRestriction::class));

                                $queryBuilder
                                    ->select($fieldName)
                                    ->from($table)
                                    ->where(
                                        $queryBuilder->expr()->in($pidField, $queryBuilder->createNamedParameter($pidArray, Connection::PARAM_INT_ARRAY)),
                                        $where
                                    );

                                if (! empty($addTable)) {
                                    // TODO: Check if this works as intended!
                                    $queryBuilder->add('from', $addTable);
                                }

                                // Not every table is translatable, so the ctrl entry may be missing
                                $transOrigPointerField = $GLOBALS['TCA'][$table]['ctrl']['transOrigPointerField'] ?? null;
                                if (! empty($subpartParams['_ENABLELANG']) && $transOrigPointerField) {
                                    $queryBuilder->andWhere(
                                        $queryBuilder->expr()->lte(
                                            $transOrigPointerField,
                                            0
                                        )
                                    );
                                }

                                $statement = $queryBuilder->execute();

                                // Index by the field value so every value is added only once
                                $rows = [];
                                while ($row = $statement->fetch()) {
                                    $rows[$row[$fieldName]] = $row;
                                }

                                $paramArray[$p] = array_merge($paramArray[$p], array_keys($rows));
                            }
                        }
                    } else {
                        // Just add value:
                        $paramArray[$p][] = $pV;
                    }

                    // Hook for processing own expandParameters place holder (no registration means no hooks)
                    $expandHooks = $GLOBALS['TYPO3_CONF_VARS']['SC_OPTIONS']['crawler/class.tx_crawler_lib.php']['expandParameters'] ?? null;
                    if (is_array($expandHooks)) {
                        $_params = [
                            'pObj' => &$this,
                            'paramArray' => &$paramArray,
                            'currentKey' => $p,
                            'currentValue' => $pV,
                            'pid' => $pid,
                        ];
                        foreach ($expandHooks as $_funcRef) {
                            GeneralUtility::callUserFunction($_funcRef, $_params, $this);
                        }
                    }
                }

                // Make unique set of values and sort array by key:
                $paramArray[$p] = array_unique($paramArray[$p]);
                ksort($paramArray);
            } else {
                // Set the literal value as only value in array:
                $paramArray[$p] = [$v];
            }
        }

        return $paramArray;
    }
881
882
    /**
     * Compiling URLs from parameter array (output of expandParameters())
     * The number of URLs will be the multiplication of the number of parameter values for each key,
     * unless the list grows beyond $this->maximumUrlsToCompile: compiling for the current
     * parameter then stops for all remaining base URLs and values.
     *
     * An empty value does not add a parameter; the base URL is taken over unchanged.
     *
     * @param array $paramArray Output of expandParameters(): Array with keys (GET var names) and for each an array of values
     * @param array $urls URLs accumulated in this array (for recursion)
     * @return array
     */
    public function compileUrls($paramArray, array $urls)
    {
        if (empty($paramArray)) {
            return $urls;
        }

        // Take the first parameter off the stack. unset() is used instead of array_shift(),
        // because array_shift() renumbers the remaining integer keys and would thereby
        // rename numeric GET variable names.
        reset($paramArray);
        $varName = key($paramArray);
        $valueSet = $paramArray[$varName];
        unset($paramArray[$varName]);

        // Array keys may be integers; rawurlencode() needs a string under strict_types
        $encodedVarName = rawurlencode((string) $varName);

        // Traverse value set:
        $newUrls = [];
        foreach ($urls as $url) {
            foreach ($valueSet as $val) {
                $value = (string) $val;
                $newUrls[] = $value === '' ? $url : $url . '&' . $encodedVarName . '=' . rawurlencode($value);

                if (count($newUrls) > $this->maximumUrlsToCompile) {
                    // Leave both loops, otherwise every remaining base URL would still add one more URL
                    break 2;
                }
            }
        }

        return $this->compileUrls($paramArray, $newUrls);
    }
913
914
    /************************************
915
     *
916
     * Crawler log
917
     *
918
     ************************************/
919
920
    /**
     * Return array of records from crawler queue for input page ID, newest "scheduled" first
     *
     * @param integer $id Page ID for which to look up log entries.
     * @param QueueFilter $queueFilter Filter: "pending" => only entries with exec_time = 0, "finished" => only entries with exec_time > 0, anything else => no filter
     * @param boolean $doFlush If TRUE, $this->queueRepository->flushQueue() is called with the filter before the entries are read
     * @param boolean $doFullFlush Not used by this method
     * @param integer $itemsPerPage Limit the amount of entries per page default is 10; a value of 0 or less disables the limit
     * @return array
     *
     * @deprecated
     */
    public function getLogEntriesForPageId($id, QueueFilter $queueFilter, $doFlush = false, $doFullFlush = false, $itemsPerPage = 10)
    {
        $queryBuilder = GeneralUtility::makeInstance(ConnectionPool::class)->getQueryBuilderForTable($this->tableName);
        $queryBuilder
            ->select('*')
            ->from($this->tableName)
            ->where(
                $queryBuilder->expr()->eq('page_id', $queryBuilder->createNamedParameter($id, PDO::PARAM_INT))
            )
            ->orderBy('scheduled', 'DESC');

        // NOTE(review): the filter object is compared loosely with strings here -
        // presumably QueueFilter can be cast to string; verify against its class.
        switch ($queueFilter) {
            case 'pending':
                $queryBuilder->andWhere($queryBuilder->expr()->eq('exec_time', 0));
                break;
            case 'finished':
                $queryBuilder->andWhere($queryBuilder->expr()->gt('exec_time', 0));
                break;
        }

        if ($doFlush) {
            $this->queueRepository->flushQueue($queueFilter);
        }
        if ($itemsPerPage > 0) {
            $queryBuilder
                ->setMaxResults((int) $itemsPerPage);
        }

        return $queryBuilder->execute()->fetchAll();
    }
968
969
    /**
     * Return array of records from crawler queue for input set ID, newest "scheduled" first
     *
     * @param int $set_id Set ID for which to look up log entries.
     * @param string $filter Filter: "all" => all entries, "pending" => all that is not yet run, "finished" => all complete ones
     * @param bool $doFlush If TRUE, nothing is selected: flushQueue() is called and an empty array is returned
     * @param bool $doFullFlush Only relevant together with $doFlush: if TRUE, flushQueue() is called with an empty condition instead of the filter/set condition
     * @param int $itemsPerPage Limit the amount of entries per page default is 10; a value of 0 or less disables the limit
     * @return array
     *
     * @deprecated
     */
    public function getLogEntriesForSetId(int $set_id, string $filter = '', bool $doFlush = false, bool $doFullFlush = false, int $itemsPerPage = 10)
    {
        $queryBuilder = GeneralUtility::makeInstance(ConnectionPool::class)->getQueryBuilderForTable($this->tableName);
        $queryBuilder
            ->select('*')
            ->from($this->tableName)
            ->where(
                $queryBuilder->expr()->eq('set_id', $queryBuilder->createNamedParameter($set_id, PDO::PARAM_INT))
            )
            ->orderBy('scheduled', 'DESC');

        $expressionBuilder = GeneralUtility::makeInstance(ConnectionPool::class)
            ->getConnectionForTable($this->tableName)
            ->getExpressionBuilder();
        // Collects the same conditions a second time, AND-combined, as condition for flushQueue()
        $query = $expressionBuilder->andX();
        switch ($filter) {
            case 'pending':
                $queryBuilder->andWhere($queryBuilder->expr()->eq('exec_time', 0));
                $query->add($expressionBuilder->eq('exec_time', 0));
                break;
            case 'finished':
                $queryBuilder->andWhere($queryBuilder->expr()->gt('exec_time', 0));
                $query->add($expressionBuilder->gt('exec_time', 0));
                break;
        }
        if ($doFlush) {
            $addWhere = $query->add($expressionBuilder->eq('set_id', $set_id));
            $this->flushQueue($doFullFlush ? '' : $addWhere);
            return [];
        }
        if ($itemsPerPage > 0) {
            $queryBuilder
                ->setMaxResults($itemsPerPage);
        }

        return $queryBuilder->execute()->fetchAll();
    }
1021
1022
    /**
     * Adds a call back entry to the queue table (typically called from hooks, see indexed search class "class.crawler.php")
     *
     * The call back reference is stored in the JSON encoded parameters under the key "_CALLBACKOBJ".
     *
     * @param integer $setId Set ID
     * @param array $params Parameters to pass to call back function; anything that is not an array is replaced by an empty array
     * @param string $callBack Call back object reference, eg. 'EXT:indexed_search/class.crawler.php:&tx_indexedsearch_crawler'
     * @param integer $page_id Page ID to attach it to
     * @param integer $schedule Time at which to activate; 0 means the current time
     */
    public function addQueueEntry_callBack($setId, $params, $callBack, $page_id = 0, $schedule = 0): void
    {
        $callBackParameters = is_array($params) ? $params : [];
        $callBackParameters['_CALLBACKOBJ'] = $callBack;

        $queueConnection = GeneralUtility::makeInstance(ConnectionPool::class)
            ->getConnectionForTable('tx_crawler_queue');

        // Without an explicit schedule the entry is due right away
        $scheduledAt = (int) $schedule;
        if (! $scheduledAt) {
            $scheduledAt = $this->getCurrentTime();
        }

        $queueEntry = [
            'page_id' => (int) $page_id,
            'parameters' => json_encode($callBackParameters),
            'scheduled' => $scheduledAt,
            'exec_time' => 0,
            'set_id' => (int) $setId,
            'result_data' => '',
        ];
        $queueConnection->insert('tx_crawler_queue', $queueEntry);
    }
1051
1052
    /************************************
1053
     *
1054
     * URL setting
1055
     *
1056
     ************************************/
1057
1058
    /**
     * Setting a URL for crawling:
     *
     * Builds the queue entry for the URL and either registers it internally only
     * (when $this->registerQueueEntriesInternallyOnly is set) or inserts it into tx_crawler_queue,
     * unless the duplication check finds an equal entry. A signal is emitted for both the
     * "added" and the "duplicate" case.
     *
     * @param integer $id Page ID
     * @param string $url Complete URL
     * @param array $subCfg Sub configuration array (from TS config); the keys "userGroups", "procInstrFilter", "procInstrParams." and "key" are read and may be missing
     * @param integer $tstamp Scheduled-time
     * @param string $configurationHash (optional) configuration hash
     * @param bool $skipInnerDuplicationCheck (optional) skip inner duplication check
     * @return bool TRUE if a new row was inserted into the queue table, otherwise FALSE (also when the entry was registered internally only)
     */
    public function addUrl(
        $id,
        $url,
        array $subCfg,
        $tstamp,
        $configurationHash = '',
        $skipInnerDuplicationCheck = false
    ) {
        $urlAdded = false;
        $rows = [];

        // Creating parameters:
        $parameters = [
            'url' => $url,
        ];

        // fe user group simulation:
        $uGs = implode(',', array_unique(GeneralUtility::intExplode(',', $subCfg['userGroups'] ?? '', true)));
        if ($uGs) {
            $parameters['feUserGroupList'] = $uGs;
        }

        // Setting processing instructions
        $parameters['procInstructions'] = GeneralUtility::trimExplode(',', $subCfg['procInstrFilter'] ?? '');
        if (is_array($subCfg['procInstrParams.'] ?? null)) {
            $parameters['procInstrParams'] = $subCfg['procInstrParams.'];
        }

        // Compile value array:
        $parameters_serialized = json_encode($parameters);
        $fieldArray = [
            'page_id' => (int) $id,
            'parameters' => $parameters_serialized,
            'parameters_hash' => GeneralUtility::shortMD5($parameters_serialized),
            'configuration_hash' => $configurationHash,
            'scheduled' => $tstamp,
            'exec_time' => 0,
            'set_id' => (int) $this->setID,
            'result_data' => '',
            'configuration' => $subCfg['key'] ?? null,
        ];

        if ($this->registerQueueEntriesInternallyOnly) {
            //the entries will only be registered and not stored to the database
            $this->queueEntries[] = $fieldArray;

            return $urlAdded;
        }

        if (! $skipInnerDuplicationCheck) {
            // check if there is already an equal entry
            $rows = $this->queueRepository->getDuplicateQueueItemsIfExists(
                (bool) ($this->extensionSettings['enableTimeslot'] ?? false),
                $tstamp,
                $this->getCurrentTime(),
                $fieldArray['page_id'],
                $fieldArray['parameters_hash']
            );
        }

        if (empty($rows)) {
            $connectionForCrawlerQueue = GeneralUtility::makeInstance(ConnectionPool::class)->getConnectionForTable('tx_crawler_queue');
            $connectionForCrawlerQueue->insert(
                'tx_crawler_queue',
                $fieldArray
            );
            $uid = $connectionForCrawlerQueue->lastInsertId('tx_crawler_queue', 'qid');
            $urlAdded = true;

            $signalPayload = ['uid' => $uid, 'fieldArray' => $fieldArray];
            SignalSlotUtility::emitSignal(
                self::class,
                SignalSlotUtility::SIGNAL_URL_ADDED_TO_QUEUE,
                $signalPayload
            );
        } else {
            $signalPayload = ['rows' => $rows, 'fieldArray' => $fieldArray];
            SignalSlotUtility::emitSignal(
                self::class,
                SignalSlotUtility::SIGNAL_DUPLICATE_URL_IN_QUEUE,
                $signalPayload
            );
        }

        return $urlAdded;
    }
1154
1155 2
    /**
     * Provides the current system time as returned by time()
     *
     * @return int Unix timestamp
     */
    public function getCurrentTime()
    {
        $now = time();

        return $now;
    }
1164
1165
    /************************************
1166
     *
1167
     * URL reading
1168
     *
1169
     ************************************/
1170
1171
    /**
     * Read URL for single queue entry
     *
     * Loads the queue record, sets its exec_time to lock it, executes it through
     * $this->queueExecutor and stores the JSON encoded result in result_data.
     * Signals are emitted before the execution and before the result is stored.
     *
     * @param integer $queueId
     * @param boolean $force If set, will process even if exec_time has been set!
     * @return integer|null Status bits: self::CLI_STATUS_POLLABLE_PROCESSED is set if a configured
     *                      "pollSuccess" instruction was part of the entry and reported success,
     *                      otherwise 0; null if no matching queue record was found
     */
    public function readUrl($queueId, $force = false)
    {
        $queryBuilder = GeneralUtility::makeInstance(ConnectionPool::class)->getQueryBuilderForTable($this->tableName);
        $ret = 0;
        $this->logger->debug('crawler-readurl start ' . microtime(true));
        // Get entry:
        $queryBuilder
            ->select('*')
            ->from('tx_crawler_queue')
            ->where(
                $queryBuilder->expr()->eq('qid', $queryBuilder->createNamedParameter($queueId, PDO::PARAM_INT))
            );
        if (! $force) {
            // Only entries that have not been executed yet and are assigned to a process
            $queryBuilder
                ->andWhere('exec_time = 0')
                ->andWhere('process_scheduled > 0');
        }
        $queueRec = $queryBuilder->execute()->fetch();

        if (! is_array($queueRec)) {
            return;
        }

        SignalSlotUtility::emitSignal(
            self::class,
            SignalSlotUtility::SIGNAL_QUEUEITEM_PREPROCESS,
            [$queueId, &$queueRec]
        );

        // Set exec_time to lock record:
        $field_array = ['exec_time' => $this->getCurrentTime()];

        if (isset($this->processID)) {
            //if multiprocessing is used we need to store the id of the process which has handled this entry
            $field_array['process_id_completed'] = $this->processID;
        }

        GeneralUtility::makeInstance(ConnectionPool::class)->getConnectionForTable('tx_crawler_queue')
            ->update(
                'tx_crawler_queue',
                $field_array,
                ['qid' => (int) $queueId]
            );

        $result = $this->queueExecutor->executeQueueItem($queueRec, $this);
        if (($result['content'] ?? null) === null) {
            $resultData = 'An errors happened';
        } else {
            /** @var JsonCompatibilityConverter $jsonCompatibilityConverter */
            $jsonCompatibilityConverter = GeneralUtility::makeInstance(JsonCompatibilityConverter::class);
            $resultData = $jsonCompatibilityConverter->convert($result['content']);
        }

        //atm there's no need to point to specific pollable extensions
        $pollables = $GLOBALS['TYPO3_CONF_VARS']['EXTCONF']['crawler']['pollSuccess'] ?? null;
        // $resultData is a plain string in the error case and must not be read like an array then
        $procInstructions = is_array($resultData) ? ($resultData['parameters']['procInstructions'] ?? null) : null;
        if (is_array($pollables) && is_array($procInstructions)) {
            foreach ($pollables as $pollable) {
                // only check the success value if the instruction is running
                // it is important to name the pollSuccess key same as the procInstructions key
                if (in_array($pollable, $procInstructions, true) && ! empty($resultData['success'][$pollable])) {
                    $ret |= self::CLI_STATUS_POLLABLE_PROCESSED;
                }
            }
        }

        // Set result in log which also denotes the end of the processing of this entry.
        $field_array = ['result_data' => json_encode($result)];

        SignalSlotUtility::emitSignal(
            self::class,
            SignalSlotUtility::SIGNAL_QUEUEITEM_POSTPROCESS,
            [$queueId, &$field_array]
        );

        GeneralUtility::makeInstance(ConnectionPool::class)->getConnectionForTable('tx_crawler_queue')
            ->update(
                'tx_crawler_queue',
                $field_array,
                ['qid' => (int) $queueId]
            );

        $this->logger->debug('crawler-readurl stop ' . microtime(true));
        return $ret;
    }
1268
1269
    /**
     * Read URL for not-yet-inserted log-entry
     *
     * The entry is inserted into the queue table with exec_time set to the current time,
     * executed through $this->queueExecutor and afterwards updated with the JSON encoded result.
     *
     * @param array $field_array Queue field array,
     *
     * @return array|bool|mixed|string The result of the queue executor
     */
    public function readUrlFromArray($field_array)
    {
        // Set exec_time to lock record:
        $field_array['exec_time'] = $this->getCurrentTime();

        $queueConnection = GeneralUtility::makeInstance(ConnectionPool::class)
            ->getConnectionForTable($this->tableName);
        $queueConnection->insert($this->tableName, $field_array);
        $insertedQueueId = $queueConnection->lastInsertId($this->tableName, 'qid');

        $executionResult = $this->queueExecutor->executeQueueItem($field_array, $this);

        // Set result in log which also denotes the end of the processing of this entry.
        // The array is handed over by reference, so the signal receivers get a reference to the fields that are stored.
        $resultFields = ['result_data' => json_encode($executionResult)];
        $signalArguments = [$insertedQueueId, &$resultFields];
        SignalSlotUtility::emitSignal(
            self::class,
            SignalSlotUtility::SIGNAL_QUEUEITEM_POSTPROCESS,
            $signalArguments
        );

        $queueConnection->update(
            $this->tableName,
            $resultFields,
            ['qid' => $insertedQueueId]
        );

        return $executionResult;
    }
1305
1306
    /*****************************
1307
     *
1308
     * Compiling URLs to crawl - tools
1309
     *
1310
     *****************************/
1311
1312
    /**
     * Walks the page tree starting at a root page and compiles the table rows
     * produced by drawURLs_addRowsForPage() for every page found.
     *
     * The arguments are stored on the controller first, because
     * drawURLs_addRowsForPage() reads them from there. For pages of type
     * "mount point" the rows of the tree below the mounted page are added as well.
     *
     * @param int $id Root page id to start from.
     * @param int $depth Depth of tree, 0=only id-page, 1= on sublevel, 99 = infinite
     * @param int $scheduledTime Unix Time when the URL is timed to be visited when put in queue
     * @param int $reqMinute Number of requests per minute (creates the interleave between requests)
     * @param bool $submitCrawlUrls If set, submits the URLs to queue in database (real crawling)
     * @param bool $downloadCrawlUrls If set (and submitcrawlUrls is false) will fill $downloadUrls with entries)
     * @param array $incomingProcInstructions Array of processing instructions
     * @param array $configurationSelection Array of configuration keys
     * @return string Concatenated HTML table rows
     */
    public function getPageTreeAndUrls(
        $id,
        $depth,
        $scheduledTime,
        $reqMinute,
        $submitCrawlUrls,
        $downloadCrawlUrls,
        array $incomingProcInstructions,
        array $configurationSelection
    ) {
        $this->scheduledTime = $scheduledTime;
        $this->reqMinute = $reqMinute;
        $this->submitCrawlUrls = $submitCrawlUrls;
        $this->downloadCrawlUrls = $downloadCrawlUrls;
        $this->incomingProcInstructions = $incomingProcInstructions;
        $this->incomingConfigurationSelection = $configurationSelection;

        $this->duplicateTrack = [];
        $this->downloadUrls = [];

        // Drawing tree:
        /** @var PageTreeView $tree */
        $tree = GeneralUtility::makeInstance(PageTreeView::class);
        $perms_clause = $this->getBackendUser()->getPagePermsClause(Permission::PAGE_SHOW);
        $tree->init('AND ' . $perms_clause);

        $pageInfo = BackendUtility::readPageAccess($id, $perms_clause);
        if (is_array($pageInfo)) {
            // Set root row:
            $tree->tree[] = [
                'row' => $pageInfo,
                'HTML' => $this->iconFactory->getIconForRecord('pages', $pageInfo, Icon::SIZE_SMALL),
            ];
        }

        // Get branch beneath:
        if ($depth) {
            $tree->getTree($id, $depth, '');
        }

        // Traverse page tree:
        $code = '';

        foreach ($tree->tree as $data) {
            $this->MP = false;

            // recognize mount points
            // The cast makes the strict comparison work when the row holds the doktype as a numeric string.
            if ((int) $data['row']['doktype'] === PageRepository::DOKTYPE_MOUNTPOINT) {
                // NOTE(review): index 0 is read below, i.e. getPage() is assumed to return
                // a list of rows here - confirm against the injected page repository.
                $mountpage = $this->pageRepository->getPage($data['row']['uid']);

                // fetch mounted pages
                $this->MP = $mountpage[0]['mount_pid'] . '-' . $data['row']['uid'];

                $mountTree = GeneralUtility::makeInstance(PageTreeView::class);
                $mountTree->init('AND ' . $perms_clause);
                $mountTree->getTree($mountpage[0]['mount_pid'], $depth);

                foreach ($mountTree->tree as $mountData) {
                    $code .= $this->drawURLs_addRowsForPage(
                        $mountData['row'],
                        $mountData['HTML'] . BackendUtility::getRecordTitle('pages', $mountData['row'], true)
                    );
                }

                // replace page when mount_pid_ol is enabled
                if ($mountpage[0]['mount_pid_ol']) {
                    $data['row']['uid'] = $mountpage[0]['mount_pid'];
                } else {
                    // if the mount_pid_ol is not set the MP must not be used for the mountpoint page
                    $this->MP = false;
                }
            }

            $code .= $this->drawURLs_addRowsForPage(
                $data['row'],
                $data['HTML'] . BackendUtility::getRecordTitle('pages', $data['row'], true)
            );
        }

        return $code;
    }
1404
1405
    /**
     * Expands an exclude string into a list of page ids.
     *
     * The string is a comma separated list of parts of the form "pid", "pid+"
     * or "pid+depth". A part without "+" contributes only the page itself, a
     * part with "+" but without a (non-zero) depth is expanded with depth 99.
     * Results are memoised per exclude string in a static cache.
     *
     * @param string $excludeString Exclude string
     * @return array List of unique page ids (integers)
     */
    public function expandExcludeString($excludeString)
    {
        // internal static caches;
        static $expandedExcludeStringCache;
        static $treeCache;

        // Normalise the key so a null argument maps to the empty-string entry explicitly
        $cacheKey = (string) $excludeString;

        if (empty($expandedExcludeStringCache[$cacheKey])) {
            $pidList = [];

            if (! empty($excludeString)) {
                /** @var PageTreeView $tree */
                $tree = GeneralUtility::makeInstance(PageTreeView::class);
                $tree->init('AND ' . $this->getBackendUser()->getPagePermsClause(Permission::PAGE_SHOW));

                foreach (GeneralUtility::trimExplode(',', $excludeString) as $excludePart) {
                    // A part without "+" yields a single element; pad it so that
                    // the destructuring never reads a missing index.
                    [$pid, $depth] = array_pad(GeneralUtility::trimExplode('+', $excludePart), 2, '');

                    // default is "page only" = "depth=0"
                    if (empty($depth)) {
                        $depth = strpos($excludePart, '+') !== false ? 99 : 0;
                    }

                    // The values come out of a string split; work with integers from here on
                    $pid = (int) $pid;
                    $depth = (int) $depth;

                    $pidList[] = $pid;

                    if ($depth > 0) {
                        if (empty($treeCache[$pid][$depth])) {
                            $tree->reset();
                            $tree->getTree($pid, $depth);
                            $treeCache[$pid][$depth] = $tree->tree;
                        }

                        foreach ($treeCache[$pid][$depth] as $data) {
                            $pidList[] = (int) $data['row']['uid'];
                        }
                    }
                }
            }

            $expandedExcludeStringCache[$cacheKey] = array_unique($pidList);
        }

        return $expandedExcludeStringCache[$cacheKey];
    }
1456
1457
    /**
     * Create the rows for display of the page tree
     * For each page a number of rows are shown displaying GET variable configuration
     *
     * Configurations that are not part of the incoming configuration selection
     * are dropped. If no configuration is left, a single "No entries" row is
     * returned, followed by the skip message when one was set.
     *
     * @param array $pageRow Page record; "uid" is read here
     * @param string $pageTitle HTML for the title cell, inserted as-is
     * @return string HTML table rows
     */
    public function drawURLs_addRowsForPage(array $pageRow, string $pageTitle): string
    {
        $skipMessage = '';

        // Get list of configurations
        $configurations = $this->getUrlsForPageRow($pageRow, $skipMessage);

        if (! empty($this->incomingConfigurationSelection)) {
            // remove configuration that does not match the current selection
            foreach ($configurations as $confKey => $confArray) {
                if (! in_array($confKey, $this->incomingConfigurationSelection, true)) {
                    unset($configurations[$confKey]);
                }
            }
        }

        // Traverse parameter combinations:
        $c = 0;
        $content = '';
        if (! empty($configurations)) {
            foreach ($configurations as $confKey => $confArray) {
                // Title column: spans all configuration rows, so only the first row carries it
                $titleClm = $c === 0
                    ? '<td rowspan="' . count($configurations) . '">' . $pageTitle . '</td>'
                    : '';

                // expandExcludeString() returns integers; cast the uid so that the strict
                // comparison also matches when the page row holds it as a numeric string.
                $excludedPageIds = $this->expandExcludeString($confArray['subCfg']['exclude'] ?? '');
                if (! in_array((int) $pageRow['uid'], $excludedPageIds, true)) {
                    // URL list:
                    $urlList = $this->urlListFromUrlArray(
                        $confArray,
                        $pageRow,
                        $this->scheduledTime,
                        $this->reqMinute,
                        $this->submitCrawlUrls,
                        $this->downloadCrawlUrls,
                        $this->duplicateTrack,
                        $this->downloadUrls,
                        // if empty the urls won't be filtered by processing instructions
                        $this->incomingProcInstructions
                    );

                    // Expanded parameters:
                    $paramExpanded = '';
                    $calcAccu = [];
                    $calcRes = 1;
                    foreach ($confArray['paramExpanded'] as $gVar => $gVal) {
                        $paramExpanded .= '
                            <tr>
                                <td>' . htmlspecialchars('&' . $gVar . '=') . '<br/>' .
                            '(' . count($gVal) . ')' .
                            '</td>
                                <td nowrap="nowrap">' . nl2br(htmlspecialchars(implode(chr(10), $gVal))) . '</td>
                            </tr>
                        ';
                        $calcRes *= count($gVal);
                        $calcAccu[] = count($gVal);
                    }
                    $paramExpanded = '<table>' . $paramExpanded . '</table>';
                    $paramExpanded .= 'Comb: ' . implode('*', $calcAccu) . '=' . $calcRes;

                    // Options (configuration values are escaped, they are printed into HTML)
                    $optionValues = '';
                    if (! empty($confArray['subCfg']['userGroups'])) {
                        $optionValues .= 'User Groups: ' . htmlspecialchars((string) $confArray['subCfg']['userGroups']) . '<br/>';
                    }
                    if (! empty($confArray['subCfg']['procInstrFilter'])) {
                        $optionValues .= 'ProcInstr: ' . htmlspecialchars((string) $confArray['subCfg']['procInstrFilter']) . '<br/>';
                    }

                    // Compile row:
                    $content .= '
                        <tr>
                            ' . $titleClm . '
                            <td>' . htmlspecialchars($confKey) . '</td>
                            <td>' . nl2br(htmlspecialchars(rawurldecode(trim(str_replace('&', chr(10) . '&', GeneralUtility::implodeArrayForUrl('', $confArray['paramParsed'])))))) . '</td>
                            <td>' . $paramExpanded . '</td>
                            <td nowrap="nowrap">' . $urlList . '</td>
                            <td nowrap="nowrap">' . $optionValues . '</td>
                            <td nowrap="nowrap">' . DebugUtility::viewArray($confArray['subCfg']['procInstrParams.'] ?? null) . '</td>
                        </tr>';
                } else {
                    $content .= '<tr>
                            ' . $titleClm . '
                            <td>' . htmlspecialchars($confKey) . '</td>
                            <td colspan="5"><em>No entries</em> (Page is excluded in this configuration)</td>
                        </tr>';
                }

                $c++;
            }
        } else {
            $message = ! empty($skipMessage) ? ' (' . $skipMessage . ')' : '';

            // Compile row:
            $content .= '
                <tr>
                    <td>' . $pageTitle . '</td>
                    <td colspan="6"><em>No entries</em>' . $message . '</td>
                </tr>';
        }

        return $content;
    }
1568
1569
    /*****************************
1570
     *
1571
     * CLI functions
1572
     *
1573
     *****************************/
1574
1575
    /**
     * Running the functionality of the CLI (crawling URLs from queue)
     *
     * Runs the CLI hooks, cleans up the queue and fetches up to $countInARun
     * records. The records are assigned to the current process id and read one
     * by one; the status bits returned by readUrl() are combined into the result.
     *
     * @param int $countInARun Passed to the repository when fetching records to be crawled
     * @param int $sleepTime Passed to usleep() after each processed entry
     * @param int $sleepAfterFinish Passed to sleep() after the loop
     * @return int Bitmask built from the CLI_STATUS_* constants (0 if nothing was processed)
     */
    public function CLI_run(int $countInARun, int $sleepTime, int $sleepAfterFinish): int
    {
        // First, run hooks:
        $this->CLI_runHooks();

        // Clean up the queue
        $this->queueRepository->cleanupQueue();

        // Select entries:
        $rows = $this->queueRepository->fetchRecordsToBeCrawled($countInARun);

        if (empty($rows)) {
            $this->CLI_debug('Nothing within queue which needs to be processed (' . $this->CLI_buildProcessId() . ')');
            return 0;
        }

        $queueIds = [];
        foreach ($rows as $row) {
            $queueIds[] = $row['qid'];
        }

        $processId = $this->CLI_buildProcessId();

        //save the number of assigned queue entries to determine how many have been processed later
        $assignedCount = $this->queueRepository->updateProcessIdAndSchedulerForQueueIds($queueIds, $processId);
        $this->processRepository->updateProcessAssignItemsCount($assignedCount, $processId);

        if ($assignedCount !== count($queueIds)) {
            $this->CLI_debug('Nothing processed due to multi-process collision (' . $this->CLI_buildProcessId() . ')');
            return self::CLI_STATUS_ABORTED;
        }

        $status = 0;
        $processedCount = 0;

        foreach ($rows as $row) {
            $status |= $this->readUrl($row['qid']);
            $processedCount++;

            // Just to relax the system
            usleep($sleepTime);

            // if during the start and the current read url the cli has been disable we need to return from the function
            // mark the process NOT as ended.
            if ($this->crawler->isDisabled()) {
                return $status | self::CLI_STATUS_ABORTED;
            }

            if (! $this->processRepository->isProcessActive($this->CLI_buildProcessId())) {
                $this->CLI_debug('conflict / timeout (' . $this->CLI_buildProcessId() . ')');
                $status |= self::CLI_STATUS_ABORTED;
                //possible timeout
                break;
            }
        }

        sleep($sleepAfterFinish);

        $this->CLI_debug('Rows: ' . $processedCount . ' (' . $this->CLI_buildProcessId() . ')');

        if ($processedCount > 0) {
            $status |= self::CLI_STATUS_PROCESSED;
        }

        return $status;
    }
1645
1646
    /**
     * Activate hooks
     *
     * Instantiates every class registered under
     * $GLOBALS['TYPO3_CONF_VARS']['EXTCONF']['crawler']['cli_hooks'] and calls
     * crawler_init() on it with this controller.
     */
    public function CLI_runHooks(): void
    {
        $hookClasses = $GLOBALS['TYPO3_CONF_VARS']['EXTCONF']['crawler']['cli_hooks'] ?? [];

        foreach ($hookClasses as $hookClass) {
            $hook = GeneralUtility::makeInstance($hookClass);
            if (! is_object($hook)) {
                continue;
            }
            $hook->crawler_init($this);
        }
    }
1658
1659
    /**
     * Try to acquire a new process with the given id
     * also performs some auto-cleanup for orphan processes
     *
     * Active processes whose ttl is below the current time count as orphans and
     * are released at the end; all others count against the "processLimit" setting.
     *
     * @param string $id identification string for the process
     * @return boolean true if a process record was inserted, false if the limit is
     *                 reached or the system process id could not be determined
     * @todo preemption might not be the most elegant way to clean up
     */
    public function CLI_checkAndAcquireNewProcess($id)
    {
        $systemProcessId = getmypid();
        if (! $systemProcessId) {
            return false;
        }

        $activeProcesses = $this->processRepository->findAllActive();
        $currentTime = $this->getCurrentTime();

        $runningCount = 0;
        $orphanProcessIds = [];

        /** @var Process $process */
        foreach ($activeProcesses as $process) {
            if ($process->getTtl() < $currentTime) {
                $orphanProcessIds[] = $process->getProcessId();
                continue;
            }
            $runningCount++;
        }

        $processLimit = (int) $this->extensionSettings['processLimit'];

        // if there are less than allowed active processes then add a new one
        $acquired = $runningCount < $processLimit;
        if ($acquired) {
            $this->CLI_debug('add process ' . $this->CLI_buildProcessId() . ' (' . ($runningCount + 1) . '/' . $processLimit . ')');

            GeneralUtility::makeInstance(ConnectionPool::class)
                ->getConnectionForTable('tx_crawler_process')
                ->insert(
                    'tx_crawler_process',
                    [
                        'process_id' => $id,
                        'active' => 1,
                        'ttl' => $currentTime + (int) $this->extensionSettings['processMaxRunTime'],
                        'system_process_id' => $systemProcessId,
                    ]
                );
        } else {
            $this->CLI_debug('Processlimit reached (' . $runningCount . '/' . $processLimit . ')');
        }

        $this->processRepository->deleteProcessesMarkedAsDeleted();
        $this->CLI_releaseProcesses($orphanProcessIds);

        return $acquired;
    }
1713
1714
    /**
     * Release a process and the required resources
     *
     * @param mixed $releaseIds string with a single process-id or array with multiple process-ids
     * @return boolean false if there was nothing to release, true otherwise
     */
    public function CLI_releaseProcesses($releaseIds)
    {
        $processIds = is_array($releaseIds) ? $releaseIds : [$releaseIds];

        if ($processIds === []) {
            //nothing to release
            return false;
        }

        $queryBuilder = GeneralUtility::makeInstance(ConnectionPool::class)->getQueryBuilderForTable($this->tableName);

        // some kind of 2nd chance algo - this way you need at least 2 processes to have a real cleanup
        // this ensures that a single process can't mess up the entire process table

        // mark all processes as deleted which have no "waiting" queue-entries and which are not active

        // Step 1: detach queue entries that still point to processes which are no longer active
        $queryBuilder
            ->update($this->tableName, 'q')
            ->set('q.process_scheduled', 0)
            ->set('q.process_id', '')
            ->where(
                'q.process_id IN(SELECT p.process_id FROM tx_crawler_process as p WHERE p.active = 0)'
            )
            ->execute();

        // FIXME: Not entirely sure that this is equivalent to the previous version
        // The builder is reused below, so the "set" part of the first statement is dropped first.
        $queryBuilder->resetQueryPart('set');

        // Step 2: reset the system process id of inactive processes matched by the sub-select
        $queryBuilder
            ->update('tx_crawler_process')
            ->set('system_process_id', 0)
            ->where(
                $queryBuilder->expr()->eq('active', 0),
                'process_id IN(SELECT q.process_id FROM tx_crawler_queue as q WHERE q.exec_time = 0)'
            )
            ->execute();

        $this->processRepository->markRequestedProcessesAsNotActive($processIds);
        $this->queueRepository->unsetProcessScheduledAndProcessIdForQueueEntries($processIds);

        return true;
    }
1764
1765 1
    /**
     * Create a unique Id for the current process
     *
     * The id is derived from the current microtime on the first call and kept
     * in $this->processID, so later calls return the same value.
     *
     * @return string the ID
     */
    public function CLI_buildProcessId()
    {
        if ($this->processID) {
            return $this->processID;
        }

        $this->processID = GeneralUtility::shortMD5(microtime(true));

        return $this->processID;
    }
1777
1778
    /**
     * Prints a message to the stdout (only if debug-mode is enabled)
     *
     * Debug mode is read from the "processDebug" extension setting.
     *
     * @param string $msg the message
     * @deprecated
     * @codeCoverageIgnore
     */
    public function CLI_debug($msg): void
    {
        $debugEnabled = (int) $this->extensionSettings['processDebug'] !== 0;
        if (! $debugEnabled) {
            return;
        }

        echo $msg . "\n";
        flush();
    }
1792
1793
    /**
     * Cleans up entries that stayed for too long in the queue. These are:
     * - processed entries older than the "cleanUpProcessedAge" setting (in days)
     * - scheduled entries older than the "cleanUpScheduledAge" setting (in days)
     *
     * @deprecated
     */
    public function cleanUpOldQueueEntries(): void
    {
        // 24*60*60 Seconds in 24 hours
        $secondsPerDay = 86400;
        $processedAgeInSeconds = $this->extensionSettings['cleanUpProcessedAge'] * $secondsPerDay;
        $scheduledAgeInSeconds = $this->extensionSettings['cleanUpScheduledAge'] * $secondsPerDay;

        $now = time();
        $processedBefore = $now - $processedAgeInSeconds;
        $scheduledBefore = $now - $scheduledAgeInSeconds;

        $this->flushQueue(
            '(exec_time<>0 AND exec_time<' . $processedBefore . ') OR scheduled<=' . $scheduledBefore
        );
    }
1810
1811
    /**
     * Removes queue entries
     *
     * Before the entries are deleted, one SIGNAL_QUEUE_ENTRY_FLUSH signal is
     * emitted per distinct set_id, carrying the matching qid/set_id rows.
     *
     * @param string $where SQL related filter for the entries which should be removed;
     *                      an empty value removes all entries
     *
     * @deprecated
     */
    protected function flushQueue($where = ''): void
    {
        $condition = (string) $where !== '' ? $where : '1=1';

        $queryBuilder = $this->getQueryBuilder($this->tableName);

        $setIdRows = $queryBuilder
            ->selectLiteral('DISTINCT set_id')
            ->from($this->tableName)
            ->where($condition)
            ->execute()
            ->fetchAll();

        foreach (is_array($setIdRows) ? $setIdRows : [] as $setIdRow) {
            $entriesOfSet = $queryBuilder
                ->select('qid', 'set_id')
                ->from($this->tableName)
                ->where(
                    $condition,
                    $queryBuilder->expr()->eq('set_id', $setIdRow['set_id'])
                )
                ->execute()
                ->fetchAll();

            $signalArguments = ['subSet' => $entriesOfSet];
            SignalSlotUtility::emitSignal(
                self::class,
                SignalSlotUtility::SIGNAL_QUEUE_ENTRY_FLUSH,
                $signalArguments
            );
        }

        $queryBuilder
            ->delete($this->tableName)
            ->where($condition)
            ->execute();
    }
1856
1857
    /**
     * This method determines duplicates for a queue entry with the same parameters and this timestamp.
     * If the timestamp is in the past, it will check if there is any unprocessed queue entry in the past.
     * If the timestamp is in the future it will check, if the queued entry has exactly the same timestamp
     *
     * @param int $tstamp
     * @param array $fieldArray "page_id" and "parameters_hash" are read from it
     *
     * @return array List of qid values of the matching queue entries
     * @deprecated
     */
    protected function getDuplicateRowsIfExist($tstamp, $fieldArray)
    {
        $currentTime = $this->getCurrentTime();

        $queryBuilder = GeneralUtility::makeInstance(ConnectionPool::class)->getQueryBuilderForTable($this->tableName);
        $queryBuilder
            ->select('qid')
            ->from('tx_crawler_queue');

        if ($tstamp > $currentTime) {
            //entry with a timestamp in the future need to have the same schedule time
            $queryBuilder->where(
                $queryBuilder->expr()->eq('scheduled', $tstamp)
            );
        } else {
            //if this entry is scheduled with "now"
            $scheduledUntilNow = $queryBuilder->expr()->lte('scheduled', $currentTime);

            if ($this->extensionSettings['enableTimeslot']) {
                $timeBegin = $currentTime - 100;
                $timeEnd = $currentTime + 100;
                $queryBuilder
                    ->where('scheduled BETWEEN ' . $timeBegin . ' AND ' . $timeEnd)
                    ->orWhere($scheduledUntilNow);
            } else {
                $queryBuilder->where($scheduledUntilNow);
            }
        }

        $pageIdParameter = $queryBuilder->createNamedParameter($fieldArray['page_id'], PDO::PARAM_INT);
        $hashParameter = $queryBuilder->createNamedParameter($fieldArray['parameters_hash'], PDO::PARAM_STR);

        // Only entries that are neither executed nor assigned to a process are considered
        $queryBuilder
            ->andWhere('NOT exec_time')
            ->andWhere('NOT process_id')
            ->andWhere($queryBuilder->expr()->eq('page_id', $pageIdParameter))
            ->andWhere($queryBuilder->expr()->eq('parameters_hash', $hashParameter));

        $duplicateIds = [];
        foreach ($queryBuilder->execute()->fetchAll() as $row) {
            $duplicateIds[] = $row['qid'];
        }

        return $duplicateIds;
    }
1918
1919 10
    /**
     * Returns a md5 hash generated from a serialized configuration array.
     *
     * The keys "paramExpanded" and "URLs" are removed before hashing, so they
     * do not influence the hash.
     *
     * @return string
     */
    protected function getConfigurationHash(array $configuration)
    {
        unset($configuration['paramExpanded'], $configuration['URLs']);

        $serializedConfiguration = serialize($configuration);

        return md5($serializedConfiguration);
    }
1930
1931
    /**
     * Build a URL from a Page and the Query String. If the page has a Site configuration, it can be built by using
     * the Site instance.
     *
     * Thin wrapper that forwards all arguments to UrlService::getUrlFromPageAndQueryParameters().
     *
     * @param int $httpsOrHttp see tx_crawler_configuration.force_ssl
     * @throws SiteNotFoundException
     * @throws InvalidRouteArgumentsException
     *
     * @deprecated Using CrawlerController::getUrlFromPageAndQueryParameters() is deprecated since 9.1.1 and will be removed in v11.x, please use UrlService->getUrlFromPageAndQueryParameters() instead.
     * @codeCoverageIgnore
     */
    protected function getUrlFromPageAndQueryParameters(int $pageId, string $queryString, ?string $alternativeBaseUrl, int $httpsOrHttp): UriInterface
    {
        return (new UrlService())->getUrlFromPageAndQueryParameters(
            $pageId,
            $queryString,
            $alternativeBaseUrl,
            $httpsOrHttp
        );
    }
1947
1948
    /**
     * Returns the given array with the values at index 1 and 2 exchanged if the
     * value at index 1 is larger than the value at index 2; otherwise the array
     * is returned unchanged.
     */
    protected function swapIfFirstIsLargerThanSecond(array $reg): array
    {
        if ($reg[1] <= $reg[2]) {
            return $reg;
        }

        // Swap if first is larger than last:
        [$reg[1], $reg[2]] = [$reg[2], $reg[1]];

        return $reg;
    }
1959
1960
    /**
     * Returns the backend user, taking it from $GLOBALS['BE_USER'] on first use
     * and keeping it in $this->backendUser afterwards.
     *
     * @return BackendUserAuthentication
     */
    private function getBackendUser()
    {
        // Make sure the _cli_ user is loaded
        Bootstrap::initializeBackendAuthentication();

        if ($this->backendUser !== null) {
            return $this->backendUser;
        }

        $this->backendUser = $GLOBALS['BE_USER'];

        return $this->backendUser;
    }
1972
1973 12
    /**
     * Get querybuilder for given table
     *
     * @param string $table Table name handed to the connection pool
     * @return QueryBuilder
     */
    private function getQueryBuilder(string $table)
    {
        $connectionPool = GeneralUtility::makeInstance(ConnectionPool::class);

        return $connectionPool->getQueryBuilderForTable($table);
    }
1982
}
1983