Passed
Push — refactor/crawlerController ( acc48a...4b5059 )
by Tomas Norre
07:52 queued 03:40
created

CrawlerController::swapIfFirstIsLargerThanSecond()   A

Complexity

Conditions 2
Paths 2

Size

Total Lines 10
Code Lines 5

Duplication

Lines 0
Ratio 0 %

Code Coverage

Tests 3
CRAP Score 2.5

Importance

Changes 0
Metric Value
cc 2
eloc 5
nc 2
nop 1
dl 0
loc 10
ccs 3
cts 6
cp 0.5
crap 2.5
rs 10
c 0
b 0
f 0
1
<?php
2
3
declare(strict_types=1);
4
5
namespace AOE\Crawler\Controller;
6
7
/***************************************************************
8
 *  Copyright notice
9
 *
10
 *  (c) 2020 AOE GmbH <[email protected]>
11
 *
12
 *  All rights reserved
13
 *
14
 *  This script is part of the TYPO3 project. The TYPO3 project is
15
 *  free software; you can redistribute it and/or modify
16
 *  it under the terms of the GNU General Public License as published by
17
 *  the Free Software Foundation; either version 3 of the License, or
18
 *  (at your option) any later version.
19
 *
20
 *  The GNU General Public License can be found at
21
 *  http://www.gnu.org/copyleft/gpl.html.
22
 *
23
 *  This script is distributed in the hope that it will be useful,
24
 *  but WITHOUT ANY WARRANTY; without even the implied warranty of
25
 *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
26
 *  GNU General Public License for more details.
27
 *
28
 *  This copyright notice MUST APPEAR in all copies of the script!
29
 ***************************************************************/
30
31
use AOE\Crawler\Configuration\ExtensionConfigurationProvider;
32
use AOE\Crawler\Converter\JsonCompatibilityConverter;
33
use AOE\Crawler\Crawler;
34
use AOE\Crawler\CrawlStrategy\CrawlStrategyFactory;
35
use AOE\Crawler\Domain\Model\Process;
36
use AOE\Crawler\Domain\Repository\ConfigurationRepository;
37
use AOE\Crawler\Domain\Repository\ProcessRepository;
38
use AOE\Crawler\Domain\Repository\QueueRepository;
39
use AOE\Crawler\QueueExecutor;
40
use AOE\Crawler\Service\UrlService;
41
use AOE\Crawler\Utility\SignalSlotUtility;
42
use Psr\Http\Message\UriInterface;
43
use Psr\Log\LoggerAwareInterface;
44
use Psr\Log\LoggerAwareTrait;
45
use TYPO3\CMS\Backend\Tree\View\PageTreeView;
46
use TYPO3\CMS\Backend\Utility\BackendUtility;
47
use TYPO3\CMS\Core\Authentication\BackendUserAuthentication;
48
use TYPO3\CMS\Core\Compatibility\PublicMethodDeprecationTrait;
49
use TYPO3\CMS\Core\Compatibility\PublicPropertyDeprecationTrait;
50
use TYPO3\CMS\Core\Core\Bootstrap;
51
use TYPO3\CMS\Core\Core\Environment;
52
use TYPO3\CMS\Core\Database\Connection;
53
use TYPO3\CMS\Core\Database\ConnectionPool;
54
use TYPO3\CMS\Core\Database\Query\Restriction\DeletedRestriction;
55
use TYPO3\CMS\Core\Imaging\Icon;
56
use TYPO3\CMS\Core\Imaging\IconFactory;
57
use TYPO3\CMS\Core\Site\Entity\Site;
58
use TYPO3\CMS\Core\Type\Bitmask\Permission;
59
use TYPO3\CMS\Core\TypoScript\Parser\TypoScriptParser;
60
use TYPO3\CMS\Core\Utility\DebugUtility;
61
use TYPO3\CMS\Core\Utility\GeneralUtility;
62
use TYPO3\CMS\Core\Utility\MathUtility;
63
use TYPO3\CMS\Extbase\Object\ObjectManager;
64
use TYPO3\CMS\Frontend\Page\PageRepository;
65
66
/**
67
 * Class CrawlerController
68
 *
69
 * @package AOE\Crawler\Controller
70
 */
71
class CrawlerController implements LoggerAwareInterface
72
{
73
    use LoggerAwareTrait;
74
    use PublicMethodDeprecationTrait;
75
    use PublicPropertyDeprecationTrait;
76
77
    public const CLI_STATUS_NOTHING_PROCCESSED = 0;
78
79
    //queue not empty
80
    public const CLI_STATUS_REMAIN = 1;
81
82
    //(some) queue items were processed
83
    public const CLI_STATUS_PROCESSED = 2;
84
85
    //instance didn't finish
86
    public const CLI_STATUS_ABORTED = 4;
87
88
    public const CLI_STATUS_POLLABLE_PROCESSED = 8;
89
90
    /**
91
     * @var integer
92
     */
93
    public $setID = 0;
94
95
    /**
96
     * @var string
97
     */
98
    public $processID = '';
99
100
    /**
101
     * @var array
102
     */
103
    public $duplicateTrack = [];
104
105
    /**
106
     * @var array
107
     */
108
    public $downloadUrls = [];
109
110
    /**
111
     * @var array
112
     */
113
    public $incomingProcInstructions = [];
114
115
    /**
116
     * @var array
117
     */
118
    public $incomingConfigurationSelection = [];
119
120
    /**
121
     * @var bool
122
     */
123
    public $registerQueueEntriesInternallyOnly = false;
124
125
    /**
126
     * @var array
127
     */
128
    public $queueEntries = [];
129
130
    /**
131
     * @var array
132
     */
133
    public $urlList = [];
134
135
    /**
136
     * @var array
137
     */
138
    public $extensionSettings = [];
139
140
    /**
141
     * Mount Point
142
     *
143
     * @var bool
144
     * Todo: Check what this is used for and adjust the type hint or code, as bool doesn't match the current code.
145
     */
146
    public $MP = false;
147
148
    /**
149
     * @var string
150
     * @deprecated
151
     */
152
    protected $processFilename;
153
154
    /**
155
     * Holds the internal access mode, which can be 'gui', 'cli' or 'cli_im'
156
     *
157
     * @var string
158
     * @deprecated
159
     */
160
    protected $accessMode;
161
162
    /**
163
     * @var QueueRepository
164
     */
165
    protected $queueRepository;
166
167
    /**
168
     * @var ProcessRepository
169
     */
170
    protected $processRepository;
171
172
    /**
173
     * @var ConfigurationRepository
174
     */
175
    protected $configurationRepository;
176
177
    /**
178
     * @var string
179
     */
180
    protected $tableName = 'tx_crawler_queue';
181
182
    /**
183
     * @var QueueExecutor
184
     */
185
    protected $queueExecutor;
186
187
    /**
188
     * @var int
189
     */
190
    protected $maximumUrlsToCompile = 10000;
191
192
    /**
193
     * @var IconFactory
194
     */
195
    protected $iconFactory;
196
197
    /**
198
     * @var string[]
199
     */
200
    private $deprecatedPublicMethods = [
0 ignored issues
show
introduced by
The private property $deprecatedPublicMethods is not used, and could be removed.
Loading history...
201
        'cleanUpOldQueueEntries' => 'Using CrawlerController::cleanUpOldQueueEntries() is deprecated since 9.0.1 and will be removed in v11.x, please use QueueRepository->cleanUpOldQueueEntries() instead.',
202
        'CLI_debug' => 'Using CrawlerController->CLI_debug() is deprecated since 9.1.3 and will be removed in v11.x',
203
        'getAccessMode' => 'Using CrawlerController->getAccessMode() is deprecated since 9.1.3 and will be removed in v11.x',
204
        'getLogEntriesForSetId' => 'Using crawlerController::getLogEntriesForSetId() is deprecated since 9.0.1 and will be removed in v11.x',
205
        'flushQueue' => 'Using CrawlerController::flushQueue() is deprecated since 9.0.1 and will be removed in v11.x, please use QueueRepository->flushQueue() instead.',
206
        'setAccessMode' => 'Using CrawlerController->setAccessMode() is deprecated since 9.1.3 and will be removed in v11.x',
207
        'getDisabled' => 'Using CrawlerController->getDisabled() is deprecated since 9.1.3 and will be removed in v11.x, please use Crawler->isDisabled() instead',
208
        'setDisabled' => 'Using CrawlerController->setDisabled() is deprecated since 9.1.3 and will be removed in v11.x, please use Crawler->setDisabled() instead',
209
        'getProcessFilename' => 'Using CrawlerController->getProcessFilename() is deprecated since 9.1.3 and will be removed in v11.x',
210
        'setProcessFilename' => 'Using CrawlerController->setProcessFilename() is deprecated since 9.1.3 and will be removed in v11.x',
211
212
    ];
213
214
    /**
     * Deprecation messages for formerly public properties, keyed by property
     * name (presumably consumed by PublicPropertyDeprecationTrait - confirm).
     *
     * @var string[]
     */
    private $deprecatedPublicProperties = [
        'accessMode' => 'Using CrawlerController->accessMode is deprecated since 9.1.3 and will be removed in v11.x',
        'processFilename' => 'Using CrawlerController->processFilename is deprecated since 9.1.3 and will be removed in v11.x',
    ];
221
222
    /**
223
     * @var BackendUserAuthentication|null
224
     */
225
    private $backendUser;
226
227
    /**
228
     * @var integer
229
     */
230
    private $scheduledTime = 0;
231
232
    /**
233
     * @var integer
234
     */
235
    private $reqMinute = 0;
236
237
    /**
238
     * @var bool
239
     */
240
    private $submitCrawlUrls = false;
241
242
    /**
243
     * @var bool
244
     */
245
    private $downloadCrawlUrls = false;
246
247
    /**
248
     * @var PageRepository
249
     */
250
    private $pageRepository;
251
252
    /**
253
     * @var Crawler
254
     */
255
    private $crawler;
256
257
    /************************************
258
     *
259
     * Getting URLs based on Page TSconfig
260
     *
261
     ************************************/
262
263 36
    /**
     * Instantiates the repositories and helper services used by the controller,
     * loads the extension configuration and applies defaults to the settings
     * "countInARun", "processLimit" and "maxCompileUrls".
     */
    public function __construct()
    {
        $objectManager = GeneralUtility::makeInstance(ObjectManager::class);
        $crawlStrategyFactory = GeneralUtility::makeInstance(CrawlStrategyFactory::class);
        $this->queueRepository = $objectManager->get(QueueRepository::class);
        $this->processRepository = $objectManager->get(ProcessRepository::class);
        $this->configurationRepository = $objectManager->get(ConfigurationRepository::class);
        $this->pageRepository = $objectManager->get(PageRepository::class);
        $this->queueExecutor = GeneralUtility::makeInstance(QueueExecutor::class, $crawlStrategyFactory);
        $this->iconFactory = GeneralUtility::makeInstance(IconFactory::class);
        $this->crawler = GeneralUtility::makeInstance(Crawler::class);

        $this->processFilename = Environment::getVarPath() . '/lock/tx_crawler.proc';

        /** @var ExtensionConfigurationProvider $configurationProvider */
        $configurationProvider = GeneralUtility::makeInstance(ExtensionConfigurationProvider::class);
        $settings = $configurationProvider->getExtensionConfiguration();
        $this->extensionSettings = is_array($settings) ? $settings : [];

        // set defaults; the "?? 0" fallbacks avoid undefined-index notices when a
        // setting is missing from the extension configuration
        if (MathUtility::convertToPositiveInteger($this->extensionSettings['countInARun'] ?? 0) === 0) {
            $this->extensionSettings['countInARun'] = 100;
        }

        $this->extensionSettings['processLimit'] = MathUtility::forceIntegerInRange(
            $this->extensionSettings['processLimit'] ?? 0,
            1,
            99,
            1
        );
        $this->maximumUrlsToCompile = MathUtility::forceIntegerInRange(
            $this->extensionSettings['maxCompileUrls'] ?? 0,
            1,
            1000000000,
            10000
        );
    }
290
291
    /**
     * Returns the current access mode, which can be gui, cli or cli_im.
     *
     * @return string
     * @deprecated
     */
    public function getAccessMode()
    {
        return $this->accessMode;
    }
301
302
    /**
     * Stores the given access mode on the controller.
     *
     * @param string $accessMode
     * @deprecated
     */
    public function setAccessMode($accessMode): void
    {
        $this->accessMode = $accessMode;
    }
310
311
    /**
     * Set disabled status to prevent processes from being processed.
     *
     * Disabling writes an empty file at the process filename; enabling removes
     * that file again if it exists.
     *
     * @param bool $disabled (optional, defaults to true)
     * @deprecated
     */
    public function setDisabled($disabled = true): void
    {
        if ($disabled) {
            GeneralUtility::writeFile($this->processFilename, '');
            return;
        }

        if (is_file($this->processFilename)) {
            unlink($this->processFilename);
        }
    }
327
328
    /**
     * Get disable status; the crawler counts as disabled while the process
     * file exists.
     *
     * @return bool true if disabled
     * @deprecated
     */
    public function getDisabled()
    {
        return is_file($this->processFilename);
    }
338
339
    /**
     * Overrides the path of the file used as the "disabled" marker.
     *
     * @param string $filenameWithPath
     * @deprecated
     */
    public function setProcessFilename($filenameWithPath): void
    {
        $this->processFilename = $filenameWithPath;
    }
347
348
    /**
     * Returns the path of the file used as the "disabled" marker.
     *
     * @return string
     * @deprecated
     */
    public function getProcessFilename()
    {
        return $this->processFilename;
    }
356
357
    /**
     * Replaces the extension settings (unserialized pendant of
     * $TYPO3_CONF_VARS['EXT']['extConf']['crawler']) with the given array.
     */
    public function setExtensionSettings(array $extensionSettings): void
    {
        $this->extensionSettings = $extensionSettings;
    }
364
365
    /**
     * Check if the given page should be crawled.
     *
     * The checks run in this order and the first match wins: hidden page
     * (unless "crawlHiddenPages" is enabled), built-in doktype blacklist,
     * doktypes excluded via EXTCONF "excludeDoktype", and finally the
     * "pageVeto" hooks.
     *
     * @param array $pageRow Page record; "doktype" is required, "hidden" is optional
     * @return false|string false if the page should be crawled (not excluded), otherwise the skip message
     */
    public function checkIfPageShouldBeSkipped(array $pageRow)
    {
        // if page is hidden; empty() keeps missing keys from raising notices
        if (empty($this->extensionSettings['crawlHiddenPages']) && ! empty($pageRow['hidden'])) {
            return 'Because page is hidden';
        }

        $doktype = $pageRow['doktype'];
        if (GeneralUtility::inList('3,4,199,254,255', $doktype)) {
            return 'Because doktype is not allowed';
        }

        foreach ($GLOBALS['TYPO3_CONF_VARS']['EXTCONF']['crawler']['excludeDoktype'] ?? [] as $key => $doktypeList) {
            if (GeneralUtility::inList($doktypeList, $doktype)) {
                return 'Doktype was excluded by "' . $key . '"';
            }
        }

        // veto hook
        foreach ($GLOBALS['TYPO3_CONF_VARS']['EXTCONF']['crawler']['pageVeto'] ?? [] as $key => $func) {
            $params = [
                'pageRow' => $pageRow,
            ];
            // expects "false" if page is ok and "true" or a skipMessage if this page should _not_ be crawled
            $veto = GeneralUtility::callUserFunction($func, $params, $this);
            if ($veto === false) {
                continue;
            }

            // no need to execute other hooks if a previous one returned a veto
            if (is_string($veto)) {
                return $veto;
            }

            // hooks may be registered with numeric keys; under strict_types
            // htmlspecialchars() rejects an int, hence the explicit cast
            return 'Veto from hook "' . htmlspecialchars((string) $key) . '"';
        }

        return false;
    }
424
425
    /**
     * Wrapper method for getUrlsForPageId()
     * It returns an array of configurations and no urls!
     *
     * @param array $pageRow Page record with at least dok-type and uid columns.
     * @param string $skipMessage Set to the skip reason when the page is skipped, otherwise to an empty string
     * @return array Configurations for the page, or an empty array when the page is skipped
     * @see getUrlsForPageId()
     */
    public function getUrlsForPageRow(array $pageRow, &$skipMessage = '')
    {
        $message = $this->checkIfPageShouldBeSkipped($pageRow);
        if ($message !== false) {
            $skipMessage = $message;

            return [];
        }

        // getUrlsForPageId() is declared with an int parameter; a uid that arrives
        // as a string would raise a TypeError under strict_types, so cast it first
        $res = $this->getUrlsForPageId((int) $pageRow['uid']);
        $skipMessage = '';

        return $res;
    }
447
448
    /**
     * Creates a list of URLs from input array (and submits them to queue if asked for)
     * See Web > Info module script + "indexed_search"'s crawler hook-client using this!
     *
     * @param array $vv Information about URLs from pageRow to crawl.
     * @param array $pageRow Page row
     * @param int $scheduledTime Unix time to schedule indexing to, typically time()
     * @param int $reqMinute Number of requests per minute (creates the interleave between requests)
     * @param bool $submitCrawlUrls If set, submits the URLs to queue
     * @param bool $downloadCrawlUrls If set (and $submitCrawlUrls is false) will fill $downloadUrls with entries
     * @param array $duplicateTrack Array which is passed by reference and contains an id per url to secure we will not crawl duplicates
     * @param array $downloadUrls Array which will be filled with URLs for download if flag is set.
     * @param array $incomingProcInstructions Array of processing instructions
     * @return string List of URLs (meant for display in backend module)
     */
    public function urlListFromUrlArray(
        array $vv,
        array $pageRow,
        $scheduledTime,
        $reqMinute,
        $submitCrawlUrls,
        $downloadCrawlUrls,
        array &$duplicateTrack,
        array &$downloadUrls,
        array $incomingProcInstructions
    ) {
        if (! is_array($vv['URLs'] ?? null)) {
            return 'ERROR - no URL generated';
        }
        $urlLog = [];
        $pageId = (int) $pageRow['uid'];
        $configurationHash = $this->getConfigurationHash($vv);
        $skipInnerCheck = $this->queueRepository->noUnprocessedQueueEntriesForPageWithConfigurationHashExist($pageId, $configurationHash);

        // The processing instruction filter is the same for every URL of this
        // configuration, so it is evaluated once instead of once per URL.
        $procInstrFilter = $vv['subCfg']['procInstrFilter'] ?? '';
        if (! $this->drawURLs_PIfilter($procInstrFilter, $incomingProcInstructions)) {
            return '';
        }
        $userGroups = $vv['subCfg']['userGroups'] ?? '';

        // Seconds between two scheduled requests; a rate of zero (or less) would
        // otherwise end in a division by zero, so no interleave is applied then.
        $secondsPerRequest = $reqMinute > 0 ? 60 / $reqMinute : 0;

        $urlService = new UrlService();

        foreach ($vv['URLs'] as $urlQuery) {
            $url = (string) $urlService->getUrlFromPageAndQueryParameters(
                $pageId,
                $urlQuery,
                $vv['subCfg']['baseUrl'] ?? null,
                $vv['subCfg']['force_ssl'] ?? 0
            );

            // Create key by which to determine unique-ness:
            $uKey = $url . '|' . $userGroups . '|' . $procInstrFilter;

            if (isset($duplicateTrack[$uKey])) {
                // if the url key is registered just display it and do not resubmit it
                $urlLog[] = '<em><span class="text-muted">' . htmlspecialchars($url) . '</span></em>';
            } else {
                // Scheduled time, rounded down to the full minute:
                $schTime = $scheduledTime + round(count($duplicateTrack) * $secondsPerRequest);
                $schTime = intval($schTime / 60) * 60;
                $formattedDate = BackendUtility::datetime($schTime);
                $this->urlList[] = '[' . $formattedDate . '] ' . $url;
                $logLine = '[' . $formattedDate . '] ' . htmlspecialchars($url);

                // Submit for crawling!
                // NOTE(review): the queue entry is added with $scheduledTime, not the
                // interleaved $schTime shown in the log line - confirm this is intended.
                if ($submitCrawlUrls) {
                    $added = $this->addUrl(
                        $pageId,
                        $url,
                        $vv['subCfg'],
                        $scheduledTime,
                        $configurationHash,
                        $skipInnerCheck
                    );
                    if ($added === false) {
                        $logLine .= ' (URL already existed)';
                    }
                } elseif ($downloadCrawlUrls) {
                    $downloadUrls[$url] = $url;
                }
                $urlLog[] = $logLine;
            }
            $duplicateTrack[$uKey] = true;
        }

        return implode('<br>', $urlLog);
    }
532
533
    /**
     * Tells whether at least one of the incoming processing instructions is
     * contained in the given comma list. An empty set of incoming instructions
     * always passes.
     *
     * @param string $piString PI to test
     * @param array $incomingProcInstructions Processing instructions
     * @return boolean
     */
    public function drawURLs_PIfilter($piString, array $incomingProcInstructions)
    {
        if ($incomingProcInstructions === []) {
            return true;
        }

        $isRegistered = false;
        foreach ($incomingProcInstructions as $instruction) {
            if (GeneralUtility::inList($piString, $instruction)) {
                $isRegistered = true;
                break;
            }
        }

        return $isRegistered;
    }
553
554 5
    /**
     * Returns the page TSconfig for the given page id - or, when $this->MP is
     * set, for the page id taken from the MP value - after handing it to the
     * registered "getPageTSconfigForId" hooks, which may alter it by reference.
     *
     * @param int $id Page id
     */
    public function getPageTSconfigForId($id): array
    {
        if (! $this->MP) {
            $pageTSconfig = BackendUtility::getPagesTSconfig($id);
        } else {
            // $this->MP is split on "-" and the second segment is used as page id.
            // TODO: the declared type of $this->MP (bool) does not match this usage.
            // The casts avoid a TypeError under strict_types for non-string values
            // and an undefined offset when the value contains no "-".
            $mountPointParts = explode('-', (string) $this->MP);
            $mountPointId = (int) ($mountPointParts[1] ?? 0);
            $pageTSconfig = BackendUtility::getPagesTSconfig($mountPointId);
        }

        // Call a hook to alter configuration
        $hooks = $GLOBALS['TYPO3_CONF_VARS']['EXTCONF']['crawler']['getPageTSconfigForId'] ?? null;
        if (is_array($hooks)) {
            $params = [
                'pageId' => $id,
                'pageTSConfig' => &$pageTSconfig,
            ];
            foreach ($hooks as $userFunc) {
                GeneralUtility::callUserFunction($userFunc, $params, $this);
            }
        }

        return $pageTSconfig;
    }
576
577
    /**
     * This method returns an array of configurations.
     * Adds no urls!
     *
     * Configurations are read from page TSconfig
     * (tx_crawler.crawlerCfg.paramSets) first and then from the configuration
     * records the configuration repository returns for the page's rootline.
     * A record never overwrites a TSconfig entry with the same key. Finally the
     * "processUrls" hooks may alter the result by reference.
     *
     * @return array Configurations keyed by name; each entry holds "subCfg", "paramParsed", "paramExpanded", "URLs" and "origin"
     */
    public function getUrlsForPageId(int $pageId): array
    {
        $res = [];
        $pageIdAsString = (string) $pageId;

        // Get page TSconfig for page ID and fetch Crawler Configuration from it
        $pageTSconfig = $this->getPageTSconfigForId($pageId);
        $crawlerCfg = $pageTSconfig['tx_crawler.']['crawlerCfg.']['paramSets.'] ?? [];
        foreach ($crawlerCfg as $key => $values) {
            if (! is_array($values)) {
                continue;
            }
            $key = str_replace('.', '', (string) $key);
            // Sub configuration for a single configuration string:
            $subCfg = (array) ($crawlerCfg[$key . '.'] ?? []);
            $subCfg['key'] = $key;

            if (($subCfg['procInstrFilter'] ?? '') !== '') {
                $subCfg['procInstrFilter'] = implode(',', GeneralUtility::trimExplode(',', $subCfg['procInstrFilter']));
            }

            // "pidsOnly" is optional, a missing key means "not page-specific"
            // TODO: Check if $pidOnlyList can be kept as Array instead of imploded
            $pidsOnly = (string) ($subCfg['pidsOnly'] ?? '');
            $pidOnlyList = implode(',', GeneralUtility::trimExplode(',', $pidsOnly, true));

            // process configuration only if it is not page-specific or if the specific page is the current page
            if ($pidsOnly !== '' && ! GeneralUtility::inList($pidOnlyList, $pageIdAsString)) {
                continue;
            }

            // Explode, process etc.:
            $paramParsed = GeneralUtility::explodeUrl2Array($crawlerCfg[$key] ?? '');
            $paramExpanded = $this->expandParameters($paramParsed, $pageId);

            // recognize MP value
            $baseQuery = '?id=' . $pageId;
            if ($this->MP) {
                $baseQuery .= '&MP=' . $this->MP;
            }

            $res[$key] = [
                'subCfg' => $subCfg,
                'paramParsed' => $paramParsed,
                'paramExpanded' => $paramExpanded,
                'origin' => 'pagets',
                'URLs' => $this->compileUrls($paramExpanded, [$baseQuery]),
            ];
        }

        // Get configuration from tx_crawler_configuration records up the rootline
        $crawlerConfigurations = $this->configurationRepository->getCrawlerConfigurationRecordsFromRootLine($pageId);
        foreach ($crawlerConfigurations as $configurationRecord) {
            // check access to the configuration record
            $hasAccess = empty($configurationRecord['begroups'])
                || $this->getBackendUser()->isAdmin()
                || $this->hasGroupAccess($this->getBackendUser()->user['usergroup_cached_list'], $configurationRecord['begroups']);
            if (! $hasAccess) {
                continue;
            }

            // cast first: a non-string value would break the string handling under strict_types
            // TODO: Check if $pidOnlyList can be kept as Array instead of imploded
            $pidsOnly = (string) $configurationRecord['pidsonly'];
            $pidOnlyList = implode(',', GeneralUtility::trimExplode(',', $pidsOnly, true));

            // process configuration only if it is not page-specific or if the specific page is the current page
            if ($pidsOnly !== '' && ! GeneralUtility::inList($pidOnlyList, $pageIdAsString)) {
                continue;
            }

            // don't overwrite previously defined paramSets
            $key = $configurationRecord['name'];
            if (isset($res[$key])) {
                continue;
            }

            /* @var $TSparserObject \TYPO3\CMS\Core\TypoScript\Parser\TypoScriptParser */
            $TSparserObject = GeneralUtility::makeInstance(TypoScriptParser::class);
            $TSparserObject->parse($configurationRecord['processing_instruction_parameters_ts']);

            $subCfg = [
                'procInstrFilter' => $configurationRecord['processing_instruction_filter'],
                'procInstrParams.' => $TSparserObject->setup,
                'baseUrl' => $configurationRecord['base_url'],
                'force_ssl' => (int) $configurationRecord['force_ssl'],
                'userGroups' => $configurationRecord['fegroups'],
                'exclude' => $configurationRecord['exclude'],
                'key' => $key,
            ];

            // the record may explicitly exclude the current page
            if (in_array($pageId, $this->expandExcludeString($subCfg['exclude']), true)) {
                continue;
            }

            $paramParsed = GeneralUtility::explodeUrl2Array($configurationRecord['configuration']);
            $paramExpanded = $this->expandParameters($paramParsed, $pageId);
            $res[$key] = [
                'subCfg' => $subCfg,
                'paramParsed' => $paramParsed,
                'paramExpanded' => $paramExpanded,
                'URLs' => $this->compileUrls($paramExpanded, ['?id=' . $pageId]),
                'origin' => 'tx_crawler_configuration_' . $configurationRecord['uid'],
            ];
        }

        foreach ($GLOBALS['TYPO3_CONF_VARS']['EXTCONF']['crawler']['processUrls'] ?? [] as $func) {
            $params = [
                'res' => &$res,
            ];
            GeneralUtility::callUserFunction($func, $params, $this);
        }

        return $res;
    }
675
676
    /**
     * Collects the names of all crawler configurations found for a page branch.
     *
     * Names are gathered from two sources:
     * - the array-valued entries of "tx_crawler.crawlerCfg.paramSets." in the page TSconfig of $rootid
     * - tx_crawler_configuration records whose pid is a page of the rootline of $rootid or
     *   a page of the page tree read below $rootid
     *
     * The result is not de-duplicated.
     *
     * TODO: Write Functional Tests
     *
     * @param int $rootid Uid of the page the branch starts at
     * @param int $depth Depth handed to PageTreeView::getTree() when reading the pages below $rootid
     * @return array List of configuration names
     */
    public function getConfigurationsForBranch(int $rootid, int $depth): array
    {
        $names = [];

        // Only array values are parameter sets; their TypoScript key carries a trailing dot.
        $tsConfig = $this->getPageTSconfigForId($rootid);
        foreach ($tsConfig['tx_crawler.']['crawlerCfg.']['paramSets.'] ?? [] as $setKey => $setValue) {
            if (is_array($setValue)) {
                $names[] = substr($setKey, -1) === '.' ? substr($setKey, 0, -1) : $setKey;
            }
        }

        // Page ids of the rootline ...
        $pageIds = [];
        foreach (BackendUtility::BEgetRootLine($rootid) as $rootLinePage) {
            $pageIds[] = $rootLinePage['uid'];
        }

        // ... plus the page ids of the tree below the root page, limited by the
        // page permissions of the current backend user.
        /* @var PageTreeView $pageTree */
        $pageTree = GeneralUtility::makeInstance(PageTreeView::class);
        $permsClause = $this->getBackendUser()->getPagePermsClause(Permission::PAGE_SHOW);
        $pageTree->init(empty($permsClause) ? '' : ('AND ' . $permsClause));
        $pageTree->getTree($rootid, $depth, '');
        foreach ($pageTree->tree as $treeNode) {
            $pageIds[] = $treeNode['row']['uid'];
        }

        $queryBuilder = $this->getQueryBuilder('tx_crawler_configuration');
        $pidConstraint = $queryBuilder->expr()->in(
            'pid',
            $queryBuilder->createNamedParameter($pageIds, Connection::PARAM_INT_ARRAY)
        );
        $result = $queryBuilder
            ->select('name')
            ->from('tx_crawler_configuration')
            ->where($pidConstraint)
            ->execute();

        while ($record = $result->fetch()) {
            $names[] = $record['name'];
        }

        return $names;
    }
719
720
    /**
     * Check if a user has access to an item
     * (e.g. get the group list of the current logged in user from $GLOBALS['TSFE']->gr_list)
     *
     * @param string $groupList Comma-separated list of (fe_)group UIDs from a user
     * @param string $accessList Comma-separated list of (fe_)group UIDs of the item to access
     * @return bool TRUE if at least one of the users group UIDs is in the access list or the access list is empty
     * @see \TYPO3\CMS\Frontend\Page\PageRepository::getMultipleGroupsWhereClause()
     */
    public function hasGroupAccess($groupList, $accessList)
    {
        // An item without access restriction is accessible for everybody.
        if (empty($accessList)) {
            return true;
        }

        foreach (GeneralUtility::intExplode(',', $groupList) as $groupUid) {
            // intExplode() yields integers while inList() works on strings; cast explicitly
            // (as done for the page id check elsewhere in this class) so the call does not
            // depend on implicit type juggling while strict_types is enabled.
            if (GeneralUtility::inList($accessList, (string) $groupUid)) {
                return true;
            }
        }

        return false;
    }
741
742
    /**
     * Will expand the parameters configuration to individual values. This follows a certain syntax of the value of each parameter.
     * Syntax of values:
     * - Basically: If the value is wrapped in [...] it will be expanded according to the following syntax, otherwise the value is taken literally
     * - Configuration is split by "|" and the parts are processed individually and finally added together
     * - For each configuration part:
     *         - "[int]-[int]" = Integer range, will be expanded to all values in between, values included, starting from low to high (max. 1000). Example "1-34" or "-40--30"
     *         - "_TABLE:[TCA table name];[_PID:[optional page id, default is current page]];[_ENABLELANG:1]" = Look up of table records from PID, filtering out deleted records. Example "_TABLE:tt_content; _PID:123"
     *           Further optional sub parameters read by this method:
     *             _FIELD (field whose values are collected, default "uid"; must be "uid" or a TCA column of the table),
     *             _PIDFIELD (field compared against the page id(s), default "pid"),
     *             _RECURSIVE (depth handed to QueryGenerator::getTreeList() to collect the page ids to look at),
     *             _WHERE (additional condition, handed to the query as is),
     *             _ADDTABLE (additional "from" part, handed to the query as is),
     *             _ENABLELANG:1 picks only original records without their language overlays
     *         - Default: Literal value
     * - After each part the hook "expandParameters" registered in
     *   $GLOBALS['TYPO3_CONF_VARS']['SC_OPTIONS']['crawler/class.tx_crawler_lib.php'] is called and may alter the result.
     *
     * @param array $paramArray Array with key (GET var name) and values (value of GET var which is configuration for expansion)
     * @param integer $pid Current page ID
     * @return array Array with key (GET var name) and, for each key, the array of expanded values
     *
     * TODO: Write Functional Tests
     */
    public function expandParameters($paramArray, $pid)
    {
        // Traverse parameter names:
        foreach ($paramArray as $p => $v) {
            $v = trim($v);

            // Only a value encapsulated in square brackets describes ranges of values;
            // everything else is literal and becomes the only value in the array.
            if (strpos($v, '[') !== 0 || substr($v, -1) !== ']') {
                $paramArray[$p] = [$v];
                continue;
            }

            // Find the value inside the brackets and reset the paramArray value as an array.
            // The cast guards against substr() returning FALSE for "[]" on PHP 7.
            $v = (string) substr($v, 1, -1);
            $paramArray[$p] = [];

            // Explode parts and traverse them:
            foreach (explode('|', $v) as $pV) {
                // Look for integer range: (fx. 1-34 or -40--30 // reads minus 40 to minus 30)
                if (preg_match('/^(-?[0-9]+)\s*-\s*(-?[0-9]+)$/', trim($pV), $reg)) {
                    $reg = $this->swapIfFirstIsLargerThanSecond($reg);

                    // Traverse range, add values. The brake limits the size of a single range.
                    $runAwayBrake = 1000;
                    for ($a = $reg[1]; $a <= $reg[2]; $a++) {
                        $paramArray[$p][] = $a;
                        $runAwayBrake--;
                        if ($runAwayBrake <= 0) {
                            break;
                        }
                    }
                } elseif (strpos(trim($pV), '_TABLE:') === 0) {
                    // Parse the ";"-separated "key:value" sub parameters:
                    $subpartParams = [];
                    foreach (GeneralUtility::trimExplode(';', $pV) as $spV) {
                        // Pad to two elements so a sub part without ":" yields NULL as value
                        // instead of triggering an undefined offset.
                        [$pKey, $pVal] = array_pad(GeneralUtility::trimExplode(':', $spV), 2, null);
                        $subpartParams[$pKey] = $pVal;
                    }

                    $table = $subpartParams['_TABLE'] ?? '';
                    $fieldName = ! empty($subpartParams['_FIELD']) ? $subpartParams['_FIELD'] : 'uid';

                    // The table must exist in TCA and the field must be "uid" or a configured column:
                    if (
                        isset($GLOBALS['TCA'][$table])
                        && ($fieldName === 'uid' || ! empty($GLOBALS['TCA'][$table]['columns'][$fieldName]))
                    ) {
                        $lookUpPid = isset($subpartParams['_PID']) ? intval($subpartParams['_PID']) : intval($pid);
                        $recursiveDepth = isset($subpartParams['_RECURSIVE']) ? intval($subpartParams['_RECURSIVE']) : 0;
                        $pidField = isset($subpartParams['_PIDFIELD']) ? trim($subpartParams['_PIDFIELD']) : 'pid';
                        // NOTE(review): _WHERE and _ADDTABLE are handed to the query unescaped;
                        // they are presumably only editable by trusted backend users - confirm.
                        $where = $subpartParams['_WHERE'] ?? '';
                        $addTable = $subpartParams['_ADDTABLE'] ?? '';

                        $queryBuilder = $this->getQueryBuilder($table);

                        if ($recursiveDepth > 0) {
                            /** @var \TYPO3\CMS\Core\Database\QueryGenerator $queryGenerator */
                            $queryGenerator = GeneralUtility::makeInstance(\TYPO3\CMS\Core\Database\QueryGenerator::class);
                            $pidList = $queryGenerator->getTreeList($lookUpPid, $recursiveDepth, 0, 1);
                            $pidArray = GeneralUtility::intExplode(',', $pidList);
                        } else {
                            $pidArray = [(string) $lookUpPid];
                        }

                        // Only deleted records are filtered out, all other restrictions are dropped.
                        $queryBuilder->getRestrictions()
                            ->removeAll()
                            ->add(GeneralUtility::makeInstance(DeletedRestriction::class));

                        $queryBuilder
                            ->select($fieldName)
                            ->from($table)
                            ->where(
                                $queryBuilder->expr()->in($pidField, $queryBuilder->createNamedParameter($pidArray, Connection::PARAM_INT_ARRAY)),
                                $where
                            );

                        if (! empty($addTable)) {
                            // TODO: Check if this works as intended!
                            $queryBuilder->add('from', $addTable);
                        }

                        // Restrict to original language records if requested and the table is translatable.
                        $transOrigPointerField = $GLOBALS['TCA'][$table]['ctrl']['transOrigPointerField'] ?? '';
                        if (! empty($subpartParams['_ENABLELANG']) && $transOrigPointerField) {
                            $queryBuilder->andWhere(
                                $queryBuilder->expr()->lte(
                                    $transOrigPointerField,
                                    0
                                )
                            );
                        }

                        $statement = $queryBuilder->execute();

                        // Use the field value as array key so each value is collected only once.
                        $rows = [];
                        while ($row = $statement->fetch()) {
                            $rows[$row[$fieldName]] = $row;
                        }

                        $paramArray[$p] = array_merge($paramArray[$p], array_keys($rows));
                    }
                } else {
                    // Just add value:
                    $paramArray[$p][] = $pV;
                }

                // Hook for processing own expandParameters place holder
                $expandParameterHooks = $GLOBALS['TYPO3_CONF_VARS']['SC_OPTIONS']['crawler/class.tx_crawler_lib.php']['expandParameters'] ?? null;
                if (is_array($expandParameterHooks)) {
                    $_params = [
                        'pObj' => $this,
                        // By reference: hooks alter the expanded values in place.
                        'paramArray' => &$paramArray,
                        'currentKey' => $p,
                        'currentValue' => $pV,
                        'pid' => $pid,
                    ];
                    foreach ($expandParameterHooks as $_funcRef) {
                        GeneralUtility::callUserFunction($_funcRef, $_params, $this);
                    }
                }
            }

            // Make unique set of values and sort the parameters by their name:
            $paramArray[$p] = array_unique($paramArray[$p]);
            ksort($paramArray);
        }

        return $paramArray;
    }
889
890
    /**
     * Compiling URLs from parameter array (output of expandParameters())
     * The number of URLs will be the multiplication of the number of parameter values for each key,
     * unless the list grows beyond $this->maximumUrlsToCompile: compiling the current parameter
     * stops as soon as the list holds more entries than that limit.
     *
     * Every URL is extended by "&name=value" (both raw URL encoded) for each value; an empty
     * value leaves the URL unchanged.
     *
     * @param array $paramArray Output of expandParameters(): Array with keys (GET var names) and for each an array of values
     * @param array $urls URLs accumulated in this array (for recursion)
     * @return array
     */
    public function compileUrls($paramArray, array $urls)
    {
        if (empty($paramArray)) {
            return $urls;
        }

        // Take the first parameter off the stack. unset() is used instead of array_shift()
        // because array_shift() renumbers integer keys, which would change numeric GET var
        // names of the remaining parameters.
        reset($paramArray);
        $varName = key($paramArray);
        $valueSet = $paramArray[$varName];
        unset($paramArray[$varName]);

        // key() returns an integer for numeric names; rawurlencode() requires a string.
        $encodedVarName = rawurlencode((string) $varName);

        // Traverse value set:
        $newUrls = [];
        foreach ($urls as $url) {
            foreach ($valueSet as $val) {
                $val = (string) $val;
                $newUrls[] = $url . ($val !== '' ? '&' . $encodedVarName . '=' . rawurlencode($val) : '');

                // Leave both loops: leaving only the inner one would still add one URL
                // for every remaining base URL and thereby bypass the limit.
                if (count($newUrls) > $this->maximumUrlsToCompile) {
                    break 2;
                }
            }
        }

        return $this->compileUrls($paramArray, $newUrls);
    }
921
922
    /************************************
923
     *
924
     * Crawler log
925
     *
926
     ************************************/
927
928
    /**
     * Return array of records from crawler queue for input page ID
     *
     * The entries are ordered by their "scheduled" time, newest first.
     *
     * @param integer $id Page ID for which to look up log entries.
     * @param string $filter Filter: "pending" => all that is not yet run (exec_time = 0), "finished" => all complete ones (exec_time > 0), any other value (e.g. "all") => all entries
     * @param boolean $doFlush If TRUE, QueueRepository::flushQueue() is called before the entries are selected
     * @param boolean $doFullFlush With $doFlush: flushQueue() is called with "all" instead of $filter
     * @param integer $itemsPerPage Limit the amount of entries per page default is 10; values <= 0 disable the limit
     * @return array
     */
    public function getLogEntriesForPageId($id, $filter = '', $doFlush = false, $doFullFlush = false, $itemsPerPage = 10)
    {
        $queryBuilder = GeneralUtility::makeInstance(ConnectionPool::class)->getQueryBuilderForTable($this->tableName);
        $queryBuilder
            ->select('*')
            ->from($this->tableName)
            ->where(
                $queryBuilder->expr()->eq('page_id', $queryBuilder->createNamedParameter($id, \PDO::PARAM_INT))
            )
            ->orderBy('scheduled', 'DESC');

        switch ($filter) {
            case 'pending':
                $queryBuilder->andWhere($queryBuilder->expr()->eq('exec_time', 0));
                break;
            case 'finished':
                $queryBuilder->andWhere($queryBuilder->expr()->gt('exec_time', 0));
                break;
        }

        if ($doFlush) {
            // NOTE(review): the flush call receives no page id - confirm whether flushing
            // is meant to be limited to the entries of $id.
            $this->queueRepository->flushQueue($doFullFlush ? 'all' : $filter);
        }

        if ($itemsPerPage > 0) {
            $queryBuilder->setMaxResults((int) $itemsPerPage);
        }

        return $queryBuilder->execute()->fetchAll();
    }
979
980
    /**
     * Return array of records from crawler queue for input set ID
     *
     * The entries are ordered by their "scheduled" time, newest first.
     *
     * @param int $set_id Set ID for which to look up log entries.
     * @param string $filter Filter: "pending" => all that is not yet run (exec_time = 0), "finished" => all complete ones (exec_time > 0), any other value (e.g. "all") => all entries
     * @param bool $doFlush If TRUE, nothing is selected: flushQueue() is called instead and an empty array is returned
     * @param bool $doFullFlush With $doFlush: flushQueue() is called with an empty condition instead of the set/filter condition
     * @param int $itemsPerPage Limit the amount of entries per page default is 10; values <= 0 disable the limit
     * @return array
     *
     * @deprecated
     */
    public function getLogEntriesForSetId(int $set_id, string $filter = '', bool $doFlush = false, bool $doFullFlush = false, int $itemsPerPage = 10)
    {
        $queryBuilder = GeneralUtility::makeInstance(ConnectionPool::class)->getQueryBuilderForTable($this->tableName);
        $queryBuilder
            ->select('*')
            ->from($this->tableName)
            ->where(
                $queryBuilder->expr()->eq('set_id', $queryBuilder->createNamedParameter($set_id, \PDO::PARAM_INT))
            )
            ->orderBy('scheduled', 'DESC');

        // The composite AND expression collects the same conditions as the select query.
        // It is only consumed by flushQueue() in the flush branch below.
        $expressionBuilder = GeneralUtility::makeInstance(ConnectionPool::class)
            ->getConnectionForTable($this->tableName)
            ->getExpressionBuilder();
        $query = $expressionBuilder->andX();

        switch ($filter) {
            case 'pending':
                $queryBuilder->andWhere($queryBuilder->expr()->eq('exec_time', 0));
                $query->add($expressionBuilder->eq('exec_time', 0));
                break;
            case 'finished':
                $queryBuilder->andWhere($queryBuilder->expr()->gt('exec_time', 0));
                $query->add($expressionBuilder->gt('exec_time', 0));
                break;
        }

        if ($doFlush) {
            $addWhere = $query->add($expressionBuilder->eq('set_id', $set_id));
            $this->flushQueue($doFullFlush ? '' : $addWhere);
            return [];
        }

        if ($itemsPerPage > 0) {
            $queryBuilder->setMaxResults($itemsPerPage);
        }

        return $queryBuilder->execute()->fetchAll();
    }
1032
1033
    /**
     * Adds a call back entry to the crawler queue (typically called from hooks,
     * see indexed search class "class.crawler.php").
     *
     * The call back reference is stored as "_CALLBACKOBJ" inside the JSON encoded parameters
     * of the new tx_crawler_queue record.
     *
     * @param integer $setId Set ID
     * @param array $params Parameters to pass to call back function; any non-array value is replaced by an empty array
     * @param string $callBack Call back object reference, eg. 'EXT:indexed_search/class.crawler.php:&tx_indexedsearch_crawler'
     * @param integer $page_id Page ID to attach it to
     * @param integer $schedule Time at which to activate; 0 means the current time
     */
    public function addQueueEntry_callBack($setId, $params, $callBack, $page_id = 0, $schedule = 0): void
    {
        $parameters = is_array($params) ? $params : [];
        $parameters['_CALLBACKOBJ'] = $callBack;

        $connection = GeneralUtility::makeInstance(ConnectionPool::class)->getConnectionForTable('tx_crawler_queue');

        // Fall back to "now" when no schedule time was given.
        $scheduledAt = (int) $schedule;
        if ($scheduledAt === 0) {
            $scheduledAt = $this->getCurrentTime();
        }

        $queueRecord = [
            'page_id' => (int) $page_id,
            'parameters' => json_encode($parameters),
            'scheduled' => $scheduledAt,
            'exec_time' => 0,
            'set_id' => (int) $setId,
            'result_data' => '',
        ];

        $connection->insert('tx_crawler_queue', $queueRecord);
    }
1062
1063
    /************************************
1064
     *
1065
     * URL setting
1066
     *
1067
     ************************************/
1068
1069
    /**
     * Setting a URL for crawling:
     *
     * Builds the queue record for the URL and either registers it internally only
     * ($this->registerQueueEntriesInternallyOnly) or inserts it into tx_crawler_queue unless
     * the duplication check finds an equal entry. A signal is emitted for an inserted URL
     * as well as for a detected duplicate.
     *
     * @param integer $id Page ID
     * @param string $url Complete URL
     * @param array $subCfg Sub configuration array (from TS config); the keys "userGroups", "procInstrFilter", "procInstrParams." and "key" are read and may be missing
     * @param integer $tstamp Scheduled-time
     * @param string $configurationHash (optional) configuration hash
     * @param bool $skipInnerDuplicationCheck (optional) skip inner duplication check
     * @return bool TRUE if the URL was inserted into the queue table, FALSE otherwise
     */
    public function addUrl(
        $id,
        $url,
        array $subCfg,
        $tstamp,
        $configurationHash = '',
        $skipInnerDuplicationCheck = false
    ) {
        // Creating parameters:
        $parameters = [
            'url' => $url,
        ];

        // fe user group simulation:
        $uGs = implode(',', array_unique(GeneralUtility::intExplode(',', $subCfg['userGroups'] ?? '', true)));
        if ($uGs) {
            $parameters['feUserGroupList'] = $uGs;
        }

        // Setting processing instructions
        $parameters['procInstructions'] = GeneralUtility::trimExplode(',', $subCfg['procInstrFilter'] ?? '');
        if (is_array($subCfg['procInstrParams.'] ?? null)) {
            $parameters['procInstrParams'] = $subCfg['procInstrParams.'];
        }

        // Compile value array:
        $parameters_serialized = json_encode($parameters);
        $fieldArray = [
            'page_id' => (int) $id,
            'parameters' => $parameters_serialized,
            'parameters_hash' => GeneralUtility::shortMD5($parameters_serialized),
            'configuration_hash' => $configurationHash,
            'scheduled' => $tstamp,
            'exec_time' => 0,
            'set_id' => (int) $this->setID,
            'result_data' => '',
            'configuration' => $subCfg['key'] ?? null,
        ];

        if ($this->registerQueueEntriesInternallyOnly) {
            // the entries will only be registered and not stored to the database
            $this->queueEntries[] = $fieldArray;
            return false;
        }

        $rows = [];
        if (! $skipInnerDuplicationCheck) {
            // check if there is already an equal entry
            $rows = $this->getDuplicateRowsIfExist($tstamp, $fieldArray);
        }

        if (! empty($rows)) {
            $signalPayload = ['rows' => $rows, 'fieldArray' => $fieldArray];
            SignalSlotUtility::emitSignal(
                self::class,
                SignalSlotUtility::SIGNAL_DUPLICATE_URL_IN_QUEUE,
                $signalPayload
            );
            return false;
        }

        $connectionForCrawlerQueue = GeneralUtility::makeInstance(ConnectionPool::class)->getConnectionForTable('tx_crawler_queue');
        $connectionForCrawlerQueue->insert(
            'tx_crawler_queue',
            $fieldArray
        );
        $uid = $connectionForCrawlerQueue->lastInsertId('tx_crawler_queue', 'qid');

        $signalPayload = ['uid' => $uid, 'fieldArray' => $fieldArray];
        SignalSlotUtility::emitSignal(
            self::class,
            SignalSlotUtility::SIGNAL_URL_ADDED_TO_QUEUE,
            $signalPayload
        );

        return true;
    }
1159
1160
    /**
     * Provides the current system time as Unix timestamp.
     *
     * Other methods of this class obtain "now" through this method instead of calling time() directly.
     *
     * @return int Result of time()
     */
    public function getCurrentTime()
    {
        return time();
    }
1169
1170
    /************************************
1171
     *
1172
     * URL reading
1173
     *
1174
     ************************************/
1175
1176
    /**
     * Read URL for single queue entry
     *
     * Loads the queue record, marks it as executed (exec_time, and the process id if one is set),
     * executes it through the queue executor and stores the JSON encoded result in "result_data".
     * Signals are emitted before the execution and before the result is written.
     *
     * @param integer $queueId
     * @param boolean $force If set, will process even if exec_time has been set!
     * @return integer 0, or self::CLI_STATUS_POLLABLE_PROCESSED if a processing instruction registered in "pollSuccess" reported success; 0 as well if no matching queue record was found
     */
    public function readUrl($queueId, $force = false)
    {
        $queryBuilder = GeneralUtility::makeInstance(ConnectionPool::class)->getQueryBuilderForTable($this->tableName);
        $ret = 0;
        $this->logger->debug('crawler-readurl start ' . microtime(true));

        // Get entry:
        $queryBuilder
            ->select('*')
            ->from('tx_crawler_queue')
            ->where(
                $queryBuilder->expr()->eq('qid', $queryBuilder->createNamedParameter($queueId, \PDO::PARAM_INT))
            );
        if (! $force) {
            // Without force only entries that are scheduled for a process and not yet executed are read.
            $queryBuilder
                ->andWhere('exec_time = 0')
                ->andWhere('process_scheduled > 0');
        }
        $queueRec = $queryBuilder->execute()->fetch();

        if (! is_array($queueRec)) {
            // Nothing to process; return the documented integer instead of NULL.
            return $ret;
        }

        SignalSlotUtility::emitSignal(
            self::class,
            SignalSlotUtility::SIGNAL_QUEUEITEM_PREPROCESS,
            [$queueId, &$queueRec]
        );

        // Set exec_time to lock record:
        $field_array = ['exec_time' => $this->getCurrentTime()];

        if (isset($this->processID)) {
            // if multiprocessing is used we need to store the id of the process which has handled this entry
            $field_array['process_id_completed'] = $this->processID;
        }

        GeneralUtility::makeInstance(ConnectionPool::class)->getConnectionForTable('tx_crawler_queue')
            ->update(
                'tx_crawler_queue',
                $field_array,
                ['qid' => (int) $queueId]
            );

        $result = $this->queueExecutor->executeQueueItem($queueRec, $this);
        if ($result['content'] === null) {
            $resultData = 'An errors happened';
        } else {
            /** @var JsonCompatibilityConverter $jsonCompatibilityConverter */
            $jsonCompatibilityConverter = GeneralUtility::makeInstance(JsonCompatibilityConverter::class);
            $resultData = $jsonCompatibilityConverter->convert($result['content']);
        }

        // atm there's no need to point to specific pollable extensions
        // $resultData is only inspected when it is an array: in the error case above it is a plain string.
        $pollables = $GLOBALS['TYPO3_CONF_VARS']['EXTCONF']['crawler']['pollSuccess'] ?? null;
        $procInstructions = is_array($resultData) ? ($resultData['parameters']['procInstructions'] ?? null) : null;
        if (is_array($pollables) && is_array($procInstructions)) {
            foreach ($pollables as $pollable) {
                // only check the success value if the instruction is running
                // it is important to name the pollSuccess key same as the procInstructions key
                if (in_array($pollable, $procInstructions, true) && ! empty($resultData['success'][$pollable])) {
                    $ret |= self::CLI_STATUS_POLLABLE_PROCESSED;
                }
            }
        }

        // Set result in log which also denotes the end of the processing of this entry.
        $field_array = ['result_data' => json_encode($result)];

        SignalSlotUtility::emitSignal(
            self::class,
            SignalSlotUtility::SIGNAL_QUEUEITEM_POSTPROCESS,
            [$queueId, &$field_array]
        );

        GeneralUtility::makeInstance(ConnectionPool::class)->getConnectionForTable('tx_crawler_queue')
            ->update(
                'tx_crawler_queue',
                $field_array,
                ['qid' => (int) $queueId]
            );

        $this->logger->debug('crawler-readurl stop ' . microtime(true));
        return $ret;
    }
1273
1274
    /**
     * Read URL for not-yet-inserted log-entry
     *
     * The entry is inserted with the current time as exec_time, executed through the queue
     * executor and afterwards updated with the JSON encoded result in "result_data".
     * The post process signal is emitted before the result is written.
     *
     * @param array $field_array Queue field array,
     *
     * @return array|bool|mixed|string Result of the queue executor
     */
    public function readUrlFromArray($field_array)
    {
        $connection = GeneralUtility::makeInstance(ConnectionPool::class)->getConnectionForTable($this->tableName);

        // Set exec_time to lock record:
        $field_array['exec_time'] = $this->getCurrentTime();
        $connection->insert($this->tableName, $field_array);
        $queueId = $connection->lastInsertId($this->tableName, 'qid');

        $result = $this->queueExecutor->executeQueueItem($field_array, $this);

        // Set result in log which also denotes the end of the processing of this entry.
        // The fields are handed to the signal by reference and written afterwards.
        $resultFields = ['result_data' => json_encode($result)];
        SignalSlotUtility::emitSignal(
            self::class,
            SignalSlotUtility::SIGNAL_QUEUEITEM_POSTPROCESS,
            [$queueId, &$resultFields]
        );

        $connection->update($this->tableName, $resultFields, ['qid' => $queueId]);

        return $result;
    }
1310
1311
    /*****************************
1312
     *
1313
     * Compiling URLs to crawl - tools
1314
     *
1315
     *****************************/
1316
1317
    /**
     * Collects the page tree below $id and renders the crawl URL rows for every page in it.
     *
     * The crawl settings are stored on the instance first because drawURLs_addRowsForPage(),
     * which is called for each page (and for each page of a mounted tree), reads them from there.
     *
     * @param int $id Root page id to start from.
     * @param int $depth Depth of tree, 0=only id-page, 1= on sublevel, 99 = infinite
     * @param int $scheduledTime Unix Time when the URL is timed to be visited when put in queue
     * @param int $reqMinute Number of requests per minute (creates the interleave between requests)
     * @param bool $submitCrawlUrls If set, submits the URLs to queue in database (real crawling)
     * @param bool $downloadCrawlUrls If set (and submitcrawlUrls is false) will fill $downloadUrls with entries)
     * @param array $incomingProcInstructions Array of processing instructions
     * @param array $configurationSelection Array of configuration keys
     * @return string Concatenated output of drawURLs_addRowsForPage() for all visited pages
     */
    public function getPageTreeAndUrls(
        $id,
        $depth,
        $scheduledTime,
        $reqMinute,
        $submitCrawlUrls,
        $downloadCrawlUrls,
        array $incomingProcInstructions,
        array $configurationSelection
    ) {
        $this->scheduledTime = $scheduledTime;
        $this->reqMinute = $reqMinute;
        $this->submitCrawlUrls = $submitCrawlUrls;
        $this->downloadCrawlUrls = $downloadCrawlUrls;
        $this->incomingProcInstructions = $incomingProcInstructions;
        $this->incomingConfigurationSelection = $configurationSelection;

        // Start every run with empty tracking state
        $this->duplicateTrack = [];
        $this->downloadUrls = [];

        // Drawing tree:
        /* @var PageTreeView $tree */
        $tree = GeneralUtility::makeInstance(PageTreeView::class);
        $perms_clause = $this->getBackendUser()->getPagePermsClause(Permission::PAGE_SHOW);
        $tree->init('AND ' . $perms_clause);

        $pageInfo = BackendUtility::readPageAccess($id, $perms_clause);
        if (is_array($pageInfo)) {
            // Set root row:
            $tree->tree[] = [
                'row' => $pageInfo,
                'HTML' => $this->iconFactory->getIconForRecord('pages', $pageInfo, Icon::SIZE_SMALL),
            ];
        }

        // Get branch beneath:
        if ($depth) {
            $tree->getTree($id, $depth, '');
        }

        // Traverse page tree:
        $code = '';

        foreach ($tree->tree as $data) {
            $this->MP = false;

            // Recognize mount points. The doktype is cast because a strict comparison against the
            // constant never matches when the row carries the value as a numeric string.
            // NOTE(review): assumes PageRepository::DOKTYPE_MOUNTPOINT is an integer constant - confirm.
            if ((int) $data['row']['doktype'] === PageRepository::DOKTYPE_MOUNTPOINT) {
                // NOTE(review): the result is indexed with [0] below - confirm getPage() returns a list of rows.
                $mountpage = $this->pageRepository->getPage($data['row']['uid']);

                // fetch mounted pages
                $this->MP = $mountpage[0]['mount_pid'] . '-' . $data['row']['uid'];

                $mountTree = GeneralUtility::makeInstance(PageTreeView::class);
                $mountTree->init('AND ' . $perms_clause);
                $mountTree->getTree($mountpage[0]['mount_pid'], $depth);

                foreach ($mountTree->tree as $mountData) {
                    $code .= $this->drawURLs_addRowsForPage(
                        $mountData['row'],
                        $mountData['HTML'] . BackendUtility::getRecordTitle('pages', $mountData['row'], true)
                    );
                }

                if ($mountpage[0]['mount_pid_ol']) {
                    // replace page when mount_pid_ol is enabled
                    $data['row']['uid'] = $mountpage[0]['mount_pid'];
                } else {
                    // if the mount_pid_ol is not set the MP must not be used for the mountpoint page
                    $this->MP = false;
                }
            }

            $code .= $this->drawURLs_addRowsForPage(
                $data['row'],
                $data['HTML'] . BackendUtility::getRecordTitle('pages', $data['row'], true)
            );
        }

        return $code;
    }
1409
1410
    /**
     * Expands an exclude string into the list of page ids it covers.
     *
     * The string is a comma separated list of parts in the form "pid" or "pid+depth".
     * "pid" alone covers only that page; "pid+" (or a depth that evaluates to 0 after a "+")
     * is treated as depth 99; otherwise the page tree below pid is added down to the given depth.
     * Results are memoized per exclude string, page trees per pid/depth, for the lifetime of the request.
     *
     * @param string $excludeString Exclude string
     * @return array Unique page ids (integers); keys are those left over by array_unique()
     */
    public function expandExcludeString($excludeString)
    {
        // internal static caches;
        static $expandedExcludeStringCache = [];
        static $treeCache = [];

        $cacheKey = (string) $excludeString;
        // isset() instead of empty(): an empty result is a valid cache hit as well
        if (isset($expandedExcludeStringCache[$cacheKey])) {
            return $expandedExcludeStringCache[$cacheKey];
        }

        $pidList = [];

        if (! empty($excludeString)) {
            /** @var PageTreeView $tree */
            $tree = GeneralUtility::makeInstance(PageTreeView::class);
            $tree->init('AND ' . $this->getBackendUser()->getPagePermsClause(Permission::PAGE_SHOW));

            foreach (GeneralUtility::trimExplode(',', $excludeString) as $excludePart) {
                // Pad to two elements so a part without "+" does not raise an undefined-offset notice
                [$pid, $depth] = array_pad(GeneralUtility::trimExplode('+', $excludePart), 2, '');
                $pid = (int) $pid;
                $depth = (int) $depth;

                // default is "page only" = "depth=0"; a "+" without usable depth means "everything below"
                if ($depth === 0) {
                    $depth = strpos($excludePart, '+') !== false ? 99 : 0;
                }

                $pidList[] = $pid;

                if ($depth > 0) {
                    if (! isset($treeCache[$pid][$depth])) {
                        $tree->reset();
                        $tree->getTree($pid, $depth);
                        $treeCache[$pid][$depth] = $tree->tree;
                    }

                    foreach ($treeCache[$pid][$depth] as $data) {
                        $pidList[] = (int) $data['row']['uid'];
                    }
                }
            }
        }

        $expandedExcludeStringCache[$cacheKey] = array_unique($pidList);

        return $expandedExcludeStringCache[$cacheKey];
    }
1461
1462
    /**
     * Create the rows for display of the page tree
     * For each page a number of rows are shown displaying GET variable configuration
     *
     * One table row is produced per configuration that applies to the page; the page title cell
     * spans all of them. If a selection of configuration keys was submitted, other configurations
     * are dropped first. Pages listed in a configuration's "exclude" setting get a placeholder row.
     *
     * @param array $pageRow Page record; 'uid' is read here, the rest is passed on
     * @param string $pageTitle Already rendered HTML for the title cell (emitted as-is)
     * @return string HTML table rows
     */
    public function drawURLs_addRowsForPage(array $pageRow, string $pageTitle): string
    {
        $skipMessage = '';

        // Get list of configurations
        $configurations = $this->getUrlsForPageRow($pageRow, $skipMessage);

        if (! empty($this->incomingConfigurationSelection)) {
            // remove configuration that does not match the current selection
            foreach ($configurations as $confKey => $confArray) {
                if (! in_array($confKey, $this->incomingConfigurationSelection, true)) {
                    unset($configurations[$confKey]);
                }
            }
        }

        // expandExcludeString() returns integers, so the uid has to be an integer too
        // for the strict in_array() check below to ever match.
        $pageUid = (int) $pageRow['uid'];

        // Traverse parameter combinations:
        $c = 0;
        $content = '';
        if (! empty($configurations)) {
            foreach ($configurations as $confKey => $confArray) {
                // Array keys may be integers; htmlspecialchars() requires a string under strict_types
                $confKeyHtml = htmlspecialchars((string) $confKey);

                // Title column: only the first row carries the (row-spanning) title cell
                $titleClm = $c === 0
                    ? '<td rowspan="' . count($configurations) . '">' . $pageTitle . '</td>'
                    : '';

                $excludedPageIds = $this->expandExcludeString($confArray['subCfg']['exclude'] ?? '');

                if (! in_array($pageUid, $excludedPageIds, true)) {

                    // URL list:
                    $urlList = $this->urlListFromUrlArray(
                        $confArray,
                        $pageRow,
                        $this->scheduledTime,
                        $this->reqMinute,
                        $this->submitCrawlUrls,
                        $this->downloadCrawlUrls,
                        $this->duplicateTrack,
                        $this->downloadUrls,
                        // if empty the urls won't be filtered by processing instructions
                        $this->incomingProcInstructions
                    );

                    // Expanded parameters:
                    $paramExpanded = '';
                    $calcAccu = [];
                    $calcRes = 1;
                    foreach ($confArray['paramExpanded'] as $gVar => $gVal) {
                        $valueCount = count($gVal);
                        $paramExpanded .= '
                            <tr>
                                <td>' . htmlspecialchars('&' . $gVar . '=') . '<br/>' .
                            '(' . $valueCount . ')' .
                            '</td>
                                <td nowrap="nowrap">' . nl2br(htmlspecialchars(implode(chr(10), $gVal))) . '</td>
                            </tr>
                        ';
                        $calcRes *= $valueCount;
                        $calcAccu[] = $valueCount;
                    }
                    $paramExpanded = '<table>' . $paramExpanded . '</table>';
                    $paramExpanded .= 'Comb: ' . implode('*', $calcAccu) . '=' . $calcRes;

                    // Options (both settings are optional)
                    $optionValues = '';
                    if (! empty($confArray['subCfg']['userGroups'])) {
                        $optionValues .= 'User Groups: ' . $confArray['subCfg']['userGroups'] . '<br/>';
                    }
                    if (! empty($confArray['subCfg']['procInstrFilter'])) {
                        $optionValues .= 'ProcInstr: ' . $confArray['subCfg']['procInstrFilter'] . '<br/>';
                    }

                    // Compile row:
                    $content .= '
                        <tr>
                            ' . $titleClm . '
                            <td>' . $confKeyHtml . '</td>
                            <td>' . nl2br(htmlspecialchars(rawurldecode(trim(str_replace('&', chr(10) . '&', GeneralUtility::implodeArrayForUrl('', $confArray['paramParsed'])))))) . '</td>
                            <td>' . $paramExpanded . '</td>
                            <td nowrap="nowrap">' . $urlList . '</td>
                            <td nowrap="nowrap">' . $optionValues . '</td>
                            <td nowrap="nowrap">' . DebugUtility::viewArray($confArray['subCfg']['procInstrParams.'] ?? null) . '</td>
                        </tr>';
                } else {
                    $content .= '<tr>
                            ' . $titleClm . '
                            <td>' . $confKeyHtml . '</td>
                            <td colspan="5"><em>No entries</em> (Page is excluded in this configuration)</td>
                        </tr>';
                }

                $c++;
            }
        } else {
            $message = ! empty($skipMessage) ? ' (' . $skipMessage . ')' : '';

            // Compile row:
            $content .= '
                <tr>
                    <td>' . $pageTitle . '</td>
                    <td colspan="6"><em>No entries</em>' . $message . '</td>
                </tr>';
        }

        return $content;
    }
1573
1574
    /*****************************
1575
     *
1576
     * CLI functions
1577
     *
1578
     *****************************/
1579
1580
    /**
     * Running the functionality of the CLI (crawling URLs from queue)
     *
     * Runs the CLI hooks, cleans the queue, claims up to $countInARun queue entries for this
     * process and reads them one by one.
     *
     * @param int $countInARun Maximum number of queue entries to fetch for this run
     * @param int $sleepTime Pause after each entry, passed to usleep()
     * @param int $sleepAfterFinish Pause after the last entry, passed to sleep()
     * @return int Bitwise OR of the readUrl() results and the CLI_STATUS_* flags set during the run
     */
    public function CLI_run(int $countInARun, int $sleepTime, int $sleepAfterFinish): int
    {
        $result = 0;
        $counter = 0;

        // Hooks first, then queue clean-up
        $this->CLI_runHooks();
        $this->queueRepository->cleanupQueue();

        $rows = $this->queueRepository->fetchRecordsToBeCrawled($countInARun);

        if (empty($rows)) {
            $this->CLI_debug('Nothing within queue which needs to be processed (' . $this->CLI_buildProcessId() . ')');

            return $result;
        }

        $quidList = [];
        foreach ($rows as $row) {
            $quidList[] = $row['qid'];
        }

        $processId = $this->CLI_buildProcessId();

        // Remember how many entries were assigned so the processed share can be determined later
        $numberOfAffectedRows = $this->queueRepository->updateProcessIdAndSchedulerForQueueIds($quidList, $processId);
        $this->processRepository->updateProcessAssignItemsCount($numberOfAffectedRows, $processId);

        if ($numberOfAffectedRows !== count($quidList)) {
            $this->CLI_debug('Nothing processed due to multi-process collision (' . $this->CLI_buildProcessId() . ')');

            return ($result | self::CLI_STATUS_ABORTED);
        }

        foreach ($rows as $row) {
            $result |= $this->readUrl($row['qid']);
            $counter++;

            // Just to relax the system
            usleep($sleepTime);

            // The crawler was disabled while this run was in progress:
            // leave immediately and do NOT mark the process as ended.
            if ($this->crawler->isDisabled()) {
                return ($result | self::CLI_STATUS_ABORTED);
            }

            if (! $this->processRepository->isProcessActive($this->CLI_buildProcessId())) {
                // possible timeout
                $this->CLI_debug('conflict / timeout (' . $this->CLI_buildProcessId() . ')');
                $result |= self::CLI_STATUS_ABORTED;
                break;
            }
        }

        sleep($sleepAfterFinish);

        $msg = 'Rows: ' . $counter;
        $this->CLI_debug($msg . ' (' . $this->CLI_buildProcessId() . ')');

        if ($counter > 0) {
            $result |= self::CLI_STATUS_PROCESSED;
        }

        return $result;
    }
1650
1651
    /**
     * Activate hooks
     *
     * Instantiates every class registered under
     * $GLOBALS['TYPO3_CONF_VARS']['EXTCONF']['crawler']['cli_hooks'] and calls crawler_init() on it.
     */
    public function CLI_runHooks(): void
    {
        $hookClassNames = $GLOBALS['TYPO3_CONF_VARS']['EXTCONF']['crawler']['cli_hooks'] ?? [];

        foreach ($hookClassNames as $objRef) {
            $hookObj = GeneralUtility::makeInstance($objRef);
            if (! is_object($hookObj)) {
                continue;
            }

            $hookObj->crawler_init($this);
        }
    }
1663
1664
    /**
     * Try to acquire a new process with the given id
     * also performs some auto-cleanup for orphan processes
     *
     * Active processes whose ttl lies in the past count as orphans and are released at the end;
     * all other active processes count against the configured processLimit.
     *
     * @param string $id identification string for the process
     * @return boolean true if a process record was inserted, false if the limit is reached
     *                 or the system process id could not be determined
     * @todo preemption might not be the most elegant way to clean up
     */
    public function CLI_checkAndAcquireNewProcess($id)
    {
        $systemProcessId = getmypid();
        if (! $systemProcessId) {
            return false;
        }

        $runningCount = 0;
        $orphanProcesses = [];

        $activeProcesses = $this->processRepository->findAllActive();
        $currentTime = $this->getCurrentTime();

        /** @var Process $process */
        foreach ($activeProcesses as $process) {
            if ($process->getTtl() < $currentTime) {
                $orphanProcesses[] = $process->getProcessId();
                continue;
            }

            $runningCount++;
        }

        $processLimit = (int) $this->extensionSettings['processLimit'];

        // a new process is only added while fewer than the allowed number are active
        $acquired = $runningCount < $processLimit;

        if ($acquired) {
            $this->CLI_debug('add process ' . $this->CLI_buildProcessId() . ' (' . ($runningCount + 1) . '/' . $processLimit . ')');

            GeneralUtility::makeInstance(ConnectionPool::class)
                ->getConnectionForTable('tx_crawler_process')
                ->insert(
                    'tx_crawler_process',
                    [
                        'process_id' => $id,
                        'active' => 1,
                        'ttl' => $currentTime + (int) $this->extensionSettings['processMaxRunTime'],
                        'system_process_id' => $systemProcessId,
                    ]
                );
        } else {
            $this->CLI_debug('Processlimit reached (' . ($runningCount) . '/' . $processLimit . ')');
        }

        // Clean-up happens regardless of whether a process was acquired
        $this->processRepository->deleteProcessesMarkedAsDeleted();
        $this->CLI_releaseProcesses($orphanProcesses);

        return $acquired;
    }
1718
1719
    /**
     * Release a process and the required resources
     *
     * Before the given ids are released, two clean-up statements run:
     * queue entries assigned to inactive processes are detached, and inactive processes
     * that still have queue entries with exec_time = 0 get their system_process_id reset.
     *
     * @param mixed $releaseIds string with a single process-id or array with multiple process-ids
     * @return boolean false if there was nothing to release, true otherwise
     */
    public function CLI_releaseProcesses($releaseIds)
    {
        $releaseIds = is_array($releaseIds) ? $releaseIds : [$releaseIds];

        if (empty($releaseIds)) {
            //nothing to release
            return false;
        }

        $queryBuilder = GeneralUtility::makeInstance(ConnectionPool::class)->getQueryBuilderForTable($this->tableName);

        // some kind of 2nd chance algo - this way you need at least 2 processes to have a real cleanup
        // this ensures that a single process can't mess up the entire process table

        // detach queue entries from processes which are not active
        $queryBuilder
            ->update($this->tableName, 'q')
            ->where(
                'q.process_id IN(SELECT p.process_id FROM tx_crawler_process as p WHERE p.active = 0)'
            )
            ->set('q.process_scheduled', 0)
            ->set('q.process_id', '')
            ->execute();

        // FIXME: Not entirely sure that this is equivalent to the previous version
        // The same builder is reused below, so the SET part of the first statement has to go.
        $queryBuilder->resetQueryPart('set');

        $inactiveProcess = $queryBuilder->expr()->eq('active', 0);
        $queryBuilder
            ->update('tx_crawler_process')
            ->where(
                $inactiveProcess,
                'process_id IN(SELECT q.process_id FROM tx_crawler_queue as q WHERE q.exec_time = 0)'
            )
            ->set('system_process_id', 0)
            ->execute();

        $this->processRepository->markRequestedProcessesAsNotActive($releaseIds);
        $this->queueRepository->unsetProcessScheduledAndProcessIdForQueueEntries($releaseIds);

        return true;
    }
1769
1770
    /**
     * Create a unique Id for the current process
     *
     * The id is derived from the current microtime on first use and kept in $this->processID,
     * so later calls return the same value.
     *
     * @return string the ID
     */
    public function CLI_buildProcessId()
    {
        if ($this->processID) {
            return $this->processID;
        }

        $this->processID = GeneralUtility::shortMD5(microtime(true));

        return $this->processID;
    }
1782
1783
    /**
     * Prints a message to the stdout (only if debug-mode is enabled)
     *
     * Debug-mode is the "processDebug" extension setting; the output is flushed after each message.
     *
     * @param string $msg the message
     * @deprecated
     */
    public function CLI_debug($msg): void
    {
        $debugEnabled = (int) $this->extensionSettings['processDebug'];
        if (! $debugEnabled) {
            return;
        }

        echo $msg . "\n";
        flush();
    }
1796
1797
    /**
     * Cleans up entries that stayed for too long in the queue. These are:
     * - processed entries older than the "cleanUpProcessedAge" extension setting (in days)
     * - scheduled entries older than the "cleanUpScheduledAge" extension setting (in days)
     *
     * @deprecated
     */
    public function cleanUpOldQueueEntries(): void
    {
        // 24*60*60 Seconds in 24 hours
        $secondsPerDay = 86400;
        $now = time();

        $processedBefore = $now - $this->extensionSettings['cleanUpProcessedAge'] * $secondsPerDay;
        $scheduledBefore = $now - $this->extensionSettings['cleanUpScheduledAge'] * $secondsPerDay;

        $this->flushQueue(
            '(exec_time<>0 AND exec_time<' . $processedBefore . ') OR scheduled<=' . $scheduledBefore
        );
    }
1814
1815
    /**
     * Removes queue entries
     *
     * Before deleting, a QUEUE_ENTRY_FLUSH signal is emitted once per distinct set_id among the
     * matching entries, carrying the qid/set_id pairs of that set.
     *
     * @param string $where SQL related filter for the entries which should be removed;
     *                      an empty filter matches every entry
     *
     * @deprecated
     */
    protected function flushQueue($where = ''): void
    {
        $realWhere = '1=1';
        if ((string) $where !== '') {
            $realWhere = $where;
        }

        // A single builder is reused for all statements of this method
        $queryBuilder = $this->getQueryBuilder($this->tableName);

        $groups = $queryBuilder
            ->selectLiteral('DISTINCT set_id')
            ->from($this->tableName)
            ->where($realWhere)
            ->execute()
            ->fetchAll();

        $groupRows = is_array($groups) ? $groups : [];
        foreach ($groupRows as $group) {
            $matchesSet = $queryBuilder->expr()->eq('set_id', $group['set_id']);

            $subSet = $queryBuilder
                ->select('qid', 'set_id')
                ->from($this->tableName)
                ->where($realWhere, $matchesSet)
                ->execute()
                ->fetchAll();

            $payLoad = ['subSet' => $subSet];
            SignalSlotUtility::emitSignal(
                self::class,
                SignalSlotUtility::SIGNAL_QUEUE_ENTRY_FLUSH,
                $payLoad
            );
        }

        $queryBuilder
            ->delete($this->tableName)
            ->where($realWhere)
            ->execute();
    }
1860
1861
    /**
     * This method determines duplicates for a queue entry with the same parameters and this timestamp.
     * If the timestamp is in the past, it will check if there is any unprocessed queue entry in the past.
     * If the timestamp is in the future it will check, if the queued entry has exactly the same timestamp
     *
     * Only entries without exec_time and without process_id are considered, and they must match
     * the page_id and parameters_hash of $fieldArray.
     *
     * @param int $tstamp Scheduled time of the entry to check
     * @param array $fieldArray Queue fields; 'page_id' and 'parameters_hash' are read
     *
     * @return array List of qid values of the duplicates found
     */
    protected function getDuplicateRowsIfExist($tstamp, $fieldArray)
    {
        // $tstamp is untyped and ends up inside an SQL expression below, so force it to an integer
        $tstamp = (int) $tstamp;
        $currentTime = $this->getCurrentTime();

        $queryBuilder = GeneralUtility::makeInstance(ConnectionPool::class)->getQueryBuilderForTable($this->tableName);
        $queryBuilder
            ->select('qid')
            ->from('tx_crawler_queue');

        if ($tstamp > $currentTime) {
            // entry with a timestamp in the future need to have the same schedule time
            $queryBuilder
                ->where(
                    $queryBuilder->expr()->eq('scheduled', $tstamp)
                );
        } elseif ($this->extensionSettings['enableTimeslot']) {
            // scheduled with "now", timeslot enabled: also accept entries up to 100 seconds around now
            $timeBegin = $currentTime - 100;
            $timeEnd = $currentTime + 100;
            $queryBuilder
                ->where(
                    'scheduled BETWEEN ' . $timeBegin . ' AND ' . $timeEnd . ''
                )
                ->orWhere(
                    $queryBuilder->expr()->lte('scheduled', $currentTime)
                );
        } else {
            // scheduled with "now": any entry that is already due
            $queryBuilder
                ->where(
                    $queryBuilder->expr()->lte('scheduled', $currentTime)
                );
        }

        $queryBuilder
            ->andWhere('NOT exec_time')
            ->andWhere('NOT process_id')
            ->andWhere($queryBuilder->expr()->eq('page_id', $queryBuilder->createNamedParameter($fieldArray['page_id'], \PDO::PARAM_INT)))
            ->andWhere($queryBuilder->expr()->eq('parameters_hash', $queryBuilder->createNamedParameter($fieldArray['parameters_hash'], \PDO::PARAM_STR)));

        $statement = $queryBuilder->execute();

        $rows = [];
        while ($row = $statement->fetch()) {
            $rows[] = $row['qid'];
        }

        return $rows;
    }
1921
1922
    /**
     * Returns a md5 hash generated from a serialized configuration array.
     *
     * The 'paramExpanded' and 'URLs' entries are left out of the hash.
     *
     * @return string
     */
    protected function getConfigurationHash(array $configuration)
    {
        unset($configuration['paramExpanded'], $configuration['URLs']);

        $serialized = serialize($configuration);

        return md5($serialized);
    }
1933
1934
    /**
     * Build a URL from a Page and the Query String. If the page has a Site configuration, it can be built by using
     * the Site instance.
     *
     * Thin wrapper that forwards all arguments to UrlService::getUrlFromPageAndQueryParameters().
     *
     * @param int $httpsOrHttp see tx_crawler_configuration.force_ssl
     * @throws \TYPO3\CMS\Core\Exception\SiteNotFoundException
     * @throws \TYPO3\CMS\Core\Routing\InvalidRouteArgumentsException
     *
     * @deprecated Using CrawlerController::getUrlFromPageAndQueryParameters() is deprecated since 9.1.1 and will be removed in v11.x, please use UrlService->getUrlFromPageAndQueryParameters() instead.
     */
    protected function getUrlFromPageAndQueryParameters(int $pageId, string $queryString, ?string $alternativeBaseUrl, int $httpsOrHttp): UriInterface
    {
        return (new UrlService())->getUrlFromPageAndQueryParameters(
            $pageId,
            $queryString,
            $alternativeBaseUrl,
            $httpsOrHttp
        );
    }
1949
1950 1
    /**
     * Makes sure the values at index 1 and 2 are in ascending order.
     *
     * @param array $reg Array whose elements 1 and 2 are compared
     * @return array The array with elements 1 and 2 exchanged if element 1 was larger, otherwise unchanged
     */
    protected function swapIfFirstIsLargerThanSecond(array $reg): array
    {
        if ($reg[1] > $reg[2]) {
            // Exchange both values in one step
            [$reg[1], $reg[2]] = [$reg[2], $reg[1]];
        }

        return $reg;
    }
1961
1962
    /**
     * Returns the backend user, taking it from $GLOBALS['BE_USER'] on first access.
     *
     * Backend authentication is initialized on every call before the user is looked up.
     *
     * @return BackendUserAuthentication
     */
    private function getBackendUser()
    {
        // Make sure the _cli_ user is loaded
        Bootstrap::initializeBackendAuthentication();

        $this->backendUser = $this->backendUser ?? $GLOBALS['BE_USER'];

        return $this->backendUser;
    }
1974
1975
    /**
     * Get querybuilder for given table
     *
     * @param string $table Table name the query builder is requested for
     * @return \TYPO3\CMS\Core\Database\Query\QueryBuilder
     */
    private function getQueryBuilder(string $table)
    {
        $connectionPool = GeneralUtility::makeInstance(ConnectionPool::class);

        return $connectionPool->getQueryBuilderForTable($table);
    }
1984
}
1985