Passed
Push — github-actions ( af8ecb...4482de )
by Tomas Norre
06:26 queued 03:04
created

CrawlerController::getProcessFilename()   A

Complexity

Conditions 1
Paths 1

Size

Total Lines 3
Code Lines 1

Duplication

Lines 0
Ratio 0 %

Code Coverage

Tests 2
CRAP Score 1

Importance

Changes 0
Metric Value
cc 1
eloc 1
c 0
b 0
f 0
nc 1
nop 0
dl 0
loc 3
ccs 2
cts 2
cp 1
crap 1
rs 10
1
<?php
2
3
declare(strict_types=1);
4
5
namespace AOE\Crawler\Controller;
6
7
/***************************************************************
8
 *  Copyright notice
9
 *
10
 *  (c) 2020 AOE GmbH <[email protected]>
11
 *
12
 *  All rights reserved
13
 *
14
 *  This script is part of the TYPO3 project. The TYPO3 project is
15
 *  free software; you can redistribute it and/or modify
16
 *  it under the terms of the GNU General Public License as published by
17
 *  the Free Software Foundation; either version 3 of the License, or
18
 *  (at your option) any later version.
19
 *
20
 *  The GNU General Public License can be found at
21
 *  http://www.gnu.org/copyleft/gpl.html.
22
 *
23
 *  This script is distributed in the hope that it will be useful,
24
 *  but WITHOUT ANY WARRANTY; without even the implied warranty of
25
 *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
26
 *  GNU General Public License for more details.
27
 *
28
 *  This copyright notice MUST APPEAR in all copies of the script!
29
 ***************************************************************/
30
31
use AOE\Crawler\Configuration\ExtensionConfigurationProvider;
32
use AOE\Crawler\Converter\JsonCompatibilityConverter;
33
use AOE\Crawler\Crawler;
34
use AOE\Crawler\CrawlStrategy\CrawlStrategyFactory;
35
use AOE\Crawler\Domain\Model\Process;
36
use AOE\Crawler\Domain\Repository\ConfigurationRepository;
37
use AOE\Crawler\Domain\Repository\ProcessRepository;
38
use AOE\Crawler\Domain\Repository\QueueRepository;
39
use AOE\Crawler\QueueExecutor;
40
use AOE\Crawler\Service\UrlService;
41
use AOE\Crawler\Utility\SignalSlotUtility;
42
use AOE\Crawler\Value\QueueFilter;
43
use Psr\Http\Message\UriInterface;
44
use Psr\Log\LoggerAwareInterface;
45
use Psr\Log\LoggerAwareTrait;
46
use TYPO3\CMS\Backend\Tree\View\PageTreeView;
47
use TYPO3\CMS\Backend\Utility\BackendUtility;
48
use TYPO3\CMS\Core\Authentication\BackendUserAuthentication;
49
use TYPO3\CMS\Core\Compatibility\PublicMethodDeprecationTrait;
50
use TYPO3\CMS\Core\Compatibility\PublicPropertyDeprecationTrait;
51
use TYPO3\CMS\Core\Core\Bootstrap;
52
use TYPO3\CMS\Core\Core\Environment;
53
use TYPO3\CMS\Core\Database\Connection;
54
use TYPO3\CMS\Core\Database\ConnectionPool;
55
use TYPO3\CMS\Core\Database\Query\Restriction\DeletedRestriction;
56
use TYPO3\CMS\Core\Imaging\Icon;
57
use TYPO3\CMS\Core\Imaging\IconFactory;
58
use TYPO3\CMS\Core\Site\Entity\Site;
59
use TYPO3\CMS\Core\Type\Bitmask\Permission;
60
use TYPO3\CMS\Core\TypoScript\Parser\TypoScriptParser;
61
use TYPO3\CMS\Core\Utility\DebugUtility;
62
use TYPO3\CMS\Core\Utility\GeneralUtility;
63
use TYPO3\CMS\Core\Utility\MathUtility;
64
use TYPO3\CMS\Extbase\Object\ObjectManager;
65
use TYPO3\CMS\Frontend\Page\PageRepository;
66
67
/**
68
 * Class CrawlerController
69
 *
70
 * @package AOE\Crawler\Controller
71
 */
72
class CrawlerController implements LoggerAwareInterface
73
{
74
    use LoggerAwareTrait;
75
    use PublicMethodDeprecationTrait;
76
    use PublicPropertyDeprecationTrait;
77
78
    public const CLI_STATUS_NOTHING_PROCCESSED = 0;
79
80
    //queue not empty
81
    public const CLI_STATUS_REMAIN = 1;
82
83
    //(some) queue items were processed
84
    public const CLI_STATUS_PROCESSED = 2;
85
86
    //instance didn't finish
87
    public const CLI_STATUS_ABORTED = 4;
88
89
    public const CLI_STATUS_POLLABLE_PROCESSED = 8;
90
91
    /**
92
     * @var integer
93
     */
94
    public $setID = 0;
95
96
    /**
97
     * @var string
98
     */
99
    public $processID = '';
100
101
    /**
102
     * @var array
103
     */
104
    public $duplicateTrack = [];
105
106
    /**
107
     * @var array
108
     */
109
    public $downloadUrls = [];
110
111
    /**
112
     * @var array
113
     */
114
    public $incomingProcInstructions = [];
115
116
    /**
117
     * @var array
118
     */
119
    public $incomingConfigurationSelection = [];
120
121
    /**
122
     * @var bool
123
     */
124
    public $registerQueueEntriesInternallyOnly = false;
125
126
    /**
127
     * @var array
128
     */
129
    public $queueEntries = [];
130
131
    /**
132
     * @var array
133
     */
134
    public $urlList = [];
135
136
    /**
137
     * @var array
138
     */
139
    public $extensionSettings = [];
140
141
    /**
142
     * Mount Point
143
     *
144
     * @var bool
145
     * Todo: Check what this is used for and adjust the type hint or code, as bool doesn't match the current code.
146
     */
147
    public $MP = false;
148
149
    /**
150
     * @var string
151
     * @deprecated
152
     */
153
    protected $processFilename;
154
155
    /**
156
     * Holds the internal access mode can be 'gui','cli' or 'cli_im'
157
     *
158
     * @var string
159
     * @deprecated
160
     */
161
    protected $accessMode;
162
163
    /**
164
     * @var QueueRepository
165
     */
166
    protected $queueRepository;
167
168
    /**
169
     * @var ProcessRepository
170
     */
171
    protected $processRepository;
172
173
    /**
174
     * @var ConfigurationRepository
175
     */
176
    protected $configurationRepository;
177
178
    /**
179
     * @var string
180
     */
181
    protected $tableName = 'tx_crawler_queue';
182
183
    /**
184
     * @var QueueExecutor
185
     */
186
    protected $queueExecutor;
187
188
    /**
189
     * @var int
190
     */
191
    protected $maximumUrlsToCompile = 10000;
192
193
    /**
194
     * @var IconFactory
195
     */
196
    protected $iconFactory;
197
198
    /**
     * Deprecation messages keyed by the name of the deprecated public method.
     *
     * NOTE(review): nothing in this class reads the property directly; it is
     * presumably consumed by PublicMethodDeprecationTrait - confirm against
     * the TYPO3 core trait before removing it.
     *
     * @var string[]
     */
    private $deprecatedPublicMethods = [
        'cleanUpOldQueueEntries' => 'Using CrawlerController::cleanUpOldQueueEntries() is deprecated since 9.0.1 and will be removed in v11.x, please use QueueRepository->cleanUpOldQueueEntries() instead.',
        'CLI_debug' => 'Using CrawlerController->CLI_debug() is deprecated since 9.1.3 and will be removed in v11.x',
        'getAccessMode' => 'Using CrawlerController->getAccessMode() is deprecated since 9.1.3 and will be removed in v11.x',
        'getLogEntriesForSetId' => 'Using CrawlerController::getLogEntriesForSetId() is deprecated since 9.0.1 and will be removed in v11.x',
        'flushQueue' => 'Using CrawlerController::flushQueue() is deprecated since 9.0.1 and will be removed in v11.x, please use QueueRepository->flushQueue() instead.',
        'setAccessMode' => 'Using CrawlerController->setAccessMode() is deprecated since 9.1.3 and will be removed in v11.x',
        'getDisabled' => 'Using CrawlerController->getDisabled() is deprecated since 9.1.3 and will be removed in v11.x, please use Crawler->isDisabled() instead',
        'setDisabled' => 'Using CrawlerController->setDisabled() is deprecated since 9.1.3 and will be removed in v11.x, please use Crawler->setDisabled() instead',
        'getProcessFilename' => 'Using CrawlerController->getProcessFilename() is deprecated since 9.1.3 and will be removed in v11.x',
        'setProcessFilename' => 'Using CrawlerController->setProcessFilename() is deprecated since 9.1.3 and will be removed in v11.x',
        'getDuplicateRowsIfExist' => 'Using CrawlerController->getDuplicateRowsIfExist() is deprecated since 9.1.4 and will be removed in v11.x, please use QueueRepository->getDuplicateQueueItemsIfExists() instead',
    ];
215
216
    /**
     * Deprecation messages keyed by the name of the deprecated property.
     *
     * NOTE(review): nothing in this class reads the property directly; it is
     * presumably consumed by PublicPropertyDeprecationTrait - confirm against
     * the TYPO3 core trait before removing it.
     *
     * @var string[]
     */
    private $deprecatedPublicProperties = [
        'accessMode' => 'Using CrawlerController->accessMode is deprecated since 9.1.3 and will be removed in v11.x',
        // The message previously named "accessMode" here (copy-paste slip).
        'processFilename' => 'Using CrawlerController->processFilename is deprecated since 9.1.3 and will be removed in v11.x',
    ];
223
224
    /**
225
     * @var BackendUserAuthentication|null
226
     */
227
    private $backendUser;
228
229
    /**
230
     * @var integer
231
     */
232
    private $scheduledTime = 0;
233
234
    /**
235
     * @var integer
236
     */
237
    private $reqMinute = 0;
238
239
    /**
240
     * @var bool
241
     */
242
    private $submitCrawlUrls = false;
243
244
    /**
245
     * @var bool
246
     */
247
    private $downloadCrawlUrls = false;
248
249
    /**
250
     * @var PageRepository
251
     */
252
    private $pageRepository;
253
254
    /**
255
     * @var Crawler
256
     */
257
    private $crawler;
258
259
    /************************************
260
     *
261
     * Getting URLs based on Page TSconfig
262
     *
263
     ************************************/
264
265 36
    /**
     * Wires up the collaborators (repositories, queue executor, icon factory,
     * crawler) and loads the extension configuration, normalising the
     * "countInARun", "processLimit" and "maxCompileUrls" settings.
     */
    public function __construct()
    {
        $objectManager = GeneralUtility::makeInstance(ObjectManager::class);
        $crawlStrategyFactory = GeneralUtility::makeInstance(CrawlStrategyFactory::class);
        $this->queueRepository = $objectManager->get(QueueRepository::class);
        $this->processRepository = $objectManager->get(ProcessRepository::class);
        $this->configurationRepository = $objectManager->get(ConfigurationRepository::class);
        $this->pageRepository = $objectManager->get(PageRepository::class);
        $this->queueExecutor = GeneralUtility::makeInstance(QueueExecutor::class, $crawlStrategyFactory);
        $this->iconFactory = GeneralUtility::makeInstance(IconFactory::class);
        $this->crawler = GeneralUtility::makeInstance(Crawler::class);

        $this->processFilename = Environment::getVarPath() . '/lock/tx_crawler.proc';

        /** @var ExtensionConfigurationProvider $configurationProvider */
        $configurationProvider = GeneralUtility::makeInstance(ExtensionConfigurationProvider::class);
        $settings = $configurationProvider->getExtensionConfiguration();
        $this->extensionSettings = is_array($settings) ? $settings : [];

        // Set defaults. The settings may lack any of these keys, so each read
        // falls back explicitly instead of triggering an undefined-index notice.
        if (MathUtility::convertToPositiveInteger($this->extensionSettings['countInARun'] ?? 0) === 0) {
            $this->extensionSettings['countInARun'] = 100;
        }

        $this->extensionSettings['processLimit'] = MathUtility::forceIntegerInRange(
            $this->extensionSettings['processLimit'] ?? 1,
            1,
            99,
            1
        );
        $this->maximumUrlsToCompile = MathUtility::forceIntegerInRange(
            $this->extensionSettings['maxCompileUrls'] ?? 10000,
            1,
            1000000000,
            10000
        );
    }
292
293
    /**
     * Returns the internal access mode ('gui', 'cli' or 'cli_im').
     *
     * @return string
     * @deprecated
     */
    public function getAccessMode()
    {
        $currentAccessMode = $this->accessMode;

        return $currentAccessMode;
    }
303
304
    /**
     * Stores the internal access mode.
     *
     * @param string $accessMode value to store in the accessMode property
     * @deprecated
     */
    public function setAccessMode($accessMode): void
    {
        $this->accessMode = $accessMode;
    }
312
313
    /**
     * Switches the disabled status that prevents processes from being processed.
     *
     * Disabling writes an empty file at the process filename; enabling removes
     * that file again when it exists.
     *
     * @param bool $disabled (optional, defaults to true)
     * @deprecated
     */
    public function setDisabled($disabled = true): void
    {
        $lockFile = $this->processFilename;

        if ($disabled) {
            GeneralUtility::writeFile($lockFile, '');
            return;
        }

        if (is_file($lockFile)) {
            unlink($lockFile);
        }
    }
329
330
    /**
     * Reports the disabled status, i.e. whether the process file exists.
     *
     * @return bool true if disabled
     * @deprecated
     */
    public function getDisabled()
    {
        $lockFileExists = is_file($this->processFilename);

        return $lockFileExists;
    }
340
341
    /**
     * Overrides the file used by setDisabled() / getDisabled() as disable marker.
     *
     * @param string $filenameWithPath file name including its path
     * @deprecated
     */
    public function setProcessFilename($filenameWithPath): void
    {
        $this->processFilename = $filenameWithPath;
    }
349
350
    /**
     * Returns the file used by setDisabled() / getDisabled() as disable marker.
     *
     * @return string
     * @deprecated
     */
    public function getProcessFilename()
    {
        $filenameWithPath = $this->processFilename;

        return $filenameWithPath;
    }
358
359
    /**
     * Replaces the extension settings held by this controller
     * (unserialized pendant of $TYPO3_CONF_VARS['EXT']['extConf']['crawler']).
     *
     * The array is stored as given; the defaults applied in the constructor
     * are not re-applied here.
     */
    public function setExtensionSettings(array $extensionSettings): void
    {
        $this->extensionSettings = $extensionSettings;
    }
366
367
    /**
     * Decides whether the given page must be left out of crawling.
     *
     * The checks run in this order and the first hit wins: hidden page,
     * built-in doktype block list, "excludeDoktype" registrations and finally
     * the "pageVeto" hooks.
     *
     * @return false|string false if the page should be crawled (not excluded), otherwise the skip message
     */
    public function checkIfPageShouldBeSkipped(array $pageRow)
    {
        // Hidden pages are only crawled when the extension settings allow it.
        if (! $this->extensionSettings['crawlHiddenPages'] && $pageRow['hidden']) {
            return 'Because page is hidden';
        }

        if (GeneralUtility::inList('3,4,199,254,255', $pageRow['doktype'])) {
            return 'Because doktype is not allowed';
        }

        foreach ($GLOBALS['TYPO3_CONF_VARS']['EXTCONF']['crawler']['excludeDoktype'] ?? [] as $registrationKey => $excludedDoktypes) {
            if (GeneralUtility::inList($excludedDoktypes, $pageRow['doktype'])) {
                return 'Doktype was excluded by "' . $registrationKey . '"';
            }
        }

        // veto hook
        foreach ($GLOBALS['TYPO3_CONF_VARS']['EXTCONF']['crawler']['pageVeto'] ?? [] as $hookName => $vetoFunction) {
            $hookParameters = [
                'pageRow' => $pageRow,
            ];
            // expects "false" if page is ok and "true" or a skipMessage if this page should _not_ be crawled
            $verdict = GeneralUtility::callUserFunction($vetoFunction, $hookParameters, $this);
            if ($verdict === false) {
                continue;
            }

            // the first veto wins, later hooks are not executed
            return is_string($verdict)
                ? $verdict
                : 'Veto from hook "' . htmlspecialchars($hookName) . '"';
        }

        return false;
    }
426
427
    /**
     * Wrapper method for getUrlsForPageId()
     * It returns an array of configurations and no urls!
     *
     * @param array $pageRow Page record with at least dok-type and uid columns.
     * @param string $skipMessage Filled by reference: the skip reason, or '' when the page is not skipped
     * @return array configurations for the page, empty when the page is skipped
     * @see getUrlsForPageId()
     */
    public function getUrlsForPageRow(array $pageRow, &$skipMessage = '')
    {
        $message = $this->checkIfPageShouldBeSkipped($pageRow);
        if ($message !== false) {
            $skipMessage = $message;

            return [];
        }

        // getUrlsForPageId() is typed "int" and this file runs with strict
        // types, so a uid delivered as string would raise a TypeError.
        // urlListFromUrlArray() casts the uid the same way.
        $res = $this->getUrlsForPageId((int) $pageRow['uid']);
        $skipMessage = '';

        return $res;
    }
449
450
    /**
     * Creates a list of URLs from input array (and submits them to queue if asked for)
     * See Web > Info module script + "indexed_search"'s crawler hook-client using this!
     *
     * @param array $vv Information about URLs from pageRow to crawl.
     * @param array $pageRow Page row
     * @param int $scheduledTime Unix time to schedule indexing to, typically time()
     * @param int $reqMinute Number of requests per minute (creates the interleave between requests); 0 disables the interleave
     * @param bool $submitCrawlUrls If set, submits the URLs to queue
     * @param bool $downloadCrawlUrls If set (and submitcrawlUrls is false) will fill $downloadUrls with entries)
     * @param array $duplicateTrack Array which is passed by reference and contains the an id per url to secure we will not crawl duplicates
     * @param array $downloadUrls Array which will be filled with URLS for download if flag is set.
     * @param array $incomingProcInstructions Array of processing instructions
     * @return string List of URLs (meant for display in backend module)
     */
    public function urlListFromUrlArray(
        array $vv,
        array $pageRow,
        $scheduledTime,
        $reqMinute,
        $submitCrawlUrls,
        $downloadCrawlUrls,
        array &$duplicateTrack,
        array &$downloadUrls,
        array $incomingProcInstructions
    ) {
        if (! is_array($vv['URLs'] ?? null)) {
            return 'ERROR - no URL generated';
        }
        $urlLog = [];
        $pageId = (int) $pageRow['uid'];
        $configurationHash = $this->getConfigurationHash($vv);
        $skipInnerCheck = $this->queueRepository->noUnprocessedQueueEntriesForPageWithConfigurationHashExist($pageId, $configurationHash);

        $urlService = new UrlService();

        // Loop-invariant values. Both sub configuration entries are optional
        // (page TSconfig sets need not define them), hence the fallbacks.
        $procInstrFilter = $vv['subCfg']['procInstrFilter'] ?? '';
        $userGroups = $vv['subCfg']['userGroups'] ?? '';
        // Seconds between two requests; guarded so that a rate of 0 does not
        // end in a division by zero.
        $requestsPerMinute = (float) $reqMinute;
        $secondsPerRequest = $requestsPerMinute !== 0.0 ? 60 / $requestsPerMinute : 0.0;

        foreach ($vv['URLs'] as $urlQuery) {
            if (! $this->drawURLs_PIfilter($procInstrFilter, $incomingProcInstructions)) {
                continue;
            }
            $url = (string) $urlService->getUrlFromPageAndQueryParameters(
                $pageId,
                $urlQuery,
                $vv['subCfg']['baseUrl'] ?? null,
                $vv['subCfg']['force_ssl'] ?? 0
            );

            // Create key by which to determine unique-ness:
            $uKey = $url . '|' . $userGroups . '|' . $procInstrFilter;

            if (isset($duplicateTrack[$uKey])) {
                // if the url key is registered just display it and do not resubmit it
                $urlLog[] = '<em><span class="text-muted">' . htmlspecialchars($url) . '</span></em>';
            } else {
                // Scheduled time, rounded down to the full minute:
                $schTime = $scheduledTime + round(count($duplicateTrack) * $secondsPerRequest);
                $schTime = intval($schTime / 60) * 60;
                $formattedDate = BackendUtility::datetime($schTime);
                $this->urlList[] = '[' . $formattedDate . '] ' . $url;
                $urlList = '[' . $formattedDate . '] ' . htmlspecialchars($url);

                // Submit for crawling!
                if ($submitCrawlUrls) {
                    // NOTE(review): the queue entry receives $scheduledTime, not the
                    // interleaved $schTime shown in the list - confirm this is intended.
                    $added = $this->addUrl(
                        $pageId,
                        $url,
                        $vv['subCfg'],
                        $scheduledTime,
                        $configurationHash,
                        $skipInnerCheck
                    );
                    if ($added === false) {
                        $urlList .= ' (URL already existed)';
                    }
                } elseif ($downloadCrawlUrls) {
                    $downloadUrls[$url] = $url;
                }
                $urlLog[] = $urlList;
            }
            $duplicateTrack[$uKey] = true;
        }

        return implode('<br>', $urlLog);
    }
534
535
    /**
     * Tells whether at least one incoming processing instruction is contained
     * in the given comma separated list. An empty set of incoming
     * instructions always passes.
     *
     * @param string $piString PI to test
     * @param array $incomingProcInstructions Processing instructions
     * @return boolean
     */
    public function drawURLs_PIfilter($piString, array $incomingProcInstructions)
    {
        // no instructions requested: nothing to filter on
        $accepted = empty($incomingProcInstructions);

        foreach ($incomingProcInstructions as $instruction) {
            if (GeneralUtility::inList($piString, $instruction)) {
                $accepted = true;
                break;
            }
        }

        return $accepted;
    }
555
556 5
    /**
     * Loads the page TSconfig for a page and lets the registered
     * "getPageTSconfigForId" hooks alter it.
     *
     * When a mount point ($this->MP) is set, the TSconfig is read for the id
     * found after the first "-" of that value instead of $id.
     *
     * @param int $id page uid
     * @return array page TSconfig, possibly modified by the hooks
     */
    public function getPageTSconfigForId($id): array
    {
        if (! $this->MP) {
            $pageTSconfig = BackendUtility::getPagesTSconfig($id);
        } else {
            // $this->MP is documented as bool but treated as a "<x>-<pageId>"
            // string here. The cast keeps explode() working under strict
            // types; a value without "-" falls back to page id 0.
            $mountPointParts = explode('-', (string) $this->MP);
            $mountPointId = (int) ($mountPointParts[1] ?? 0);
            $pageTSconfig = BackendUtility::getPagesTSconfig($mountPointId);
        }

        // Call a hook to alter configuration. The registration is optional,
        // so it is read with a fallback like the other hooks of this class.
        $hooks = $GLOBALS['TYPO3_CONF_VARS']['EXTCONF']['crawler']['getPageTSconfigForId'] ?? null;
        if (is_array($hooks)) {
            $params = [
                'pageId' => $id,
                // by reference, so that hooks can modify the configuration
                'pageTSConfig' => &$pageTSconfig,
            ];
            foreach ($hooks as $userFunc) {
                GeneralUtility::callUserFunction($userFunc, $params, $this);
            }
        }

        return $pageTSconfig;
    }
578
579
    /**
     * This methods returns an array of configurations.
     * Adds no urls!
     *
     * Configurations come from the "paramSets." of the page TSconfig first and
     * from the configuration records of the rootline second; a record does not
     * overwrite a set that is already present under the same key. The result
     * is finally passed by reference to every registered "processUrls" hook.
     *
     * @return array configurations keyed by name, each with 'subCfg', 'paramParsed', 'paramExpanded', 'URLs' and 'origin'
     */
    public function getUrlsForPageId(int $pageId): array
    {
        // Get page TSconfig for page ID
        $pageTSconfig = $this->getPageTSconfigForId($pageId);

        $res = [];

        // Fetch Crawler Configuration from pageTSconfig
        $crawlerCfg = $pageTSconfig['tx_crawler.']['crawlerCfg.']['paramSets.'] ?? [];
        foreach ($crawlerCfg as $key => $values) {
            if (! is_array($values)) {
                continue;
            }
            $key = str_replace('.', '', (string) $key);
            // Sub configuration for a single configuration string:
            $subCfg = (array) ($crawlerCfg[$key . '.'] ?? []);
            $subCfg['key'] = $key;

            if (strcmp($subCfg['procInstrFilter'] ?? '', '')) {
                $subCfg['procInstrFilter'] = implode(',', GeneralUtility::trimExplode(',', $subCfg['procInstrFilter']));
            }

            // "pidsOnly" is optional; an absent value means "not page-specific"
            $pidsOnly = (string) ($subCfg['pidsOnly'] ?? '');
            $pidOnlyList = implode(',', GeneralUtility::trimExplode(',', $pidsOnly, true));

            // process configuration if it is not page-specific or if the specific page is the current page:
            // TODO: Check if $pidOnlyList can be kept as Array instead of imploded
            if ($pidsOnly !== '' && ! GeneralUtility::inList($pidOnlyList, (string) $pageId)) {
                continue;
            }

            // Explode, process etc.:
            $res[$key] = [];
            $res[$key]['subCfg'] = $subCfg;
            // the parameter string itself may be missing when only the sub configuration is defined
            $res[$key]['paramParsed'] = GeneralUtility::explodeUrl2Array((string) ($crawlerCfg[$key] ?? ''));
            $res[$key]['paramExpanded'] = $this->expandParameters($res[$key]['paramParsed'], $pageId);
            $res[$key]['origin'] = 'pagets';

            // recognize MP value
            if (! $this->MP) {
                $res[$key]['URLs'] = $this->compileUrls($res[$key]['paramExpanded'], ['?id=' . $pageId]);
            } else {
                $res[$key]['URLs'] = $this->compileUrls($res[$key]['paramExpanded'], ['?id=' . $pageId . '&MP=' . $this->MP]);
            }
        }

        // Get configuration from tx_crawler_configuration records up the rootline
        $crawlerConfigurations = $this->configurationRepository->getCrawlerConfigurationRecordsFromRootLine($pageId);
        foreach ($crawlerConfigurations as $configurationRecord) {
            // check access to the configuration record
            $hasAccess = empty($configurationRecord['begroups'])
                || $this->getBackendUser()->isAdmin()
                || $this->hasGroupAccess($this->getBackendUser()->user['usergroup_cached_list'], $configurationRecord['begroups']);
            if (! $hasAccess) {
                continue;
            }

            // cast like the TSconfig branch above: strcmp() rejects null under strict types
            $pidsOnly = (string) ($configurationRecord['pidsonly'] ?? '');
            $pidOnlyList = implode(',', GeneralUtility::trimExplode(',', $pidsOnly, true));

            // process configuration if it is not page-specific or if the specific page is the current page:
            // TODO: Check if $pidOnlyList can be kept as Array instead of imploded
            if ($pidsOnly !== '' && ! GeneralUtility::inList($pidOnlyList, (string) $pageId)) {
                continue;
            }

            $key = $configurationRecord['name'];

            // don't overwrite previously defined paramSets
            if (isset($res[$key])) {
                continue;
            }

            /* @var $TSparserObject \TYPO3\CMS\Core\TypoScript\Parser\TypoScriptParser */
            $TSparserObject = GeneralUtility::makeInstance(TypoScriptParser::class);
            $TSparserObject->parse($configurationRecord['processing_instruction_parameters_ts']);

            $subCfg = [
                'procInstrFilter' => $configurationRecord['processing_instruction_filter'],
                'procInstrParams.' => $TSparserObject->setup,
                'baseUrl' => $configurationRecord['base_url'],
                'force_ssl' => (int) $configurationRecord['force_ssl'],
                'userGroups' => $configurationRecord['fegroups'],
                'exclude' => $configurationRecord['exclude'],
                'key' => $key,
            ];

            // pages listed in the record's exclude string get no configuration
            if (in_array($pageId, $this->expandExcludeString($subCfg['exclude']), true)) {
                continue;
            }

            $res[$key] = [];
            $res[$key]['subCfg'] = $subCfg;
            $res[$key]['paramParsed'] = GeneralUtility::explodeUrl2Array($configurationRecord['configuration']);
            $res[$key]['paramExpanded'] = $this->expandParameters($res[$key]['paramParsed'], $pageId);
            $res[$key]['URLs'] = $this->compileUrls($res[$key]['paramExpanded'], ['?id=' . $pageId]);
            $res[$key]['origin'] = 'tx_crawler_configuration_' . $configurationRecord['uid'];
        }

        foreach ($GLOBALS['TYPO3_CONF_VARS']['EXTCONF']['crawler']['processUrls'] ?? [] as $func) {
            $params = [
                // by reference, so that hooks can modify the result
                'res' => &$res,
            ];
            GeneralUtility::callUserFunction($func, $params, $this);
        }

        return $res;
    }
677
678
    /**
     * Collects the names of all crawler configurations that apply to a page branch.
     *
     * Names come from two sources: the "paramSets." entries found in the Page TSconfig
     * of $rootid, and the "name" column of tx_crawler_configuration records stored on a page
     * of the root line of $rootid or of the page tree below it (limited by the page "show"
     * permission clause of the current backend user). The list is not de-duplicated.
     *
     * TODO: Write Functional Tests
     *
     * @param int $rootid Uid of the page the branch starts at
     * @param int $depth Number of tree levels below $rootid to include
     * @return array List of configuration names
     */
    public function getConfigurationsForBranch(int $rootid, int $depth): array
    {
        $names = [];

        // Source 1: parameter sets from Page TSconfig. Only sub-arrays count; their
        // TypoScript keys carry a trailing dot which is not part of the name.
        $tsConfig = $this->getPageTSconfigForId($rootid);
        foreach ($tsConfig['tx_crawler.']['crawlerCfg.']['paramSets.'] ?? [] as $setName => $setConfiguration) {
            if (is_array($setConfiguration)) {
                $names[] = substr($setName, -1) === '.' ? substr($setName, 0, -1) : $setName;
            }
        }

        // Source 2: configuration records. First gather every page uid of the root line ...
        $pageUids = [];
        foreach (BackendUtility::BEgetRootLine($rootid) as $rootLinePage) {
            $pageUids[] = $rootLinePage['uid'];
        }

        // ... and of the sub tree.
        /* @var PageTreeView $pageTree */
        $pageTree = GeneralUtility::makeInstance(PageTreeView::class);
        $permissionClause = $this->getBackendUser()->getPagePermsClause(Permission::PAGE_SHOW);
        if (empty($permissionClause)) {
            $pageTree->init('');
        } else {
            $pageTree->init('AND ' . $permissionClause);
        }
        $pageTree->getTree($rootid, $depth, '');
        foreach ($pageTree->tree as $treeEntry) {
            $pageUids[] = $treeEntry['row']['uid'];
        }

        $queryBuilder = $this->getQueryBuilder('tx_crawler_configuration');
        $pidConstraint = $queryBuilder->expr()->in(
            'pid',
            $queryBuilder->createNamedParameter($pageUids, Connection::PARAM_INT_ARRAY)
        );
        $result = $queryBuilder
            ->select('name')
            ->from('tx_crawler_configuration')
            ->where($pidConstraint)
            ->execute();

        while ($record = $result->fetch()) {
            $names[] = $record['name'];
        }

        return $names;
    }
721
722
    /**
     * Check if a user has access to an item
     * (e.g. get the group list of the current logged in user from $GLOBALS['TSFE']->gr_list)
     *
     * An access list that is empty in the sense of PHP's empty() (which includes "0")
     * grants access to everybody.
     *
     * @param string $groupList Comma-separated list of (fe_)group UIDs from a user
     * @param string $accessList Comma-separated list of (fe_)group UIDs of the item to access
     * @return bool TRUE if at least one of the users group UIDs is in the access list or the access list is empty
     * @see \TYPO3\CMS\Frontend\Page\PageRepository::getMultipleGroupsWhereClause()
     */
    public function hasGroupAccess($groupList, $accessList)
    {
        if (empty($accessList)) {
            return true;
        }

        $accessList = (string) $accessList;
        foreach (GeneralUtility::intExplode(',', (string) $groupList) as $groupUid) {
            // intExplode() yields integers while inList() compares strings; cast explicitly
            // (as the other inList() call in this class does) so this file's strict_types
            // declaration can never turn the call into a TypeError.
            if (GeneralUtility::inList($accessList, (string) $groupUid)) {
                return true;
            }
        }

        return false;
    }
743
744
    /**
     * Will expand the parameters configuration to individual values. This follows a certain syntax of the value of each parameter.
     * Syntax of values:
     * - Basically: If the value is wrapped in [...] it will be expanded according to the following syntax, otherwise the value is taken literally
     * - Configuration is split by "|" and the parts are processed individually and finally added together
     * - For each configuration part:
     *         - "[int]-[int]" = Integer range, will be expanded to all values in between, values included, starting from low to high (max. 1000). Example "1-34" or "-40--30"
     *         - "_TABLE:[TCA table name];[_PID:[optional page id, default is current page]];[_ENABLELANG:1]" = Look up of table records from PID, filtering out deleted records. Example "_TABLE:tt_content; _PID:123"
     *           _ENABLELANG:1 picks only original records without their language overlays.
     *           Further optional sub-parameters read by the code: _RECURSIVE (tree depth), _PIDFIELD (default "pid"),
     *           _FIELD (default "uid"), _WHERE and _ADDTABLE.
     *         - Default: Literal value
     *
     * After every part the hook registered in
     * $GLOBALS['TYPO3_CONF_VARS']['SC_OPTIONS']['crawler/class.tx_crawler_lib.php']['expandParameters']
     * is called (if any) and may modify the parameter array by reference.
     *
     * @param array $paramArray Array with key (GET var name) and values (value of GET var which is configuration for expansion)
     * @param integer $pid Current page ID
     * @return array The input array with every value replaced by the list of its expanded values
     *
     * TODO: Write Functional Tests
     */
    public function expandParameters($paramArray, $pid)
    {
        // Traverse parameter names:
        foreach ($paramArray as $p => $v) {
            $v = trim($v);

            // Not encapsulated in square brackets: the literal value is the only value.
            if (strpos($v, '[') !== 0 || substr($v, -1) !== ']') {
                $paramArray[$p] = [$v];
                continue;
            }

            // Find the value inside the brackets and reset the paramArray value as an array.
            $v = substr($v, 1, -1);
            $paramArray[$p] = [];

            // Explode parts and traverse them:
            foreach (explode('|', $v) as $pV) {
                // Look for integer range: (fx. 1-34 or -40--30 // reads minus 40 to minus 30)
                if (preg_match('/^(-?[0-9]+)\s*-\s*(-?[0-9]+)$/', trim($pV), $reg)) {
                    $reg = $this->swapIfFirstIsLargerThanSecond($reg);

                    // Traverse range, add values - but never more than 1000 of them:
                    $runAwayBrake = 1000;
                    for ($a = $reg[1]; $a <= $reg[2]; $a++) {
                        $paramArray[$p][] = $a;
                        $runAwayBrake--;
                        if ($runAwayBrake <= 0) {
                            break;
                        }
                    }
                } elseif (strpos(trim($pV), '_TABLE:') === 0) {
                    // Parse "key:value" sub-parameters. A sub-parameter without ":value"
                    // is stored as null (so isset() treats it as absent) without raising
                    // an undefined-offset warning.
                    $subpartParams = [];
                    foreach (GeneralUtility::trimExplode(';', $pV) as $spV) {
                        $keyAndValue = GeneralUtility::trimExplode(':', $spV);
                        $subpartParams[$keyAndValue[0]] = $keyAndValue[1] ?? null;
                    }
                    $table = $subpartParams['_TABLE'] ?? '';

                    // Table exists:
                    if (isset($GLOBALS['TCA'][$table])) {
                        $lookUpPid = isset($subpartParams['_PID']) ? intval($subpartParams['_PID']) : intval($pid);
                        $recursiveDepth = isset($subpartParams['_RECURSIVE']) ? intval($subpartParams['_RECURSIVE']) : 0;
                        $pidField = isset($subpartParams['_PIDFIELD']) ? trim($subpartParams['_PIDFIELD']) : 'pid';
                        // NOTE(review): _WHERE and _ADDTABLE are passed to the query builder as
                        // raw SQL fragments; they must only ever come from trusted configuration.
                        $where = $subpartParams['_WHERE'] ?? '';
                        $addTable = $subpartParams['_ADDTABLE'] ?? '';

                        // All optional keys below are read with empty()/?? so that leaving
                        // them out does not trigger "undefined index" warnings.
                        $fieldName = ! empty($subpartParams['_FIELD']) ? $subpartParams['_FIELD'] : 'uid';
                        if ($fieldName === 'uid' || ! empty($GLOBALS['TCA'][$table]['columns'][$fieldName])) {
                            $queryBuilder = $this->getQueryBuilder($table);

                            if ($recursiveDepth > 0) {
                                /** @var \TYPO3\CMS\Core\Database\QueryGenerator $queryGenerator */
                                $queryGenerator = GeneralUtility::makeInstance(\TYPO3\CMS\Core\Database\QueryGenerator::class);
                                $pidList = $queryGenerator->getTreeList($lookUpPid, $recursiveDepth, 0, 1);
                                $pidArray = GeneralUtility::intExplode(',', $pidList);
                            } else {
                                $pidArray = [(string) $lookUpPid];
                            }

                            // Only deleted records are filtered out; all other default restrictions are removed.
                            $queryBuilder->getRestrictions()
                                ->removeAll()
                                ->add(GeneralUtility::makeInstance(DeletedRestriction::class));

                            $queryBuilder
                                ->select($fieldName)
                                ->from($table)
                                ->where(
                                    $queryBuilder->expr()->in($pidField, $queryBuilder->createNamedParameter($pidArray, Connection::PARAM_INT_ARRAY)),
                                    $where
                                );

                            if (! empty($addTable)) {
                                // TODO: Check if this works as intended!
                                $queryBuilder->add('from', $addTable);
                            }
                            $transOrigPointerField = $GLOBALS['TCA'][$table]['ctrl']['transOrigPointerField'] ?? '';

                            if (! empty($subpartParams['_ENABLELANG']) && $transOrigPointerField) {
                                $queryBuilder->andWhere(
                                    $queryBuilder->expr()->lte(
                                        $transOrigPointerField,
                                        0
                                    )
                                );
                            }

                            $statement = $queryBuilder->execute();

                            // Index by field value so that each value is collected only once.
                            $rows = [];
                            while ($row = $statement->fetch()) {
                                $rows[$row[$fieldName]] = $row;
                            }

                            $paramArray[$p] = array_merge($paramArray[$p], array_keys($rows));
                        }
                    }
                } else {
                    // Just add value:
                    $paramArray[$p][] = $pV;
                }

                // Hook for processing own expandParameters place holder
                $expandParametersHooks = $GLOBALS['TYPO3_CONF_VARS']['SC_OPTIONS']['crawler/class.tx_crawler_lib.php']['expandParameters'] ?? null;
                if (is_array($expandParametersHooks)) {
                    $_params = [
                        'pObj' => &$this,
                        'paramArray' => &$paramArray,
                        'currentKey' => $p,
                        'currentValue' => $pV,
                        'pid' => $pid,
                    ];
                    foreach ($expandParametersHooks as $_funcRef) {
                        GeneralUtility::callUserFunction($_funcRef, $_params, $this);
                    }
                }
            }

            // Make unique set of values and sort the parameter array by key (GET var name):
            $paramArray[$p] = array_unique($paramArray[$p]);
            ksort($paramArray);
        }

        return $paramArray;
    }
891
892
    /**
     * Compiling URLs from parameter array (output of expandParameters())
     * The number of URLs will be the multiplication of the number of parameter values for each key,
     * unless the limit in $this->maximumUrlsToCompile stops the compilation earlier.
     *
     * For each recursion level the first parameter is taken off $paramArray and every URL is
     * combined with every value of that parameter. Empty values add nothing to the URL.
     * Compilation of a level stops entirely as soon as the number of URLs built on that level
     * exceeds $this->maximumUrlsToCompile.
     *
     * @param array $paramArray Output of expandParameters(): Array with keys (GET var names) and for each an array of values
     * @param array $urls URLs accumulated in this array (for recursion)
     * @return array
     */
    public function compileUrls($paramArray, array $urls)
    {
        if (empty($paramArray)) {
            return $urls;
        }

        // Take the first parameter off the stack. unset() is used instead of array_shift()
        // because array_shift() renumbers numeric keys, which would corrupt numeric GET
        // variable names for the following recursion levels.
        reset($paramArray);
        $varName = key($paramArray);
        $valueSet = $paramArray[$varName];
        unset($paramArray[$varName]);

        // PHP turns numeric-string array keys into integers; rawurlencode() needs a string.
        $encodedVarName = rawurlencode((string) $varName);

        // Traverse value set:
        $newUrls = [];
        foreach ($urls as $url) {
            foreach ($valueSet as $val) {
                $value = (string) $val;
                $newUrls[] = $url . ($value === '' ? '' : '&' . $encodedVarName . '=' . rawurlencode($value));

                if (count($newUrls) > $this->maximumUrlsToCompile) {
                    // Leave BOTH loops; breaking only the inner one would let every
                    // remaining URL add yet another entry beyond the limit.
                    break 2;
                }
            }
        }

        return $this->compileUrls($paramArray, $newUrls);
    }
923
924
    /************************************
925
     *
926
     * Crawler log
927
     *
928
     ************************************/
929
930
    /**
     * Return array of records from crawler queue for input page ID
     *
     * The records are ordered by their "scheduled" time, newest first. With the filter
     * "pending" only entries with exec_time = 0 are selected, with "finished" only entries
     * with exec_time > 0; any other filter value selects all entries of the page.
     *
     * @param integer $id Page ID for which to look up log entries.
     * @param QueueFilter $queueFilter Filter deciding which entries are selected (and flushed)
     * @param boolean $doFlush If TRUE, the queue repository is asked to flush the queue for $queueFilter before the
     *                         entries are selected. Only the filter is handed over to the repository, not the page ID.
     * @param boolean $doFullFlush Has no effect: both values lead to the same flush call. Kept for backwards compatibility.
     * @param integer $itemsPerPage Limit the amount of entries per page default is 10; a value <= 0 disables the limit
     * @return array
     */
    public function getLogEntriesForPageId($id, QueueFilter $queueFilter, $doFlush = false, $doFullFlush = false, $itemsPerPage = 10)
    {
        $queryBuilder = GeneralUtility::makeInstance(ConnectionPool::class)->getQueryBuilderForTable($this->tableName);
        $queryBuilder
            ->select('*')
            ->from($this->tableName)
            ->where(
                $queryBuilder->expr()->eq('page_id', $queryBuilder->createNamedParameter($id, \PDO::PARAM_INT))
            )
            ->orderBy('scheduled', 'DESC');

        // The filter object is compared loosely against the known filter names.
        switch ($queueFilter) {
            case 'pending':
                $queryBuilder->andWhere($queryBuilder->expr()->eq('exec_time', 0));
                break;
            case 'finished':
                $queryBuilder->andWhere($queryBuilder->expr()->gt('exec_time', 0));
                break;
        }

        if ($doFlush) {
            $this->queueRepository->flushQueue($queueFilter);
        }

        if ($itemsPerPage > 0) {
            $queryBuilder->setMaxResults((int) $itemsPerPage);
        }

        return $queryBuilder->execute()->fetchAll();
    }
980
981
    /**
     * Return array of records from crawler queue for input set ID
     *
     * Without $doFlush the matching entries are returned, newest "scheduled" time first.
     * With $doFlush nothing is selected: the queue is flushed and an empty array is returned.
     *
     * @param int $set_id Set ID for which to look up log entries.
     * @param string $filter Filter: "all" => all entries, "pending" => all that is not yet run, "finished" => all complete ones
     * @param bool $doFlush If TRUE, then entries are DELETED(!) instead of selected!
     * @param bool $doFullFlush If TRUE (together with $doFlush), the flush is done without any constraint
     * @param int $itemsPerPage Limit the amount of entries per page default is 10
     * @return array
     *
     * @deprecated
     */
    public function getLogEntriesForSetId(int $set_id, string $filter = '', bool $doFlush = false, bool $doFullFlush = false, int $itemsPerPage = 10)
    {
        $queryBuilder = GeneralUtility::makeInstance(ConnectionPool::class)->getQueryBuilderForTable($this->tableName);
        $setIdConstraint = $queryBuilder->expr()->eq('set_id', $queryBuilder->createNamedParameter($set_id, \PDO::PARAM_INT));
        $queryBuilder
            ->select('*')
            ->from($this->tableName)
            ->where($setIdConstraint)
            ->orderBy('scheduled', 'DESC');

        $expressionBuilder = GeneralUtility::makeInstance(ConnectionPool::class)
            ->getConnectionForTable($this->tableName)
            ->getExpressionBuilder();

        // AND-composite collecting the same constraints as the select query; it is only
        // needed as the WHERE part of a (non-full) flush.
        $flushConstraints = $expressionBuilder->andX();

        if ($filter === 'pending') {
            $queryBuilder->andWhere($queryBuilder->expr()->eq('exec_time', 0));
            $flushConstraints->add($expressionBuilder->eq('exec_time', 0));
        } elseif ($filter === 'finished') {
            $queryBuilder->andWhere($queryBuilder->expr()->gt('exec_time', 0));
            $flushConstraints->add($expressionBuilder->gt('exec_time', 0));
        }

        if ($doFlush) {
            $addWhere = $flushConstraints->add($expressionBuilder->eq('set_id', (int) $set_id));
            $this->flushQueue($doFullFlush ? '' : $addWhere);

            return [];
        }

        if ($itemsPerPage > 0) {
            $queryBuilder->setMaxResults((int) $itemsPerPage);
        }

        return $queryBuilder->execute()->fetchAll();
    }
1033
1034
    /**
     * Adding call back entries to log (called from hooks typically, see indexed search class "class.crawler.php")
     *
     * The call back reference is stored inside the JSON encoded parameters under the key
     * "_CALLBACKOBJ" and a new row is inserted into tx_crawler_queue.
     *
     * @param integer $setId Set ID
     * @param array $params Parameters to pass to call back function (anything but an array is replaced by an empty array)
     * @param string $callBack Call back object reference, eg. 'EXT:indexed_search/class.crawler.php:&tx_indexedsearch_crawler'
     * @param integer $page_id Page ID to attach it to
     * @param integer $schedule Time at which to activate; 0 means "now"
     */
    public function addQueueEntry_callBack($setId, $params, $callBack, $page_id = 0, $schedule = 0): void
    {
        $parameters = is_array($params) ? $params : [];
        $parameters['_CALLBACKOBJ'] = $callBack;

        // Fall back to the current time when no (or a zero) schedule time is given.
        $scheduled = (int) $schedule;
        if (! $scheduled) {
            $scheduled = $this->getCurrentTime();
        }

        $queueRow = [
            'page_id' => (int) $page_id,
            'parameters' => json_encode($parameters),
            'scheduled' => $scheduled,
            'exec_time' => 0,
            'set_id' => (int) $setId,
            'result_data' => '',
        ];

        $queueConnection = GeneralUtility::makeInstance(ConnectionPool::class)->getConnectionForTable('tx_crawler_queue');
        $queueConnection->insert('tx_crawler_queue', $queueRow);
    }
1063
1064
    /************************************
1065
     *
1066
     * URL setting
1067
     *
1068
     ************************************/
1069
1070
    /**
     * Setting a URL for crawling:
     *
     * Builds the queue record for the URL and either registers it internally only
     * ($this->registerQueueEntriesInternallyOnly) or inserts it into tx_crawler_queue, unless
     * the duplication check finds an equal entry. A signal is emitted both when the URL was
     * added and when a duplicate was found.
     *
     * @param integer $id Page ID
     * @param string $url Complete URL
     * @param array $subCfg Sub configuration array (from TS config); the keys "userGroups", "procInstrFilter",
     *                      "procInstrParams." and "key" are read and may be missing
     * @param integer $tstamp Scheduled-time
     * @param string $configurationHash (optional) configuration hash
     * @param bool $skipInnerDuplicationCheck (optional) skip inner duplication check
     * @return bool TRUE only if a new row was inserted into the queue table
     */
    public function addUrl(
        $id,
        $url,
        array $subCfg,
        $tstamp,
        $configurationHash = '',
        $skipInnerDuplicationCheck = false
    ) {
        $urlAdded = false;
        $rows = [];

        // Creating parameters:
        $parameters = [
            'url' => $url,
        ];

        // fe user group simulation:
        // (optional sub configuration keys are read with ?? so that configurations which
        // omit them do not raise "undefined index" warnings)
        $uGs = implode(',', array_unique(GeneralUtility::intExplode(',', (string) ($subCfg['userGroups'] ?? ''), true)));
        if ($uGs) {
            $parameters['feUserGroupList'] = $uGs;
        }

        // Setting processing instructions
        $parameters['procInstructions'] = GeneralUtility::trimExplode(',', (string) ($subCfg['procInstrFilter'] ?? ''));
        if (is_array($subCfg['procInstrParams.'] ?? null)) {
            $parameters['procInstrParams'] = $subCfg['procInstrParams.'];
        }

        // Compile value array:
        $parameters_serialized = json_encode($parameters);
        $fieldArray = [
            'page_id' => (int) $id,
            'parameters' => $parameters_serialized,
            'parameters_hash' => GeneralUtility::shortMD5($parameters_serialized),
            'configuration_hash' => $configurationHash,
            'scheduled' => $tstamp,
            'exec_time' => 0,
            'set_id' => (int) $this->setID,
            'result_data' => '',
            'configuration' => $subCfg['key'] ?? null,
        ];

        if ($this->registerQueueEntriesInternallyOnly) {
            //the entries will only be registered and not stored to the database
            $this->queueEntries[] = $fieldArray;

            return $urlAdded;
        }

        if (! $skipInnerDuplicationCheck) {
            // check if there is already an equal entry
            $rows = $this->queueRepository->getDuplicateQueueItemsIfExists(
                (bool) ($this->extensionSettings['enableTimeslot'] ?? false),
                $tstamp,
                $this->getCurrentTime(),
                $fieldArray['page_id'],
                $fieldArray['parameters_hash']
            );
        }

        if (empty($rows)) {
            $connectionForCrawlerQueue = GeneralUtility::makeInstance(ConnectionPool::class)->getConnectionForTable('tx_crawler_queue');
            $connectionForCrawlerQueue->insert(
                'tx_crawler_queue',
                $fieldArray
            );
            $uid = $connectionForCrawlerQueue->lastInsertId('tx_crawler_queue', 'qid');
            $urlAdded = true;

            $signalPayload = ['uid' => $uid, 'fieldArray' => $fieldArray];
            SignalSlotUtility::emitSignal(
                self::class,
                SignalSlotUtility::SIGNAL_URL_ADDED_TO_QUEUE,
                $signalPayload
            );
        } else {
            $signalPayload = ['rows' => $rows, 'fieldArray' => $fieldArray];
            SignalSlotUtility::emitSignal(
                self::class,
                SignalSlotUtility::SIGNAL_DUPLICATE_URL_IN_QUEUE,
                $signalPayload
            );
        }

        return $urlAdded;
    }
1166
1167
    /**
     * Returns the current system time
     *
     * Thin wrapper around time(); the other methods of this class obtain "now" through it.
     *
     * @return int Current Unix timestamp in seconds
     */
    public function getCurrentTime()
    {
        $currentTimestamp = time();

        return $currentTimestamp;
    }
1176
1177
    /************************************
1178
     *
1179
     * URL reading
1180
     *
1181
     ************************************/
1182
1183
    /**
     * Read URL for single queue entry
     *
     * Loads the queue record, sets its exec_time (which locks it), lets the queue executor
     * process it and finally stores the JSON encoded result in the record. Signals are emitted
     * before the processing and before the result is written.
     *
     * @param integer $queueId
     * @param boolean $force If set, will process even if exec_time has been set!
     * @return integer|null Status bit mask: self::CLI_STATUS_POLLABLE_PROCESSED is set when a registered
     *                      "pollSuccess" instruction reported success, otherwise 0.
     *                      NULL (bare return) when no matching queue record was found.
     */
    public function readUrl($queueId, $force = false)
    {
        $queryBuilder = GeneralUtility::makeInstance(ConnectionPool::class)->getQueryBuilderForTable($this->tableName);
        $ret = 0;
        $this->logger->debug('crawler-readurl start ' . microtime(true));

        // Get entry:
        $queryBuilder
            ->select('*')
            ->from('tx_crawler_queue')
            ->where(
                $queryBuilder->expr()->eq('qid', $queryBuilder->createNamedParameter($queueId, \PDO::PARAM_INT))
            );
        if (! $force) {
            // Only entries that are not executed yet but already assigned to a process
            $queryBuilder
                ->andWhere('exec_time = 0')
                ->andWhere('process_scheduled > 0');
        }
        $queueRec = $queryBuilder->execute()->fetch();

        if (! is_array($queueRec)) {
            return;
        }

        SignalSlotUtility::emitSignal(
            self::class,
            SignalSlotUtility::SIGNAL_QUEUEITEM_PREPROCESS,
            [$queueId, &$queueRec]
        );

        // Set exec_time to lock record:
        $field_array = ['exec_time' => $this->getCurrentTime()];

        if (isset($this->processID)) {
            //if multiprocessing is used we need to store the id of the process which has handled this entry
            $field_array['process_id_completed'] = $this->processID;
        }

        GeneralUtility::makeInstance(ConnectionPool::class)->getConnectionForTable('tx_crawler_queue')
            ->update(
                'tx_crawler_queue',
                $field_array,
                ['qid' => (int) $queueId]
            );

        $result = $this->queueExecutor->executeQueueItem($queueRec, $this);

        // Decode the content returned by the executor. Without content there is no result
        // data; it stays NULL so the poll check below never indexes into a non-array.
        $resultData = null;
        $content = $result['content'] ?? null;
        if ($content !== null) {
            /** @var JsonCompatibilityConverter $jsonCompatibilityConverter */
            $jsonCompatibilityConverter = GeneralUtility::makeInstance(JsonCompatibilityConverter::class);
            $resultData = $jsonCompatibilityConverter->convert($content);
        }

        //atm there's no need to point to specific pollable extensions
        $pollables = $GLOBALS['TYPO3_CONF_VARS']['EXTCONF']['crawler']['pollSuccess'] ?? null;
        $procInstructions = is_array($resultData) ? ($resultData['parameters']['procInstructions'] ?? null) : null;
        if (is_array($pollables) && is_array($procInstructions)) {
            foreach ($pollables as $pollable) {
                // only check the success value if the instruction is running
                // it is important to name the pollSuccess key same as the procInstructions key
                if (in_array($pollable, $procInstructions, true) && ! empty($resultData['success'][$pollable])) {
                    $ret |= self::CLI_STATUS_POLLABLE_PROCESSED;
                }
            }
        }

        // Set result in log which also denotes the end of the processing of this entry.
        $field_array = ['result_data' => json_encode($result)];

        SignalSlotUtility::emitSignal(
            self::class,
            SignalSlotUtility::SIGNAL_QUEUEITEM_POSTPROCESS,
            [$queueId, &$field_array]
        );

        GeneralUtility::makeInstance(ConnectionPool::class)->getConnectionForTable('tx_crawler_queue')
            ->update(
                'tx_crawler_queue',
                $field_array,
                ['qid' => (int) $queueId]
            );

        $this->logger->debug('crawler-readurl stop ' . microtime(true));
        return $ret;
    }
1280
1281
    /**
     * Read URL for not-yet-inserted log-entry
     *
     * The entry is inserted into the queue table with exec_time already set, processed by
     * the queue executor right away, and its JSON encoded result is written back to the
     * inserted row (after the post-process signal had the chance to change it).
     *
     * @param array $field_array Queue field array,
     *
     * @return array|bool|mixed|string The result as returned by the queue executor
     */
    public function readUrlFromArray($field_array)
    {
        $queueTable = $this->tableName;
        $queueConnection = GeneralUtility::makeInstance(ConnectionPool::class)->getConnectionForTable($queueTable);

        // Set exec_time to lock record:
        $field_array['exec_time'] = $this->getCurrentTime();
        $queueConnection->insert($queueTable, $field_array);
        $queueId = $queueConnection->lastInsertId($queueTable, 'qid');

        $result = $this->queueExecutor->executeQueueItem($field_array, $this);

        // Set result in log which also denotes the end of the processing of this entry.
        $resultFields = ['result_data' => json_encode($result)];

        // Slots receive the fields by reference and may alter what gets stored.
        SignalSlotUtility::emitSignal(
            self::class,
            SignalSlotUtility::SIGNAL_QUEUEITEM_POSTPROCESS,
            [$queueId, &$resultFields]
        );

        $queueConnection->update($queueTable, $resultFields, ['qid' => $queueId]);

        return $result;
    }
1317
1318
    /*****************************
1319
     *
1320
     * Compiling URLs to crawl - tools
1321
     *
1322
     *****************************/
1323
1324
    /**
     * Walks the page tree below the given page and compiles the URL rows for every page found.
     *
     * The incoming arguments are stored on the controller so that drawURLs_addRowsForPage()
     * can use them while the rows are built. Pages of type "mount point" additionally get
     * the rows of the mounted page tree.
     *
     * @param int $id Root page id to start from.
     * @param int $depth Depth of tree, 0=only id-page, 1= on sublevel, 99 = infinite
     * @param int $scheduledTime Unix Time when the URL is timed to be visited when put in queue
     * @param int $reqMinute Number of requests per minute (creates the interleave between requests)
     * @param bool $submitCrawlUrls If set, submits the URLs to queue in database (real crawling)
     * @param bool $downloadCrawlUrls If set (and submitcrawlUrls is false) will fill $downloadUrls with entries)
     * @param array $incomingProcInstructions Array of processing instructions
     * @param array $configurationSelection Array of configuration keys
     * @return string Concatenated output of drawURLs_addRowsForPage() for all visited pages
     */
    public function getPageTreeAndUrls(
        $id,
        $depth,
        $scheduledTime,
        $reqMinute,
        $submitCrawlUrls,
        $downloadCrawlUrls,
        array $incomingProcInstructions,
        array $configurationSelection
    ) {
        $this->scheduledTime = $scheduledTime;
        $this->reqMinute = $reqMinute;
        $this->submitCrawlUrls = $submitCrawlUrls;
        $this->downloadCrawlUrls = $downloadCrawlUrls;
        $this->incomingProcInstructions = $incomingProcInstructions;
        $this->incomingConfigurationSelection = $configurationSelection;

        $this->duplicateTrack = [];
        $this->downloadUrls = [];

        // Drawing tree:
        /** @var PageTreeView $tree */
        $tree = GeneralUtility::makeInstance(PageTreeView::class);
        $perms_clause = $this->getBackendUser()->getPagePermsClause(Permission::PAGE_SHOW);
        $tree->init('AND ' . $perms_clause);

        $pageInfo = BackendUtility::readPageAccess($id, $perms_clause);
        if (is_array($pageInfo)) {
            // Set root row:
            $tree->tree[] = [
                'row' => $pageInfo,
                'HTML' => $this->iconFactory->getIconForRecord('pages', $pageInfo, Icon::SIZE_SMALL),
            ];
        }

        // Get branch beneath:
        if ($depth) {
            $tree->getTree($id, $depth, '');
        }

        // Traverse page tree:
        $code = '';

        foreach ($tree->tree as $data) {
            $this->MP = false;

            // Recognize mount points. The doktype is cast because a row value that arrives
            // as a numeric string would never be identical (===) to the integer constant.
            if ((int) $data['row']['doktype'] === PageRepository::DOKTYPE_MOUNTPOINT) {
                // NOTE(review): the result is read via index [0], i.e. getPage() is expected to
                // return a list of rows here - confirm against the page repository in use.
                $mountpage = $this->pageRepository->getPage($data['row']['uid']);

                // fetch mounted pages
                $this->MP = $mountpage[0]['mount_pid'] . '-' . $data['row']['uid'];

                $mountTree = GeneralUtility::makeInstance(PageTreeView::class);
                $mountTree->init('AND ' . $perms_clause);
                $mountTree->getTree($mountpage[0]['mount_pid'], $depth);

                foreach ($mountTree->tree as $mountData) {
                    $code .= $this->drawURLs_addRowsForPage(
                        $mountData['row'],
                        $mountData['HTML'] . BackendUtility::getRecordTitle('pages', $mountData['row'], true)
                    );
                }

                // replace page when mount_pid_ol is enabled
                if ($mountpage[0]['mount_pid_ol']) {
                    $data['row']['uid'] = $mountpage[0]['mount_pid'];
                } else {
                    // if the mount_pid_ol is not set the MP must not be used for the mountpoint page
                    $this->MP = false;
                }
            }

            $code .= $this->drawURLs_addRowsForPage(
                $data['row'],
                $data['HTML'] . BackendUtility::getRecordTitle('pages', $data['row'], true)
            );
        }

        return $code;
    }
1416
1417
    /**
     * Expands an exclude string into the list of page ids it denotes.
     *
     * The string is a comma separated list of entries of the form "pid" or "pid+depth":
     * "pid" alone covers only that page, a "+" without a number expands with depth 99.
     * Results (and the page trees fetched for them) are cached in static variables.
     *
     * @param string $excludeString Exclude string
     * @return array Page ids (integers) covered by the exclude string
     */
    public function expandExcludeString($excludeString)
    {
        // internal static caches;
        static $expandedExcludeStringCache;
        static $treeCache;

        // Normalise the cache key so that a missing (null) exclude setting is not used as array offset.
        $cacheKey = (string) $excludeString;

        if (empty($expandedExcludeStringCache[$cacheKey])) {
            $pidList = [];

            if (! empty($excludeString)) {
                /** @var PageTreeView $tree */
                $tree = GeneralUtility::makeInstance(PageTreeView::class);
                $tree->init('AND ' . $this->getBackendUser()->getPagePermsClause(Permission::PAGE_SHOW));

                $excludeParts = GeneralUtility::trimExplode(',', $excludeString);

                foreach ($excludeParts as $excludePart) {
                    // Pad to two elements: an entry without "+" has no depth part, and
                    // destructuring it directly would read an undefined offset.
                    [$pid, $depth] = array_pad(GeneralUtility::trimExplode('+', $excludePart), 2, '');
                    $pid = (int) $pid;

                    // default is "page only" = "depth=0"
                    if (empty($depth)) {
                        $depth = (stristr($excludePart, '+')) ? 99 : 0;
                    }
                    $depth = (int) $depth;

                    $pidList[] = $pid;

                    if ($depth > 0) {
                        if (empty($treeCache[$pid][$depth])) {
                            $tree->reset();
                            $tree->getTree($pid, $depth);
                            $treeCache[$pid][$depth] = $tree->tree;
                        }

                        foreach ($treeCache[$pid][$depth] as $data) {
                            $pidList[] = (int) $data['row']['uid'];
                        }
                    }
                }
            }

            $expandedExcludeStringCache[$cacheKey] = array_unique($pidList);
        }

        return $expandedExcludeStringCache[$cacheKey];
    }
1468
1469
    /**
     * Create the rows for display of the page tree
     * For each page a number of rows are shown displaying GET variable configuration
     *
     * @param array $pageRow Page record the rows are built for
     * @param string $pageTitle Markup for the title column; it is inserted as-is (not escaped here)
     * @return string The table rows (HTML) for the page
     */
    public function drawURLs_addRowsForPage(array $pageRow, string $pageTitle): string
    {
        $skipMessage = '';

        // Get list of configurations
        $configurations = $this->getUrlsForPageRow($pageRow, $skipMessage);

        if (! empty($this->incomingConfigurationSelection)) {
            // remove configuration that does not match the current selection
            foreach ($configurations as $confKey => $confArray) {
                if (! in_array($confKey, $this->incomingConfigurationSelection, true)) {
                    unset($configurations[$confKey]);
                }
            }
        }

        // Traverse parameter combinations:
        $c = 0;
        $content = '';
        if (! empty($configurations)) {
            foreach ($configurations as $confKey => $confArray) {
                $subCfg = $confArray['subCfg'] ?? [];

                // Title column: only the first row carries the title cell, spanning all rows
                $titleClm = $c === 0
                    ? '<td rowspan="' . count($configurations) . '">' . $pageTitle . '</td>'
                    : '';

                // The expanded exclude list holds integers, so the uid has to be cast for the
                // strict comparison; a uid delivered as numeric string would never match otherwise.
                $isExcluded = in_array(
                    (int) $pageRow['uid'],
                    $this->expandExcludeString($subCfg['exclude'] ?? ''),
                    true
                );

                if (! $isExcluded) {
                    // URL list:
                    $urlList = $this->urlListFromUrlArray(
                        $confArray,
                        $pageRow,
                        $this->scheduledTime,
                        $this->reqMinute,
                        $this->submitCrawlUrls,
                        $this->downloadCrawlUrls,
                        $this->duplicateTrack,
                        $this->downloadUrls,
                        // if empty the urls won't be filtered by processing instructions
                        $this->incomingProcInstructions
                    );

                    // Expanded parameters:
                    $paramExpanded = '';
                    $calcAccu = [];
                    $calcRes = 1;
                    foreach ($confArray['paramExpanded'] ?? [] as $gVar => $gVal) {
                        $paramExpanded .= '
                            <tr>
                                <td>' . htmlspecialchars('&' . $gVar . '=') . '<br/>' .
                            '(' . count($gVal) . ')' .
                            '</td>
                                <td nowrap="nowrap">' . nl2br(htmlspecialchars(implode(chr(10), $gVal))) . '</td>
                            </tr>
                        ';
                        $calcRes *= count($gVal);
                        $calcAccu[] = count($gVal);
                    }
                    $paramExpanded = '<table>' . $paramExpanded . '</table>';
                    $paramExpanded .= 'Comb: ' . implode('*', $calcAccu) . '=' . $calcRes;

                    // Options
                    $optionValues = '';
                    if (! empty($subCfg['userGroups'])) {
                        $optionValues .= 'User Groups: ' . $subCfg['userGroups'] . '<br/>';
                    }
                    if (! empty($subCfg['procInstrFilter'])) {
                        $optionValues .= 'ProcInstr: ' . $subCfg['procInstrFilter'] . '<br/>';
                    }

                    // Compile row. $confKey is cast because numeric array keys are integers
                    // and htmlspecialchars() only accepts strings under strict_types.
                    $content .= '
                        <tr>
                            ' . $titleClm . '
                            <td>' . htmlspecialchars((string) $confKey) . '</td>
                            <td>' . nl2br(htmlspecialchars(rawurldecode(trim(str_replace('&', chr(10) . '&', GeneralUtility::implodeArrayForUrl('', $confArray['paramParsed'] ?? [])))))) . '</td>
                            <td>' . $paramExpanded . '</td>
                            <td nowrap="nowrap">' . $urlList . '</td>
                            <td nowrap="nowrap">' . $optionValues . '</td>
                            <td nowrap="nowrap">' . DebugUtility::viewArray($subCfg['procInstrParams.'] ?? null) . '</td>
                        </tr>';
                } else {
                    $content .= '<tr>
                            ' . $titleClm . '
                            <td>' . htmlspecialchars((string) $confKey) . '</td>
                            <td colspan="5"><em>No entries</em> (Page is excluded in this configuration)</td>
                        </tr>';
                }

                $c++;
            }
        } else {
            $message = ! empty($skipMessage) ? ' (' . $skipMessage . ')' : '';

            // Compile row:
            $content .= '
                <tr>
                    <td>' . $pageTitle . '</td>
                    <td colspan="6"><em>No entries</em>' . $message . '</td>
                </tr>';
        }

        return $content;
    }
1580
1581
    /*****************************
1582
     *
1583
     * CLI functions
1584
     *
1585
     *****************************/
1586
1587
    /**
     * Running the functionality of the CLI (crawling URLs from queue)
     *
     * @param int $countInARun Passed to the queue repository when fetching the records to be crawled
     * @param int $sleepTime Microseconds to pause after each processed queue entry
     * @param int $sleepAfterFinish Seconds to pause once all fetched entries were handled
     * @return int Bitmask built from the readUrl() results and the CLI_STATUS_* constants
     */
    public function CLI_run(int $countInARun, int $sleepTime, int $sleepAfterFinish): int
    {
        $result = 0;
        $counter = 0;

        // Hooks first, then clean up the queue and pick the entries for this run
        $this->CLI_runHooks();
        $this->queueRepository->cleanupQueue();
        $rows = $this->queueRepository->fetchRecordsToBeCrawled($countInARun);

        if (empty($rows)) {
            $this->CLI_debug('Nothing within queue which needs to be processed (' . $this->CLI_buildProcessId() . ')');

            return $result;
        }

        $quidList = array_column($rows, 'qid');
        $processId = $this->CLI_buildProcessId();

        // save the number of assigned queue entries to determine how many have been processed later
        $numberOfAffectedRows = $this->queueRepository->updateProcessIdAndSchedulerForQueueIds($quidList, $processId);
        $this->processRepository->updateProcessAssignItemsCount($numberOfAffectedRows, $processId);

        if ($numberOfAffectedRows !== count($quidList)) {
            $this->CLI_debug('Nothing processed due to multi-process collision (' . $this->CLI_buildProcessId() . ')');

            return $result | self::CLI_STATUS_ABORTED;
        }

        foreach ($rows as $row) {
            $result |= $this->readUrl($row['qid']);
            $counter++;

            // Just to relax the system
            usleep($sleepTime);

            // If the crawler got disabled since the start we return right away;
            // the process is NOT marked as ended.
            if ($this->crawler->isDisabled()) {
                return $result | self::CLI_STATUS_ABORTED;
            }

            if (! $this->processRepository->isProcessActive($this->CLI_buildProcessId())) {
                // possible timeout
                $this->CLI_debug('conflict / timeout (' . $this->CLI_buildProcessId() . ')');
                $result |= self::CLI_STATUS_ABORTED;
                break;
            }
        }

        sleep($sleepAfterFinish);

        $this->CLI_debug('Rows: ' . $counter . ' (' . $this->CLI_buildProcessId() . ')');

        if ($counter > 0) {
            $result |= self::CLI_STATUS_PROCESSED;
        }

        return $result;
    }
1657
1658
    /**
     * Instantiates every class registered in
     * $GLOBALS['TYPO3_CONF_VARS']['EXTCONF']['crawler']['cli_hooks']
     * and calls its crawler_init() method with this controller.
     */
    public function CLI_runHooks(): void
    {
        $hookClassNames = $GLOBALS['TYPO3_CONF_VARS']['EXTCONF']['crawler']['cli_hooks'] ?? [];

        foreach ($hookClassNames as $hookClassName) {
            $hook = GeneralUtility::makeInstance($hookClassName);
            if (! is_object($hook)) {
                continue;
            }

            $hook->crawler_init($this);
        }
    }
1670
1671
    /**
     * Try to acquire a new process with the given id
     * also performs some auto-cleanup for orphan processes
     *
     * Active processes whose ttl lies in the past are treated as orphans and released;
     * all others count against the "processLimit" extension setting.
     *
     * @param string $id identification string for the process
     * @return boolean TRUE if the process was registered, FALSE if the system process id
     *                 is unavailable or the process limit is reached
     * @todo preemption might not be the most elegant way to clean up
     */
    public function CLI_checkAndAcquireNewProcess($id)
    {
        $systemProcessId = getmypid();
        if (! $systemProcessId) {
            return false;
        }

        $runningProcesses = 0;
        $expiredProcessIds = [];

        $activeProcesses = $this->processRepository->findAllActive();
        $currentTime = $this->getCurrentTime();

        /** @var Process $activeProcess */
        foreach ($activeProcesses as $activeProcess) {
            if ($activeProcess->getTtl() < $currentTime) {
                $expiredProcessIds[] = $activeProcess->getProcessId();
                continue;
            }

            $runningProcesses++;
        }

        $processLimit = (int) $this->extensionSettings['processLimit'];

        // Only register a new process while there are less than the allowed active processes
        $acquired = $runningProcesses < $processLimit;

        if ($acquired) {
            $this->CLI_debug('add process ' . $this->CLI_buildProcessId() . ' (' . ($runningProcesses + 1) . '/' . $processLimit . ')');

            $processRecord = [
                'process_id' => $id,
                'active' => 1,
                'ttl' => $currentTime + (int) $this->extensionSettings['processMaxRunTime'],
                'system_process_id' => $systemProcessId,
            ];
            GeneralUtility::makeInstance(ConnectionPool::class)
                ->getConnectionForTable('tx_crawler_process')
                ->insert('tx_crawler_process', $processRecord);
        } else {
            $this->CLI_debug('Processlimit reached (' . ($runningProcesses) . '/' . $processLimit . ')');
        }

        // The clean-up runs regardless of whether a process slot was acquired
        $this->processRepository->deleteProcessesMarkedAsDeleted();
        $this->CLI_releaseProcesses($expiredProcessIds);

        return $acquired;
    }
1725
1726
    /**
     * Release a process and the required resources
     *
     * Besides releasing the given ids, two clean-up statements are executed first:
     * queue entries that belong to inactive processes are detached from them, and
     * inactive processes that still own queue entries with exec_time = 0 get their
     * system_process_id reset.
     *
     * @param mixed $releaseIds string with a single process-id or array with multiple process-ids
     * @return boolean FALSE if there was nothing to release, TRUE otherwise
     */
    public function CLI_releaseProcesses($releaseIds)
    {
        if (! is_array($releaseIds)) {
            $releaseIds = [$releaseIds];
        }

        if (empty($releaseIds)) {
            //nothing to release
            return false;
        }

        // A QueryBuilder keeps internal state (query parts, bound parameters), so every
        // statement gets its own instance instead of resetting and re-using a single one.
        $connectionPool = GeneralUtility::makeInstance(ConnectionPool::class);

        // some kind of 2nd chance algo - this way you need at least 2 processes to have a real cleanup
        // this ensures that a single process can't mess up the entire process table

        // detach all queue entries which are assigned to a process that is not active
        $queueUpdate = $connectionPool->getQueryBuilderForTable($this->tableName);
        $queueUpdate
            ->update($this->tableName, 'q')
            ->where(
                'q.process_id IN(SELECT p.process_id FROM tx_crawler_process as p WHERE p.active = 0)'
            )
            ->set('q.process_scheduled', 0)
            ->set('q.process_id', '')
            ->execute();

        // NOTE(review): the original code carried a FIXME doubting that this statement is
        // equivalent to an earlier implementation; its SQL is kept unchanged here.
        $processUpdate = $connectionPool->getQueryBuilderForTable('tx_crawler_process');
        $processUpdate
            ->update('tx_crawler_process')
            ->where(
                $processUpdate->expr()->eq('active', 0),
                'process_id IN(SELECT q.process_id FROM tx_crawler_queue as q WHERE q.exec_time = 0)'
            )
            ->set('system_process_id', 0)
            ->execute();

        $this->processRepository->markRequestedProcessesAsNotActive($releaseIds);
        $this->queueRepository->unsetProcessScheduledAndProcessIdForQueueEntries($releaseIds);

        return true;
    }
1776
1777
    /**
     * Returns the id of the current process.
     *
     * The id is generated once (short MD5 of the current microtime) and stored in
     * $this->processID; every later call returns the stored value.
     *
     * @return string the ID
     */
    public function CLI_buildProcessId()
    {
        if ($this->processID) {
            return $this->processID;
        }

        $this->processID = GeneralUtility::shortMD5(microtime(true));

        return $this->processID;
    }
1789
1790
    /**
     * Writes a message followed by a line break to the output and flushes it,
     * but only if the "processDebug" extension setting is enabled.
     *
     * @param string $msg the message
     * @deprecated
     * @codeCoverageIgnore
     */
    public function CLI_debug($msg): void
    {
        $debugEnabled = (int) $this->extensionSettings['processDebug'];
        if (! $debugEnabled) {
            return;
        }

        echo $msg . "\n";
        flush();
    }
1804
1805
    /**
     * Cleans up entries that stayed for too long in the queue. These are:
     * - processed entries older than the "cleanUpProcessedAge" extension setting (in days)
     * - scheduled entries older than the "cleanUpScheduledAge" extension setting (in days)
     *
     * @deprecated
     */
    public function cleanUpOldQueueEntries(): void
    {
        // 24*60*60 Seconds in 24 hours
        $secondsPerDay = 86400;

        $maxProcessedAge = $this->extensionSettings['cleanUpProcessedAge'] * $secondsPerDay;
        $maxScheduledAge = $this->extensionSettings['cleanUpScheduledAge'] * $secondsPerDay;

        $now = time();
        $processedBefore = $now - $maxProcessedAge;
        $scheduledBefore = $now - $maxScheduledAge;

        $this->flushQueue(
            '(exec_time<>0 AND exec_time<' . $processedBefore . ') OR scheduled<=' . $scheduledBefore
        );
    }
1822
1823
    /**
     * Removes queue entries
     *
     * Before the entries are deleted, the SIGNAL_QUEUE_ENTRY_FLUSH signal is emitted once
     * per distinct set_id with the matching rows (qid, set_id) as "subSet" payload.
     *
     * @param string $where SQL related filter for the entries which should be removed;
     *                      it is used as raw SQL, an empty value removes all entries
     *
     * @deprecated
     */
    protected function flushQueue($where = ''): void
    {
        $realWhere = strlen((string) $where) > 0 ? $where : '1=1';

        // Every statement uses a fresh QueryBuilder: the builder keeps internal state,
        // so re-using one instance for several different queries is error-prone.
        $groups = $this->getQueryBuilder($this->tableName)
            ->selectLiteral('DISTINCT set_id')
            ->from($this->tableName)
            ->where($realWhere)
            ->execute()
            ->fetchAll();

        foreach ($groups as $group) {
            $subSetQuery = $this->getQueryBuilder($this->tableName);
            $subSet = $subSetQuery
                ->select('qid', 'set_id')
                ->from($this->tableName)
                ->where(
                    $realWhere,
                    // bind the value instead of concatenating it into the SQL
                    $subSetQuery->expr()->eq(
                        'set_id',
                        $subSetQuery->createNamedParameter((int) $group['set_id'], \PDO::PARAM_INT)
                    )
                )
                ->execute()
                ->fetchAll();

            $payLoad = ['subSet' => $subSet];
            SignalSlotUtility::emitSignal(
                self::class,
                SignalSlotUtility::SIGNAL_QUEUE_ENTRY_FLUSH,
                $payLoad
            );
        }

        $this->getQueryBuilder($this->tableName)
            ->delete($this->tableName)
            ->where($realWhere)
            ->execute();
    }
1868
1869
    /**
     * This method determines duplicates for a queue entry with the same parameters and this timestamp.
     * If the timestamp is in the past, it will check if there is any unprocessed queue entry in the past.
     * If the timestamp is in the future it will check, if the queued entry has exactly the same timestamp
     *
     * @param int $tstamp Scheduled time of the entry to check
     * @param array $fieldArray Queue fields; "page_id" and "parameters_hash" are used for the lookup
     *
     * @return array The qids of the duplicate queue entries
     * @deprecated
     */
    protected function getDuplicateRowsIfExist($tstamp, $fieldArray)
    {
        // The parameter is not type-hinted, so make sure only an integer ever reaches the SQL.
        $tstamp = (int) $tstamp;
        $currentTime = $this->getCurrentTime();

        $queryBuilder = GeneralUtility::makeInstance(ConnectionPool::class)->getQueryBuilderForTable($this->tableName);
        $queryBuilder
            ->select('qid')
            ->from('tx_crawler_queue');

        if ($tstamp <= $currentTime) {
            //if this entry is scheduled with "now"
            if ($this->extensionSettings['enableTimeslot']) {
                $timeBegin = $currentTime - 100;
                $timeEnd = $currentTime + 100;
                $queryBuilder
                    ->where(
                        'scheduled BETWEEN ' . $timeBegin . ' AND ' . $timeEnd . ''
                    )
                    ->orWhere(
                        $queryBuilder->expr()->lte('scheduled', $currentTime)
                    );
            } else {
                $queryBuilder
                    ->where(
                        $queryBuilder->expr()->lte('scheduled', $currentTime)
                    );
            }
        } else {
            //entry with a timestamp in the future need to have the same schedule time
            $queryBuilder
                ->where(
                    $queryBuilder->expr()->eq(
                        'scheduled',
                        $queryBuilder->createNamedParameter($tstamp, \PDO::PARAM_INT)
                    )
                );
        }

        // NOTE(review): "NOT exec_time" / "NOT process_id" rely on the database evaluating the
        // column value as a boolean - confirm this is intended for the (string) process_id column.
        $queryBuilder
            ->andWhere('NOT exec_time')
            ->andWhere('NOT process_id')
            ->andWhere($queryBuilder->expr()->eq('page_id', $queryBuilder->createNamedParameter($fieldArray['page_id'], \PDO::PARAM_INT)))
            ->andWhere($queryBuilder->expr()->eq('parameters_hash', $queryBuilder->createNamedParameter($fieldArray['parameters_hash'], \PDO::PARAM_STR)));

        return array_column($queryBuilder->execute()->fetchAll(), 'qid');
    }
1930
1931
    /**
     * Returns a md5 hash generated from a serialized configuration array.
     *
     * The "paramExpanded" and "URLs" entries are left out of the hash.
     *
     * @return string
     */
    protected function getConfigurationHash(array $configuration)
    {
        // $configuration is a copy, so dropping the entries does not affect the caller
        unset($configuration['paramExpanded'], $configuration['URLs']);

        $serializedConfiguration = serialize($configuration);

        return md5($serializedConfiguration);
    }
1942
1943
    /**
     * Builds the URL for a page and a query string by delegating to
     * UrlService::getUrlFromPageAndQueryParameters().
     *
     * @param int $httpsOrHttp see tx_crawler_configuration.force_ssl
     * @throws \TYPO3\CMS\Core\Exception\SiteNotFoundException
     * @throws \TYPO3\CMS\Core\Routing\InvalidRouteArgumentsException
     *
     * @deprecated Using CrawlerController::getUrlFromPageAndQueryParameters() is deprecated since 9.1.1 and will be removed in v11.x, please use UrlService->getUrlFromPageAndQueryParameters() instead.
     * @codeCoverageIgnore
     */
    protected function getUrlFromPageAndQueryParameters(int $pageId, string $queryString, ?string $alternativeBaseUrl, int $httpsOrHttp): UriInterface
    {
        return (new UrlService())->getUrlFromPageAndQueryParameters(
            $pageId,
            $queryString,
            $alternativeBaseUrl,
            $httpsOrHttp
        );
    }
1959
1960 1
    protected function swapIfFirstIsLargerThanSecond(array $reg): array
    {
        // Nothing to do while the value at index 1 does not exceed the one at index 2
        if (! ($reg[1] > $reg[2])) {
            return $reg;
        }

        // Exchange both values in place; all other entries stay untouched
        [$reg[1], $reg[2]] = [$reg[2], $reg[1]];

        return $reg;
    }
1971
1972
    /**
     * Returns the backend user, taking it from $GLOBALS['BE_USER'] on first use
     * and keeping it in $this->backendUser afterwards.
     *
     * @return BackendUserAuthentication
     */
    private function getBackendUser()
    {
        // Make sure the _cli_ user is loaded
        Bootstrap::initializeBackendAuthentication();

        if ($this->backendUser !== null) {
            return $this->backendUser;
        }

        $this->backendUser = $GLOBALS['BE_USER'];

        return $this->backendUser;
    }
1984
1985
    /**
     * Fetches a query builder for the given table from the connection pool.
     *
     * @return \TYPO3\CMS\Core\Database\Query\QueryBuilder
     */
    private function getQueryBuilder(string $table)
    {
        $connectionPool = GeneralUtility::makeInstance(ConnectionPool::class);

        return $connectionPool->getQueryBuilderForTable($table);
    }
1994
}
1995