Passed
Push — codeception ( 70a2c0...f5176b )
by Tomas Norre
06:16
created

CrawlerController::checkIfPageShouldBeSkipped()   F

Complexity

Conditions 14
Paths 360

Size

Total Lines 52
Code Lines 29

Duplication

Lines 0
Ratio 0 %

Code Coverage

Tests 20
CRAP Score 18.5707

Importance

Changes 1
Bugs 0 Features 0
Metric Value
cc 14
eloc 29
c 1
b 0
f 0
nc 360
nop 1
dl 0
loc 52
ccs 20
cts 28
cp 0.7143
crap 18.5707
rs 3.4333

How to fix   Long Method    Complexity   

Long Method

Small methods make your code easier to understand, in particular if combined with a good name. Besides, if your method is small, finding a good name is usually much easier.

For example, if you find yourself adding comments to a method's body, this is usually a good sign to extract the commented part to a new method, and use the comment as a starting point when coming up with a good name for this new method.

Commonly applied refactorings include:

1
<?php
2
3
declare(strict_types=1);
4
5
namespace AOE\Crawler\Controller;
6
7
/***************************************************************
8
 *  Copyright notice
9
 *
10
 *  (c) 2020 AOE GmbH <[email protected]>
11
 *
12
 *  All rights reserved
13
 *
14
 *  This script is part of the TYPO3 project. The TYPO3 project is
15
 *  free software; you can redistribute it and/or modify
16
 *  it under the terms of the GNU General Public License as published by
17
 *  the Free Software Foundation; either version 3 of the License, or
18
 *  (at your option) any later version.
19
 *
20
 *  The GNU General Public License can be found at
21
 *  http://www.gnu.org/copyleft/gpl.html.
22
 *
23
 *  This script is distributed in the hope that it will be useful,
24
 *  but WITHOUT ANY WARRANTY; without even the implied warranty of
25
 *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
26
 *  GNU General Public License for more details.
27
 *
28
 *  This copyright notice MUST APPEAR in all copies of the script!
29
 ***************************************************************/
30
31
use AOE\Crawler\Configuration\ExtensionConfigurationProvider;
32
use AOE\Crawler\Domain\Repository\ConfigurationRepository;
33
use AOE\Crawler\Domain\Repository\ProcessRepository;
34
use AOE\Crawler\Domain\Repository\QueueRepository;
35
use AOE\Crawler\Event\EventDispatcher;
36
use AOE\Crawler\QueueExecutor;
37
use AOE\Crawler\Utility\SignalSlotUtility;
38
use Psr\Http\Message\UriInterface;
39
use Psr\Log\LoggerAwareInterface;
40
use Psr\Log\LoggerAwareTrait;
41
use TYPO3\CMS\Backend\Tree\View\PageTreeView;
42
use TYPO3\CMS\Backend\Utility\BackendUtility;
43
use TYPO3\CMS\Core\Authentication\BackendUserAuthentication;
44
use TYPO3\CMS\Core\Compatibility\PublicMethodDeprecationTrait;
45
use TYPO3\CMS\Core\Core\Bootstrap;
46
use TYPO3\CMS\Core\Core\Environment;
47
use TYPO3\CMS\Core\Database\Connection;
48
use TYPO3\CMS\Core\Database\ConnectionPool;
49
use TYPO3\CMS\Core\Database\Query\Restriction\DeletedRestriction;
50
use TYPO3\CMS\Core\Http\Uri;
51
use TYPO3\CMS\Core\Imaging\Icon;
52
use TYPO3\CMS\Core\Imaging\IconFactory;
53
use TYPO3\CMS\Core\Routing\SiteMatcher;
54
use TYPO3\CMS\Core\Site\Entity\Site;
55
use TYPO3\CMS\Core\Type\Bitmask\Permission;
56
use TYPO3\CMS\Core\TypoScript\Parser\TypoScriptParser;
57
use TYPO3\CMS\Core\Utility\DebugUtility;
58
use TYPO3\CMS\Core\Utility\GeneralUtility;
59
use TYPO3\CMS\Core\Utility\MathUtility;
60
use TYPO3\CMS\Extbase\Object\ObjectManager;
61
use TYPO3\CMS\Frontend\Page\CacheHashCalculator;
62
use TYPO3\CMS\Frontend\Page\PageRepository;
63
64
/**
65
 * Class CrawlerController
66
 *
67
 * @package AOE\Crawler\Controller
68
 */
69
class CrawlerController implements LoggerAwareInterface
70
{
71
    use LoggerAwareTrait;
72
    use PublicMethodDeprecationTrait;
73
74
    public const CLI_STATUS_NOTHING_PROCCESSED = 0;
75
76
    public const CLI_STATUS_REMAIN = 1; //queue not empty
77
78
    public const CLI_STATUS_PROCESSED = 2; //(some) queue items where processed
79
80
    public const CLI_STATUS_ABORTED = 4; //instance didn't finish
81
82
    public const CLI_STATUS_POLLABLE_PROCESSED = 8;
83
84
    /**
85
     * @var integer
86
     */
87
    public $setID = 0;
88
89
    /**
90
     * @var string
91
     */
92
    public $processID = '';
93
94
    /**
95
     * @var array
96
     */
97
    public $duplicateTrack = [];
98
99
    /**
100
     * @var array
101
     */
102
    public $downloadUrls = [];
103
104
    /**
105
     * @var array
106
     */
107
    public $incomingProcInstructions = [];
108
109
    /**
110
     * @var array
111
     */
112
    public $incomingConfigurationSelection = [];
113
114
    /**
115
     * @var bool
116
     */
117
    public $registerQueueEntriesInternallyOnly = false;
118
119
    /**
120
     * @var array
121
     */
122
    public $queueEntries = [];
123
124
    /**
125
     * @var array
126
     */
127
    public $urlList = [];
128
129
    /**
130
     * @var array
131
     */
132
    public $extensionSettings = [];
133
134
    /**
135
     * Mount Point
136
     *
137
     * @var bool
138
     * Todo: Check what this is used for and adjust the type hint or code, as bool doesn't match the current code.
139
     */
140
    public $MP = false;
141
142
    /**
143
     * @var string
144
     */
145
    protected $processFilename;
146
147
    /**
148
     * Holds the internal access mode can be 'gui','cli' or 'cli_im'
149
     *
150
     * @var string
151
     */
152
    protected $accessMode;
153
154
    /**
155
     * @var QueueRepository
156
     */
157
    protected $queueRepository;
158
159
    /**
160
     * @var ProcessRepository
161
     */
162
    protected $processRepository;
163
164
    /**
165
     * @var ConfigurationRepository
166
     */
167
    protected $configurationRepository;
168
169
    /**
170
     * @var string
171
     */
172
    protected $tableName = 'tx_crawler_queue';
173
174
    /**
175
     * @var QueueExecutor
176
     */
177
    protected $queueExecutor;
178
179
    /**
180
     * @var int
181
     */
182
    protected $maximumUrlsToCompile = 10000;
183
184
    /**
185
     * @var IconFactory
186
     */
187
    protected $iconFactory;
188
189
    /**
190
     * @var string[]
191
     */
192
    private $deprecatedPublicMethods = [
0 ignored issues
show
introduced by
The private property $deprecatedPublicMethods is not used, and could be removed.
Loading history...
193
        'getLogEntriesForSetId' => 'Using crawlerController::getLogEntriesForSetId() is deprecated since 9.0.1 and will be removed in v10.x',
194
        'flushQueue' => 'Using CrawlerController::flushQueue() is deprecated since 9.0.1 and will be removed in v10.x, please use QueueRepository->flushQueue() instead.',
195
        'cleanUpOldQueueEntries' => 'Using CrawlerController::cleanUpOldQueueEntries() is deprecated since 9.0.1 and will be removed in v10.x, please use QueueRepository->cleanUpOldQueueEntries() instead.',
196
    ];
197
198
    /**
199
     * @var BackendUserAuthentication|null
200
     */
201
    private $backendUser;
202
203
    /**
204
     * @var integer
205
     */
206
    private $scheduledTime = 0;
207
208
    /**
209
     * @var integer
210
     */
211
    private $reqMinute = 0;
212
213
    /**
214
     * @var bool
215
     */
216
    private $submitCrawlUrls = false;
217
218
    /**
219
     * @var bool
220
     */
221
    private $downloadCrawlUrls = false;
222
223
    /************************************
224
     *
225
     * Getting URLs based on Page TSconfig
226
     *
227
     ************************************/
228
229 41
    public function __construct()
    {
        // Resolve repositories and the queue executor through the Extbase object manager.
        $objectManager = GeneralUtility::makeInstance(ObjectManager::class);
        $this->queueRepository = $objectManager->get(QueueRepository::class);
        $this->processRepository = $objectManager->get(ProcessRepository::class);
        $this->configurationRepository = $objectManager->get(ConfigurationRepository::class);
        $this->queueExecutor = $objectManager->get(QueueExecutor::class);
        $this->iconFactory = GeneralUtility::makeInstance(IconFactory::class);

        // Existence of this file is what getDisabled() checks.
        $this->processFilename = Environment::getVarPath() . '/lock/tx_crawler.proc';

        /** @var ExtensionConfigurationProvider $configurationProvider */
        $configurationProvider = GeneralUtility::makeInstance(ExtensionConfigurationProvider::class);
        $settings = $configurationProvider->getExtensionConfiguration();
        $this->extensionSettings = is_array($settings) ? $settings : [];

        // Set defaults. Missing keys are read as 0 so the fallback values below apply
        // instead of triggering "undefined index" notices.
        if (MathUtility::convertToPositiveInteger($this->extensionSettings['countInARun'] ?? 0) === 0) {
            $this->extensionSettings['countInARun'] = 100;
        }

        $this->extensionSettings['processLimit'] = MathUtility::forceIntegerInRange(
            $this->extensionSettings['processLimit'] ?? 0,
            1,
            99,
            1
        );
        $this->maximumUrlsToCompile = MathUtility::forceIntegerInRange(
            $this->extensionSettings['maxCompileUrls'] ?? 0,
            1,
            1000000000,
            10000
        );
    }
253
254
    /**
255
     * Method to get the accessMode; can be gui, cli or cli_im
256
     *
257
     * @return string
258
     */
259 1
    public function getAccessMode()
    {
        // Plain accessor for the access mode stored by setAccessMode().
        return $this->accessMode;
    }
263
264
    /**
265
     * @param string $accessMode
266
     */
267 1
    public function setAccessMode($accessMode): void
    {
        // Stored as given; no validation of the value happens here.
        $this->accessMode = $accessMode;
    }
271
272
    /**
273
     * Set disabled status to prevent processes from being processed
274
     *
275
     * @param bool $disabled (optional, defaults to true)
276
     */
277 2
    public function setDisabled($disabled = true): void
    {
        // Disabling: create (or truncate) the marker file that getDisabled() checks for.
        if ($disabled) {
            GeneralUtility::writeFile($this->processFilename, '');
            return;
        }

        // Enabling: remove the marker file if it is present.
        if (is_file($this->processFilename)) {
            unlink($this->processFilename);
        }
    }
287
288
    /**
289
     * Get disable status
290
     *
291
     * @return bool true if disabled
292
     */
293 2
    public function getDisabled()
    {
        // The crawler counts as disabled exactly when the marker file exists.
        $markerFile = $this->processFilename;

        return is_file($markerFile);
    }
297
298
    /**
299
     * @param string $filenameWithPath
300
     */
301 3
    public function setProcessFilename($filenameWithPath): void
    {
        // Overrides the marker-file path that the constructor derived from the var path.
        $this->processFilename = $filenameWithPath;
    }
305
306
    /**
307
     * @return string
308
     */
309 1
    public function getProcessFilename()
    {
        // Path of the marker file used by setDisabled() / getDisabled().
        return $this->processFilename;
    }
313
314
    /**
315
     * Sets the extensions settings (unserialized pendant of $TYPO3_CONF_VARS['EXT']['extConf']['crawler']).
316
     */
317 12
    public function setExtensionSettings(array $extensionSettings): void
    {
        // Replaces the settings wholesale; the defaults applied in the constructor are not re-applied.
        $this->extensionSettings = $extensionSettings;
    }
321
322
    /**
323
     * Check if the given page should be crawled
324
     *
325
     * @return false|string false if the page should be crawled (not excluded), true / skipMessage if it should be skipped
326
     */
327 8
    public function checkIfPageShouldBeSkipped(array $pageRow)
    {
        // Each check returns its skip message as soon as it matches; falling through
        // all of them means the page should be crawled (false).

        // Hidden pages are skipped unless "crawlHiddenPages" is enabled.
        // empty() keeps the original truthiness semantics and tolerates missing keys.
        if (empty($this->extensionSettings['crawlHiddenPages']) && ! empty($pageRow['hidden'])) {
            return 'Because page is hidden';
        }

        // Doktypes 3 and 4 and everything from 199 upwards are never crawled.
        if (GeneralUtility::inList('3,4', $pageRow['doktype']) || $pageRow['doktype'] >= 199) {
            return 'Because doktype is not allowed';
        }

        // Doktypes excluded through the extension configuration array.
        foreach ($GLOBALS['TYPO3_CONF_VARS']['EXTCONF']['crawler']['excludeDoktype'] ?? [] as $key => $doktypeList) {
            if (GeneralUtility::inList($doktypeList, $pageRow['doktype'])) {
                return 'Doktype was excluded by "' . $key . '"';
            }
        }

        // Veto hooks: a hook returns false if the page is ok, anything else vetoes it.
        foreach ($GLOBALS['TYPO3_CONF_VARS']['EXTCONF']['crawler']['pageVeto'] ?? [] as $key => $func) {
            $params = [
                'pageRow' => $pageRow,
            ];
            $veto = GeneralUtility::callUserFunction($func, $params, $this);
            if ($veto === false) {
                continue;
            }

            // The first veto wins; no need to execute further hooks.
            if (is_string($veto)) {
                return $veto;
            }

            // Hooks registered with "[] = ..." have integer keys; the cast avoids a
            // TypeError from htmlspecialchars() under strict_types.
            return 'Veto from hook "' . htmlspecialchars((string) $key) . '"';
        }

        return false;
    }
380
381
    /**
382
     * Wrapper method for getUrlsForPageId()
383
     * It returns an array of configurations and no urls!
384
     *
385
     * @param array $pageRow Page record with at least dok-type and uid columns.
386
     * @param string $skipMessage
387
     * @return array
388
     * @see getUrlsForPageId()
389
     */
390 4
    public function getUrlsForPageRow(array $pageRow, &$skipMessage = '')
    {
        $message = $this->checkIfPageShouldBeSkipped($pageRow);

        // Page is skipped: hand the reason back through the by-reference argument.
        if ($message !== false) {
            $skipMessage = $message;

            return [];
        }

        // Page rows may carry the uid as a string; getUrlsForPageId() is typed int and
        // this file runs under strict_types, so cast explicitly (as urlListFromUrlArray() does).
        $res = $this->getUrlsForPageId((int) $pageRow['uid']);
        $skipMessage = '';

        return $res;
    }
404
405
    /**
406
     * Creates a list of URLs from input array (and submits them to queue if asked for)
407
     * See Web > Info module script + "indexed_search"'s crawler hook-client using this!
408
     *
409
     * @param array $vv Information about URLs from pageRow to crawl.
410
     * @param array $pageRow Page row
411
     * @param int $scheduledTime Unix time to schedule indexing to, typically time()
412
     * @param int $reqMinute Number of requests per minute (creates the interleave between requests)
413
     * @param bool $submitCrawlUrls If set, submits the URLs to queue
414
     * @param bool $downloadCrawlUrls If set (and submitcrawlUrls is false) will fill $downloadUrls with entries)
415
     * @param array $duplicateTrack Array which is passed by reference and contains an id per URL to ensure we do not crawl duplicates
416
     * @param array $downloadUrls Array which will be filled with URLS for download if flag is set.
417
     * @param array $incomingProcInstructions Array of processing instructions
418
     * @return string List of URLs (meant for display in backend module)
419
     */
420 2
    public function urlListFromUrlArray(
        array $vv,
        array $pageRow,
        $scheduledTime,
        $reqMinute,
        $submitCrawlUrls,
        $downloadCrawlUrls,
        array &$duplicateTrack,
        array &$downloadUrls,
        array $incomingProcInstructions
    ) {
        if (! is_array($vv['URLs'] ?? null)) {
            return 'ERROR - no URL generated';
        }
        $urlLog = [];
        $pageId = (int) $pageRow['uid'];
        $configurationHash = $this->getConfigurationHash($vv);
        $skipInnerCheck = $this->queueRepository->noUnprocessedQueueEntriesForPageWithConfigurationHashExist($pageId, $configurationHash);

        // The processing-instruction filter does not depend on the individual URL,
        // so evaluate it once: if it rejects this configuration, nothing is listed.
        $procInstrFilter = $vv['subCfg']['procInstrFilter'] ?? '';
        if (! $this->drawURLs_PIfilter($procInstrFilter, $incomingProcInstructions)) {
            return '';
        }

        // Seconds between two scheduled requests. A non-positive rate means
        // "no interleave" instead of a division by zero.
        $secondsPerRequest = $reqMinute > 0 ? 60 / $reqMinute : 0;

        foreach ($vv['URLs'] as $urlQuery) {
            $url = (string) $this->getUrlFromPageAndQueryParameters(
                $pageId,
                $urlQuery,
                $vv['subCfg']['baseUrl'] ?? null,
                $vv['subCfg']['force_ssl'] ?? 0
            );

            // Create key by which to determine unique-ness:
            $uKey = $url . '|' . ($vv['subCfg']['userGroups'] ?? '') . '|' . $procInstrFilter;

            if (isset($duplicateTrack[$uKey])) {
                // The URL key is already registered: just display it, do not resubmit it.
                $urlLog[] = '<em><span class="text-muted">' . htmlspecialchars($url) . '</span></em>';
            } else {
                // Scheduled time, spread by the number of URLs tracked so far and
                // rounded down to a full minute:
                $schTime = $scheduledTime + round(count($duplicateTrack) * $secondsPerRequest);
                $schTime = intval($schTime / 60) * 60;
                $formattedDate = BackendUtility::datetime($schTime);
                $this->urlList[] = '[' . $formattedDate . '] ' . $url;
                $logLine = '[' . $formattedDate . '] ' . htmlspecialchars($url);

                // Submit for crawling!
                if ($submitCrawlUrls) {
                    // NOTE(review): the queue entry receives $scheduledTime, not the
                    // interleaved $schTime shown in the log line - confirm this is intended.
                    $added = $this->addUrl(
                        $pageId,
                        $url,
                        $vv['subCfg'],
                        $scheduledTime,
                        $configurationHash,
                        $skipInnerCheck
                    );
                    if ($added === false) {
                        $logLine .= ' (URL already existed)';
                    }
                } elseif ($downloadCrawlUrls) {
                    $downloadUrls[$url] = $url;
                }
                $urlLog[] = $logLine;
            }
            $duplicateTrack[$uKey] = true;
        }

        return implode('<br>', $urlLog);
    }
487
488
    /**
489
     * Returns true if input processing instruction is among registered ones.
490
     *
491
     * @param string $piString PI to test
492
     * @param array $incomingProcInstructions Processing instructions
493
     * @return boolean
494
     */
495 5
    public function drawURLs_PIfilter($piString, array $incomingProcInstructions)
    {
        // Without any incoming processing instructions everything passes the filter.
        $isAccepted = count($incomingProcInstructions) === 0;

        if (! $isAccepted) {
            // Otherwise at least one incoming instruction has to appear in the
            // comma-separated $piString.
            foreach ($incomingProcInstructions as $instruction) {
                if (GeneralUtility::inList($piString, $instruction)) {
                    $isAccepted = true;
                    break;
                }
            }
        }

        return $isAccepted;
    }
508
509 3
    public function getPageTSconfigForId($id): array
    {
        // Returns the page TSconfig for $id, or for the page id encoded in $this->MP
        // when a mount point is set, after giving registered hooks a chance to alter it.
        if (! $this->MP) {
            $pageTSconfig = BackendUtility::getPagesTSconfig($id);
        } else {
            // NOTE(review): MP is documented as bool but is split on "-" here, so it is
            // presumably a "<a>-<b>" string; the part after the dash is used as page id.
            // The casts keep explode()/getPagesTSconfig() from receiving wrong types and
            // a value without "-" falls back to 0 instead of an undefined offset.
            $mountPointParts = explode('-', (string) $this->MP);
            $mountPointId = (int) ($mountPointParts[1] ?? 0);
            $pageTSconfig = BackendUtility::getPagesTSconfig($mountPointId);
        }

        // Call a hook to alter configuration; hooks receive the TSconfig by reference.
        $hooks = $GLOBALS['TYPO3_CONF_VARS']['EXTCONF']['crawler']['getPageTSconfigForId'] ?? [];
        if (is_array($hooks)) {
            $params = [
                'pageId' => $id,
                'pageTSConfig' => &$pageTSconfig,
            ];
            foreach ($hooks as $userFunc) {
                GeneralUtility::callUserFunction($userFunc, $params, $this);
            }
        }

        return $pageTSconfig;
    }
531
532
    /**
533
     * This method returns an array of configurations.
534
     * Adds no urls!
535
     */
536 2
    public function getUrlsForPageId(int $pageId): array
    {
        // Collects crawler configurations for $pageId from two sources, keyed by
        // configuration name: page TSconfig "paramSets" first, then
        // tx_crawler_configuration records (which never overwrite a TSconfig entry).
        $pageTSconfig = $this->getPageTSconfigForId($pageId);

        $res = [];

        // Fetch Crawler Configuration from pageTSconfig
        $crawlerCfg = $pageTSconfig['tx_crawler.']['crawlerCfg.']['paramSets.'] ?? [];
        foreach ($crawlerCfg as $key => $values) {
            if (! is_array($values)) {
                continue;
            }
            $key = str_replace('.', '', (string) $key);
            // Sub configuration for a single configuration string:
            $subCfg = (array) $crawlerCfg[$key . '.'];
            $subCfg['key'] = $key;

            // Normalise the filter list. A missing "procInstrFilter" is treated as empty
            // instead of passing null to a string function under strict_types.
            $procInstrFilter = (string) ($subCfg['procInstrFilter'] ?? '');
            if ($procInstrFilter !== '') {
                $subCfg['procInstrFilter'] = implode(',', GeneralUtility::trimExplode(',', $procInstrFilter));
            }

            // Process the configuration only if it is not page-specific or if the
            // current page is one of the listed pages.
            // TODO: Check if $pidOnlyList can be kept as Array instead of imploded
            $pidsOnly = (string) ($subCfg['pidsOnly'] ?? '');
            $pidOnlyList = implode(',', GeneralUtility::trimExplode(',', $pidsOnly, true));
            if ($pidsOnly !== '' && ! GeneralUtility::inList($pidOnlyList, strval($pageId))) {
                continue;
            }

            // Explode, process etc. (the key order of the entry is kept as before):
            $res[$key] = [];
            $res[$key]['subCfg'] = $subCfg;
            $res[$key]['paramParsed'] = GeneralUtility::explodeUrl2Array($crawlerCfg[$key] ?? '');
            $res[$key]['paramExpanded'] = $this->expandParameters($res[$key]['paramParsed'], $pageId);
            $res[$key]['origin'] = 'pagets';

            // recognize MP value
            $baseQuery = '?id=' . $pageId;
            if ($this->MP) {
                $baseQuery .= '&MP=' . $this->MP;
            }
            $res[$key]['URLs'] = $this->compileUrls($res[$key]['paramExpanded'], [$baseQuery]);
        }

        // Get configuration from tx_crawler_configuration records up the rootline
        $crawlerConfigurations = $this->configurationRepository->getCrawlerConfigurationRecordsFromRootLine($pageId);
        foreach ($crawlerConfigurations as $configurationRecord) {
            // check access to the configuration record
            $hasAccess = empty($configurationRecord['begroups'])
                || $this->getBackendUser()->isAdmin()
                || $this->hasGroupAccess($this->getBackendUser()->user['usergroup_cached_list'], $configurationRecord['begroups']);
            if (! $hasAccess) {
                continue;
            }

            // Same page-specific check as for the TSconfig entries above.
            // TODO: Check if $pidOnlyList can be kept as Array instead of imploded
            $pidsOnly = (string) $configurationRecord['pidsonly'];
            $pidOnlyList = implode(',', GeneralUtility::trimExplode(',', $pidsOnly, true));
            if ($pidsOnly !== '' && ! GeneralUtility::inList($pidOnlyList, strval($pageId))) {
                continue;
            }

            $key = $configurationRecord['name'];

            // don't overwrite previously defined paramSets
            if (isset($res[$key])) {
                continue;
            }

            /* @var $TSparserObject \TYPO3\CMS\Core\TypoScript\Parser\TypoScriptParser */
            $TSparserObject = GeneralUtility::makeInstance(TypoScriptParser::class);
            $TSparserObject->parse($configurationRecord['processing_instruction_parameters_ts']);

            $subCfg = [
                'procInstrFilter' => $configurationRecord['processing_instruction_filter'],
                'procInstrParams.' => $TSparserObject->setup,
                'baseUrl' => $configurationRecord['base_url'],
                'force_ssl' => (int) $configurationRecord['force_ssl'],
                'userGroups' => $configurationRecord['fegroups'],
                'exclude' => $configurationRecord['exclude'],
                'key' => $key,
            ];

            // Pages listed in the record's exclude string get no entry at all.
            if (in_array($pageId, $this->expandExcludeString($subCfg['exclude']), true)) {
                continue;
            }

            $res[$key] = [];
            $res[$key]['subCfg'] = $subCfg;
            $res[$key]['paramParsed'] = GeneralUtility::explodeUrl2Array($configurationRecord['configuration']);
            $res[$key]['paramExpanded'] = $this->expandParameters($res[$key]['paramParsed'], $pageId);
            $res[$key]['URLs'] = $this->compileUrls($res[$key]['paramExpanded'], ['?id=' . $pageId]);
            $res[$key]['origin'] = 'tx_crawler_configuration_' . $configurationRecord['uid'];
        }

        // Hooks may modify the collected result by reference.
        foreach ($GLOBALS['TYPO3_CONF_VARS']['EXTCONF']['crawler']['processUrls'] ?? [] as $func) {
            $params = [
                'res' => &$res,
            ];
            GeneralUtility::callUserFunction($func, $params, $this);
        }

        return $res;
    }
630
631
    /**
632
     * Find all configurations of subpages of a page
633
     * TODO: Write Functional Tests
634
     */
635 1
    public function getConfigurationsForBranch(int $rootid, int $depth): array
    {
        // Result: names of TSconfig paramSets on $rootid plus the names of all
        // tx_crawler_configuration records stored on the rootline or in the subtree.
        $configurationNames = [];

        $rootTSconfig = $this->getPageTSconfigForId($rootid);
        foreach ($rootTSconfig['tx_crawler.']['crawlerCfg.']['paramSets.'] ?? [] as $setKey => $setValue) {
            if (is_array($setValue)) {
                // Array-valued keys carry a trailing dot ("name."); strip exactly one.
                $configurationNames[] = substr($setKey, -1) === '.' ? substr($setKey, 0, -1) : $setKey;
            }
        }

        // Page ids to search: first the rootline of $rootid ...
        $pageIds = [];
        foreach (BackendUtility::BEgetRootLine($rootid) as $rootLinePage) {
            $pageIds[] = $rootLinePage['uid'];
        }

        // ... then every page of the subtree the backend user is allowed to see.
        /* @var PageTreeView $pageTree */
        $pageTree = GeneralUtility::makeInstance(PageTreeView::class);
        $permissionClause = $this->getBackendUser()->getPagePermsClause(Permission::PAGE_SHOW);
        $pageTree->init(empty($permissionClause) ? '' : ('AND ' . $permissionClause));
        $pageTree->getTree($rootid, $depth, '');
        foreach ($pageTree->tree as $treeNode) {
            $pageIds[] = $treeNode['row']['uid'];
        }

        $queryBuilder = $this->getQueryBuilder('tx_crawler_configuration');
        $pidConstraint = $queryBuilder->expr()->in(
            'pid',
            $queryBuilder->createNamedParameter($pageIds, Connection::PARAM_INT_ARRAY)
        );
        $statement = $queryBuilder
            ->select('name')
            ->from('tx_crawler_configuration')
            ->where($pidConstraint)
            ->execute();

        $record = $statement->fetch();
        while ($record) {
            $configurationNames[] = $record['name'];
            $record = $statement->fetch();
        }

        return $configurationNames;
    }
674
675
    /**
676
     * Check if a user has access to an item
677
     * (e.g. get the group list of the current logged in user from $GLOBALS['TSFE']->gr_list)
678
     *
679
     * @param string $groupList Comma-separated list of (fe_)group UIDs from a user
680
     * @param string $accessList Comma-separated list of (fe_)group UIDs of the item to access
681
     * @return bool                 TRUE if at least one of the users group UIDs is in the access list or the access list is empty
682
     * @see \TYPO3\CMS\Frontend\Page\PageRepository::getMultipleGroupsWhereClause()
683
     */
684 3
    public function hasGroupAccess($groupList, $accessList)
    {
        // An empty access list grants access to everybody.
        $accessGranted = empty($accessList);

        if (! $accessGranted) {
            // Otherwise one of the user's group uids has to be in the access list.
            $userGroupUids = GeneralUtility::intExplode(',', $groupList);
            foreach ($userGroupUids as $userGroupUid) {
                if (GeneralUtility::inList($accessList, $userGroupUid)) {
                    $accessGranted = true;
                    break;
                }
            }
        }

        return $accessGranted;
    }
696
697
    /**
     * Expands the parameter configuration into the individual values of each GET variable.
     *
     * Syntax of values:
     * - If the value is wrapped in [...] it is expanded according to the rules below, otherwise the value is taken literally
     * - The configuration is split by "|"; the parts are processed individually and their results are added together
     * - For each configuration part:
     *         - "[int]-[int]" = Integer range, expanded to all values in between, both ends included, from low to high (max. 1000 values). Example "1-34" or "-40--30"
     *         - "_TABLE:[TCA table name];[_PID:[optional page id, default is current page]];[_ENABLELANG:1]" = Look up of table records from PID, filtering out deleted records. Example "_TABLE:tt_content; _PID:123"
     *           _ENABLELANG:1 picks only original records without their language overlays.
     *           Further optional keys read here: _RECURSIVE, _PIDFIELD, _WHERE, _ADDTABLE and _FIELD (defaults to "uid").
     *         - Default: Literal value
     *
     * After each part the hooks registered in
     * $GLOBALS['TYPO3_CONF_VARS']['SC_OPTIONS']['crawler/class.tx_crawler_lib.php']['expandParameters'] are called.
     *
     * @param array $paramArray Array with key (GET var name) and values (value of GET var which is configuration for expansion)
     * @param integer $pid Current page ID
     * @return array Same keys as $paramArray, each one holding the array of expanded values
     *
     * TODO: Write Functional Tests
     */
    public function expandParameters($paramArray, $pid)
    {
        // Traverse parameter names:
        foreach ($paramArray as $p => $v) {
            $v = trim($v);

            // Only values wrapped in square brackets are expanded; everything else is kept literally.
            if (strpos($v, '[') === 0 && substr($v, -1) === ']') {
                // Strip the brackets and start collecting the expanded values for this parameter.
                $v = substr($v, 1, -1);
                $paramArray[$p] = [];

                // Explode parts and traverse them:
                $parts = explode('|', $v);
                foreach ($parts as $pV) {
                    // Look for integer range: (fx. 1-34 or -40--30 // reads minus 40 to minus 30)
                    if (preg_match('/^(-?[0-9]+)\s*-\s*(-?[0-9]+)$/', trim($pV), $reg)) {
                        $reg = $this->swapIfFirstIsLargerThanSecond($reg);

                        // Traverse range, add values:
                        $runAwayBrake = 1000; // Limit to size of range!
                        for ($a = $reg[1]; $a <= $reg[2]; $a++) {
                            $paramArray[$p][] = $a;
                            $runAwayBrake--;
                            if ($runAwayBrake <= 0) {
                                break;
                            }
                        }
                    } elseif (strpos(trim($pV), '_TABLE:') === 0) {
                        // Parse the ";"-separated "key:value" sub parameters.
                        $subparts = GeneralUtility::trimExplode(';', $pV);
                        $subpartParams = [];
                        foreach ($subparts as $spV) {
                            $keyAndValue = GeneralUtility::trimExplode(':', $spV);
                            // A sub part without ":" has no value; store null instead of reading a missing offset.
                            $subpartParams[$keyAndValue[0]] = $keyAndValue[1] ?? null;
                        }

                        // Table exists:
                        if (isset($GLOBALS['TCA'][$subpartParams['_TABLE']])) {
                            $lookUpPid = isset($subpartParams['_PID']) ? intval($subpartParams['_PID']) : intval($pid);
                            $recursiveDepth = isset($subpartParams['_RECURSIVE']) ? intval($subpartParams['_RECURSIVE']) : 0;
                            $pidField = isset($subpartParams['_PIDFIELD']) ? trim($subpartParams['_PIDFIELD']) : 'pid';
                            $where = $subpartParams['_WHERE'] ?? '';
                            $addTable = $subpartParams['_ADDTABLE'] ?? '';

                            // An absent or empty _FIELD falls back to the uid column.
                            $fieldName = ! empty($subpartParams['_FIELD']) ? $subpartParams['_FIELD'] : 'uid';
                            if ($fieldName === 'uid' || ! empty($GLOBALS['TCA'][$subpartParams['_TABLE']]['columns'][$fieldName])) {
                                $queryBuilder = $this->getQueryBuilder($subpartParams['_TABLE']);

                                if ($recursiveDepth > 0) {
                                    /** @var \TYPO3\CMS\Core\Database\QueryGenerator $queryGenerator */
                                    $queryGenerator = GeneralUtility::makeInstance(\TYPO3\CMS\Core\Database\QueryGenerator::class);
                                    $pidList = $queryGenerator->getTreeList($lookUpPid, $recursiveDepth, 0, 1);
                                    $pidArray = GeneralUtility::intExplode(',', $pidList);
                                } else {
                                    $pidArray = [(string) $lookUpPid];
                                }

                                // Only the "deleted" restriction stays active for the look up.
                                $queryBuilder->getRestrictions()
                                    ->removeAll()
                                    ->add(GeneralUtility::makeInstance(DeletedRestriction::class));

                                $queryBuilder
                                    ->select($fieldName)
                                    ->from($subpartParams['_TABLE'])
                                    ->where(
                                        $queryBuilder->expr()->in($pidField, $queryBuilder->createNamedParameter($pidArray, Connection::PARAM_INT_ARRAY)),
                                        $where
                                    );

                                if (! empty($addTable)) {
                                    // TODO: Check if this works as intended!
                                    $queryBuilder->add('from', $addTable);
                                }
                                $transOrigPointerField = $GLOBALS['TCA'][$subpartParams['_TABLE']]['ctrl']['transOrigPointerField'] ?? null;

                                // _ENABLELANG restricts the result to records that do not point to a translation original.
                                if (! empty($subpartParams['_ENABLELANG']) && $transOrigPointerField) {
                                    $queryBuilder->andWhere(
                                        $queryBuilder->expr()->lte(
                                            $transOrigPointerField,
                                            0
                                        )
                                    );
                                }

                                $statement = $queryBuilder->execute();

                                // Index by field value so that every value is added only once per part.
                                $rows = [];
                                while ($row = $statement->fetch()) {
                                    $rows[$row[$fieldName]] = $row;
                                }

                                $paramArray[$p] = array_merge($paramArray[$p], array_keys($rows));
                            }
                        }
                    } else { // Just add value:
                        $paramArray[$p][] = $pV;
                    }

                    // Hook for processing own expandParameters place holder
                    $expandParametersHooks = $GLOBALS['TYPO3_CONF_VARS']['SC_OPTIONS']['crawler/class.tx_crawler_lib.php']['expandParameters'] ?? null;
                    if (is_array($expandParametersHooks)) {
                        $_params = [
                            'pObj' => &$this,
                            'paramArray' => &$paramArray,
                            'currentKey' => $p,
                            'currentValue' => $pV,
                            'pid' => $pid,
                        ];
                        foreach ($expandParametersHooks as $_funcRef) {
                            GeneralUtility::callUserFunction($_funcRef, $_params, $this);
                        }
                    }
                }

                // Make unique set of values and sort array by key:
                // NOTE(review): ksort() orders the outer array by GET var name, not the value list - confirm this is intended.
                $paramArray[$p] = array_unique($paramArray[$p]);
                ksort($paramArray);
            } else {
                // Set the literal value as only value in array:
                $paramArray[$p] = [$v];
            }
        }

        return $paramArray;
    }
842
843
    /**
     * Compiling URLs from parameter array (output of expandParameters())
     * The number of URLs will be the multiplication of the number of parameter values for each key,
     * unless compilation of a parameter is stopped because more than $this->maximumUrlsToCompile URLs were collected.
     *
     * The method handles the first parameter of $paramArray and calls itself for the remaining ones.
     *
     * @param array $paramArray Output of expandParameters(): Array with keys (GET var names) and for each an array of values
     * @param array $urls URLs accumulated in this array (for recursion)
     * @return array
     */
    public function compileUrls($paramArray, array $urls)
    {
        if (empty($paramArray)) {
            return $urls;
        }

        // Take the first parameter off the stack. unset() is used instead of array_shift()
        // because array_shift() would renumber numeric GET var names of the remaining parameters.
        reset($paramArray);
        $varName = key($paramArray);
        $valueSet = $paramArray[$varName];
        unset($paramArray[$varName]);

        // Numeric array keys are integers in PHP; rawurlencode() requires a string under strict_types.
        $encodedVarName = rawurlencode((string) $varName);

        // Traverse value set:
        $newUrls = [];
        foreach ($urls as $url) {
            foreach ($valueSet as $val) {
                $val = (string) $val;
                // Empty values do not add a query parameter, the URL is kept as is.
                $newUrls[] = $url . ($val !== '' ? '&' . $encodedVarName . '=' . rawurlencode($val) : '');

                // Leave BOTH loops once the limit is exceeded; leaving only the inner loop
                // would let every further base URL add one more entry.
                if (count($newUrls) > $this->maximumUrlsToCompile) {
                    break 2;
                }
            }
        }

        return $this->compileUrls($paramArray, $newUrls);
    }
874
875
    /************************************
876
     *
877
     * Crawler log
878
     *
879
     ************************************/
880
881
    /**
     * Return array of records from crawler queue for input page ID
     *
     * The records are ordered by their "scheduled" value, newest first.
     *
     * @param integer $id Page ID for which to look up log entries.
     * @param string $filter Filter: "pending" => entries with exec_time = 0, "finished" => entries with exec_time > 0, any other value => no exec_time restriction
     * @param boolean $doFlush If TRUE, the queue repository is asked to flush the queue (using $filter) before the entries are selected
     * @param boolean $doFullFlush If TRUE (together with $doFlush), the flush is requested with the filter "all" instead of $filter
     * @param integer $itemsPerPage Limit the amount of entries per page default is 10; values <= 0 disable the limit
     * @return array
     */
    public function getLogEntriesForPageId($id, $filter = '', $doFlush = false, $doFullFlush = false, $itemsPerPage = 10)
    {
        $queryBuilder = GeneralUtility::makeInstance(ConnectionPool::class)->getQueryBuilderForTable($this->tableName);
        $queryBuilder
            ->select('*')
            ->from($this->tableName)
            ->where(
                $queryBuilder->expr()->eq('page_id', $queryBuilder->createNamedParameter($id, \PDO::PARAM_INT))
            )
            ->orderBy('scheduled', 'DESC');

        // Narrow the selection down by execution state.
        switch ($filter) {
            case 'pending':
                $queryBuilder->andWhere($queryBuilder->expr()->eq('exec_time', 0));
                break;
            case 'finished':
                $queryBuilder->andWhere($queryBuilder->expr()->gt('exec_time', 0));
                break;
        }

        if ($doFlush) {
            $this->queueRepository->flushQueue($doFullFlush ? 'all' : $filter);
        }

        if ($itemsPerPage > 0) {
            $queryBuilder->setMaxResults((int) $itemsPerPage);
        }

        return $queryBuilder->execute()->fetchAll();
    }
932
933
    /**
     * Return array of records from crawler queue for input set ID
     *
     * The records are ordered by their "scheduled" value, newest first.
     *
     * @param int $set_id Set ID for which to look up log entries.
     * @param string $filter Filter: "pending" => entries with exec_time = 0, "finished" => entries with exec_time > 0, any other value => no exec_time restriction
     * @param bool $doFlush If TRUE, the matching entries are flushed instead of selected and an empty array is returned
     * @param bool $doFullFlush If TRUE (together with $doFlush), flushQueue() is called with an empty condition instead of the filter/set condition
     * @param int $itemsPerPage Limit the amount of entries per page default is 10; values <= 0 disable the limit
     * @return array
     *
     * @deprecated
     */
    public function getLogEntriesForSetId(int $set_id, string $filter = '', bool $doFlush = false, bool $doFullFlush = false, int $itemsPerPage = 10)
    {
        $queryBuilder = GeneralUtility::makeInstance(ConnectionPool::class)->getQueryBuilderForTable($this->tableName);
        $queryBuilder
            ->select('*')
            ->from($this->tableName)
            ->where(
                $queryBuilder->expr()->eq('set_id', $queryBuilder->createNamedParameter($set_id, \PDO::PARAM_INT))
            )
            ->orderBy('scheduled', 'DESC');

        $expressionBuilder = GeneralUtility::makeInstance(ConnectionPool::class)
            ->getConnectionForTable($this->tableName)
            ->getExpressionBuilder();
        // AND-combined copy of the conditions; it is only handed over to flushQueue() below.
        $flushConstraints = $expressionBuilder->andX();

        switch ($filter) {
            case 'pending':
                $queryBuilder->andWhere($queryBuilder->expr()->eq('exec_time', 0));
                $flushConstraints->add($expressionBuilder->eq('exec_time', 0));
                break;
            case 'finished':
                $queryBuilder->andWhere($queryBuilder->expr()->gt('exec_time', 0));
                $flushConstraints->add($expressionBuilder->gt('exec_time', 0));
                break;
        }

        if ($doFlush) {
            $addWhere = $flushConstraints->add($expressionBuilder->eq('set_id', $set_id));
            $this->flushQueue($doFullFlush ? '' : $addWhere);
            return [];
        }

        if ($itemsPerPage > 0) {
            $queryBuilder->setMaxResults($itemsPerPage);
        }

        return $queryBuilder->execute()->fetchAll();
    }
985
986
    /**
     * Stores a call back entry in the crawler queue table (typically called from hooks, see indexed search class "class.crawler.php")
     *
     * The call back reference is stored inside the serialized parameters under the key "_CALLBACKOBJ".
     *
     * @param integer $setId Set ID
     * @param array $params Parameters to pass to call back function; anything that is not an array is replaced by an empty array
     * @param string $callBack Call back object reference, eg. 'EXT:indexed_search/class.crawler.php:&tx_indexedsearch_crawler'
     * @param integer $page_id Page ID to attach it to
     * @param integer $schedule Time at which to activate; a value that casts to 0 means "current time"
     */
    public function addQueueEntry_callBack($setId, $params, $callBack, $page_id = 0, $schedule = 0): void
    {
        $callBackParameters = is_array($params) ? $params : [];
        $callBackParameters['_CALLBACKOBJ'] = $callBack;

        $queueConnection = GeneralUtility::makeInstance(ConnectionPool::class)->getConnectionForTable('tx_crawler_queue');

        // Fall back to "now" when no schedule time was handed in.
        $scheduledAt = (int) $schedule;
        if (! $scheduledAt) {
            $scheduledAt = $this->getCurrentTime();
        }

        $queueRecord = [
            'page_id' => (int) $page_id,
            'parameters' => serialize($callBackParameters),
            'scheduled' => $scheduledAt,
            'exec_time' => 0,
            'set_id' => (int) $setId,
            'result_data' => '',
        ];

        $queueConnection->insert('tx_crawler_queue', $queueRecord);
    }
1015
1016
    /************************************
1017
     *
1018
     * URL setting
1019
     *
1020
     ************************************/
1021
1022
    /**
     * Setting a URL for crawling:
     *
     * Builds the queue record for the URL and either registers it internally only
     * ($this->registerQueueEntriesInternallyOnly) or inserts it into tx_crawler_queue, unless the
     * duplication check reports existing rows.
     *
     * @param integer $id Page ID
     * @param string $url Complete URL
     * @param array $subCfg Sub configuration array (from TS config); the keys "userGroups", "procInstrFilter", "procInstrParams." and "key" are read and may be absent
     * @param integer $tstamp Scheduled-time
     * @param string $configurationHash (optional) configuration hash
     * @param bool $skipInnerDuplicationCheck (optional) skip inner duplication check
     * @return bool TRUE only if a new row was inserted into the queue table
     */
    public function addUrl(
        $id,
        $url,
        array $subCfg,
        $tstamp,
        $configurationHash = '',
        $skipInnerDuplicationCheck = false
    ) {
        $urlAdded = false;
        $rows = [];

        // Creating parameters:
        $parameters = [
            'url' => $url,
        ];

        // fe user group simulation:
        $uGs = implode(',', array_unique(GeneralUtility::intExplode(',', $subCfg['userGroups'] ?? '', true)));
        if ($uGs) {
            $parameters['feUserGroupList'] = $uGs;
        }

        // Setting processing instructions
        $parameters['procInstructions'] = GeneralUtility::trimExplode(',', $subCfg['procInstrFilter'] ?? '');
        $procInstrParams = $subCfg['procInstrParams.'] ?? null;
        if (is_array($procInstrParams)) {
            $parameters['procInstrParams'] = $procInstrParams;
        }

        // Compile value array:
        $parameters_serialized = serialize($parameters);
        $fieldArray = [
            'page_id' => (int) $id,
            'parameters' => $parameters_serialized,
            'parameters_hash' => GeneralUtility::shortMD5($parameters_serialized),
            'configuration_hash' => $configurationHash,
            'scheduled' => $tstamp,
            'exec_time' => 0,
            'set_id' => (int) $this->setID,
            'result_data' => '',
            'configuration' => $subCfg['key'] ?? null,
        ];

        if ($this->registerQueueEntriesInternallyOnly) {
            //the entries will only be registered and not stored to the database
            $this->queueEntries[] = $fieldArray;
        } else {
            if (! $skipInnerDuplicationCheck) {
                // check if there is already an equal entry
                $rows = $this->getDuplicateRowsIfExist($tstamp, $fieldArray);
            }

            if (empty($rows)) {
                $connectionForCrawlerQueue = GeneralUtility::makeInstance(ConnectionPool::class)->getConnectionForTable('tx_crawler_queue');
                $connectionForCrawlerQueue->insert(
                    'tx_crawler_queue',
                    $fieldArray
                );
                $uid = $connectionForCrawlerQueue->lastInsertId('tx_crawler_queue', 'qid');
                $rows[] = $uid;
                $urlAdded = true;
                EventDispatcher::getInstance()->post('urlAddedToQueue', strval($this->setID), ['uid' => $uid, 'fieldArray' => $fieldArray]);
            } else {
                EventDispatcher::getInstance()->post('duplicateUrlInQueue', strval($this->setID), ['rows' => $rows, 'fieldArray' => $fieldArray]);
            }
        }

        return $urlAdded;
    }
1101
1102
    /**
     * Provides the current Unix timestamp as reported by PHP's time()
     *
     * @return int
     */
    public function getCurrentTime()
    {
        $now = time();

        return $now;
    }
1111
1112
    /************************************
1113
     *
1114
     * URL reading
1115
     *
1116
     ************************************/
1117
1118
    /**
     * Read URL for single queue entry
     *
     * Marks the entry as being executed (exec_time), lets the queue executor process it and stores the
     * serialized result in the "result_data" field. Signals are emitted before and after processing.
     *
     * @param integer $queueId
     * @param boolean $force If set, will process even if exec_time has been set!
     * @return integer|null Bit mask; self::CLI_STATUS_POLLABLE_PROCESSED is set when a registered pollable
     *                      processing instruction reported success. NULL if no matching queue entry was found.
     */
    public function readUrl($queueId, $force = false)
    {
        $queryBuilder = GeneralUtility::makeInstance(ConnectionPool::class)->getQueryBuilderForTable($this->tableName);
        $ret = 0;
        $this->logger->debug('crawler-readurl start ' . microtime(true));
        // Get entry:
        $queryBuilder
            ->select('*')
            ->from('tx_crawler_queue')
            ->where(
                $queryBuilder->expr()->eq('qid', $queryBuilder->createNamedParameter($queueId, \PDO::PARAM_INT))
            );
        if (! $force) {
            // Without $force only entries that were not executed yet and are assigned to a process are read.
            $queryBuilder
                ->andWhere('exec_time = 0')
                ->andWhere('process_scheduled > 0');
        }
        $queueRec = $queryBuilder->execute()->fetch();

        if (! is_array($queueRec)) {
            return;
        }

        SignalSlotUtility::emitSignal(
            self::class,
            SignalSlotUtility::SIGNAL_QUEUEITEM_PREPROCESS,
            [$queueId, &$queueRec]
        );

        // Set exec_time to lock record:
        $field_array = ['exec_time' => $this->getCurrentTime()];

        if (isset($this->processID)) {
            //if multiprocessing is used we need to store the id of the process which has handled this entry
            $field_array['process_id_completed'] = $this->processID;
        }

        GeneralUtility::makeInstance(ConnectionPool::class)->getConnectionForTable('tx_crawler_queue')
            ->update(
                'tx_crawler_queue',
                $field_array,
                ['qid' => (int) $queueId]
            );

        $result = $this->queueExecutor->executeQueueItem($queueRec, $this);
        $content = $result['content'] ?? null;
        if ($content === null) {
            $resultData = 'An errors happened';
        } else {
            // NOTE(review): unserialize() runs on content returned by the queue executor - confirm that
            // this content can never be controlled by an untrusted source.
            $resultData = unserialize($content);
        }

        // $resultData is only inspected when it really is an array: it is a plain string if no content
        // was returned and FALSE if unserialize() failed.
        $procInstructions = is_array($resultData) ? ($resultData['parameters']['procInstructions'] ?? null) : null;

        //atm there's no need to point to specific pollable extensions
        $pollables = $GLOBALS['TYPO3_CONF_VARS']['EXTCONF']['crawler']['pollSuccess'] ?? null;
        if (is_array($pollables) && is_array($procInstructions)) {
            foreach ($pollables as $pollable) {
                // only check the success value if the instruction is running
                // it is important to name the pollSuccess key same as the procInstructions key
                if (in_array($pollable, $procInstructions, true) && ! empty($resultData['success'][$pollable])) {
                    $ret |= self::CLI_STATUS_POLLABLE_PROCESSED;
                }
            }
        }

        // Set result in log which also denotes the end of the processing of this entry.
        $field_array = ['result_data' => serialize($result)];

        SignalSlotUtility::emitSignal(
            self::class,
            SignalSlotUtility::SIGNAL_QUEUEITEM_POSTPROCESS,
            [$queueId, &$field_array]
        );

        GeneralUtility::makeInstance(ConnectionPool::class)->getConnectionForTable('tx_crawler_queue')
            ->update(
                'tx_crawler_queue',
                $field_array,
                ['qid' => (int) $queueId]
            );

        $this->logger->debug('crawler-readurl stop ' . microtime(true));
        return $ret;
    }
1213
1214
    /**
     * Processes a queue entry that has not been stored yet
     *
     * The entry is inserted with the current time as exec_time, handed to the queue executor and finally
     * updated with the serialized result. The post-process signal is emitted before that update.
     *
     * @param array $field_array Queue field array,
     *
     * @return mixed The value returned by the queue executor.
     *               NOTE(review): formerly documented as string, static analysis reports array|bool - confirm.
     */
    public function readUrlFromArray($field_array)
    {
        // Set exec_time to lock record:
        $field_array['exec_time'] = $this->getCurrentTime();

        $queueConnection = GeneralUtility::makeInstance(ConnectionPool::class)->getConnectionForTable($this->tableName);
        $queueConnection->insert($this->tableName, $field_array);

        // The executor needs to know the id of the freshly created entry.
        $queueId = $queueConnection->lastInsertId($this->tableName, 'qid');
        $field_array['qid'] = $queueId;

        $result = $this->queueExecutor->executeQueueItem($field_array, $this);

        // Set result in log which also denotes the end of the processing of this entry.
        $field_array = ['result_data' => serialize($result)];

        $signalArguments = [$queueId, &$field_array];
        SignalSlotUtility::emitSignal(
            self::class,
            SignalSlotUtility::SIGNAL_QUEUEITEM_POSTPROCESS,
            $signalArguments
        );

        $queueConnection->update($this->tableName, $field_array, ['qid' => $queueId]);

        return $result;
    }
1251
1252
    /*****************************
1253
     *
1254
     * Compiling URLs to crawl - tools
1255
     *
1256
     *****************************/
1257
1258
    /**
     * Collects the rows for the page $id and, depending on $depth, the pages below it.
     *
     * The given settings are stored on the controller first. For mount point pages the mounted page tree
     * is traversed as well.
     *
     * @param integer $id Root page id to start from.
     * @param integer $depth Depth of tree, 0=only id-page, 1= on sublevel, 99 = infinite
     * @param integer $scheduledTime Unix Time when the URL is timed to be visited when put in queue
     * @param integer $reqMinute Number of requests per minute (creates the interleave between requests)
     * @param boolean $submitCrawlUrls If set, submits the URLs to queue in database (real crawling)
     * @param boolean $downloadCrawlUrls If set (and submitcrawlUrls is false) will fill $downloadUrls with entries)
     * @param array $incomingProcInstructions Array of processing instructions
     * @param array $configurationSelection Array of configuration keys
     * @return string Concatenated output of drawURLs_addRowsForPage() for all traversed pages
     */
    public function getPageTreeAndUrls(
        $id,
        $depth,
        $scheduledTime,
        $reqMinute,
        $submitCrawlUrls,
        $downloadCrawlUrls,
        array $incomingProcInstructions,
        array $configurationSelection
    ) {
        $this->scheduledTime = $scheduledTime;
        $this->reqMinute = $reqMinute;
        $this->submitCrawlUrls = $submitCrawlUrls;
        $this->downloadCrawlUrls = $downloadCrawlUrls;
        $this->incomingProcInstructions = $incomingProcInstructions;
        $this->incomingConfigurationSelection = $configurationSelection;

        $this->duplicateTrack = [];
        $this->downloadUrls = [];

        // Drawing tree:
        /* @var PageTreeView $tree */
        $tree = GeneralUtility::makeInstance(PageTreeView::class);
        $perms_clause = $this->getBackendUser()->getPagePermsClause(Permission::PAGE_SHOW);
        $tree->init('AND ' . $perms_clause);

        $pageInfo = BackendUtility::readPageAccess($id, $perms_clause);
        if (is_array($pageInfo)) {
            // Set root row:
            $tree->tree[] = [
                'row' => $pageInfo,
                'HTML' => $this->iconFactory->getIconForRecord('pages', $pageInfo, Icon::SIZE_SMALL),
            ];
        }

        // Get branch beneath:
        if ($depth) {
            $tree->getTree($id, $depth, '');
        }

        // Traverse page tree:
        $code = '';

        foreach ($tree->tree as $data) {
            $this->MP = false;

            // recognize mount points
            // The doktype is normalized to int first: database rows may carry it as numeric string,
            // which would never be identical (===) to the integer constant.
            if ((int) ($data['row']['doktype'] ?? 0) === PageRepository::DOKTYPE_MOUNTPOINT) {
                $queryBuilder = GeneralUtility::makeInstance(ConnectionPool::class)->getQueryBuilderForTable('pages');
                $queryBuilder->getRestrictions()->removeAll()->add(GeneralUtility::makeInstance(DeletedRestriction::class));
                $mountpage = $queryBuilder
                    ->select('*')
                    ->from('pages')
                    ->where(
                        $queryBuilder->expr()->eq('uid', $queryBuilder->createNamedParameter($data['row']['uid'], \PDO::PARAM_INT))
                    )
                    ->execute()
                    ->fetchAll();
                $queryBuilder->resetRestrictions();

                // fetch mounted pages
                $this->MP = $mountpage[0]['mount_pid'] . '-' . $data['row']['uid'];

                $mountTree = GeneralUtility::makeInstance(PageTreeView::class);
                $mountTree->init('AND ' . $perms_clause);
                $mountTree->getTree($mountpage[0]['mount_pid'], $depth);

                foreach ($mountTree->tree as $mountData) {
                    $code .= $this->drawURLs_addRowsForPage(
                        $mountData['row'],
                        $mountData['HTML'] . BackendUtility::getRecordTitle('pages', $mountData['row'], true)
                    );
                }

                // replace page when mount_pid_ol is enabled
                if ($mountpage[0]['mount_pid_ol']) {
                    $data['row']['uid'] = $mountpage[0]['mount_pid'];
                } else {
                    // if the mount_pid_ol is not set the MP must not be used for the mountpoint page
                    $this->MP = false;
                }
            }

            $code .= $this->drawURLs_addRowsForPage(
                $data['row'],
                $data['HTML'] . BackendUtility::getRecordTitle('pages', $data['row'], true)
            );
        }

        return $code;
    }
1360
1361
    /**
     * Expands exclude string
     *
     * The exclude string is a comma separated list of parts in the form "pid", "pid+" or "pid+depth":
     * "pid" excludes the page only, "pid+" uses a depth of 99 and "pid+depth" the given depth.
     * Non-empty results are cached per exclude string for the rest of the request.
     *
     * @param string $excludeString Exclude string
     * @return array Unique list of the page ids from the exclude string and, for parts with a depth, of the pages of their trees
     */
    public function expandExcludeString($excludeString)
    {
        // internal static caches;
        static $expandedExcludeStringCache;
        static $treeCache;

        if (empty($expandedExcludeStringCache[$excludeString])) {
            $pidList = [];

            if (! empty($excludeString)) {
                /** @var PageTreeView $tree */
                $tree = GeneralUtility::makeInstance(PageTreeView::class);
                $tree->init('AND ' . $this->getBackendUser()->getPagePermsClause(Permission::PAGE_SHOW));

                $excludeParts = GeneralUtility::trimExplode(',', $excludeString);

                foreach ($excludeParts as $excludePart) {
                    // A part without "+" consists of the page id only, so the depth may be missing.
                    $pidAndDepth = GeneralUtility::trimExplode('+', $excludePart);
                    $pid = $pidAndDepth[0];
                    $depth = $pidAndDepth[1] ?? null;

                    // default is "page only" = "depth=0"
                    if (empty($depth)) {
                        $depth = (stristr($excludePart, '+')) ? 99 : 0;
                    }

                    $pidList[] = $pid;

                    if ($depth > 0) {
                        // Page trees are cached per page id and depth.
                        if (empty($treeCache[$pid][$depth])) {
                            $tree->reset();
                            $tree->getTree($pid, $depth);
                            $treeCache[$pid][$depth] = $tree->tree;
                        }

                        foreach ($treeCache[$pid][$depth] as $data) {
                            $pidList[] = $data['row']['uid'];
                        }
                    }
                }
            }

            $expandedExcludeStringCache[$excludeString] = array_unique($pidList);
        }

        return $expandedExcludeStringCache[$excludeString];
    }
1412
1413
    /**
     * Create the rows for display of the page tree
     * For each page a number of rows are shown displaying GET variable configuration
     *
     * Calling this method also invokes urlListFromUrlArray() for every configuration that
     * does not exclude the page.
     *
     * @param array $pageRow Page record; its "uid" is checked against each configuration's exclude list
     * @param string $pageTitle HTML for the title cell; it is inserted as-is, without further escaping
     * @return string HTML table rows for the page
     */
    public function drawURLs_addRowsForPage(array $pageRow, string $pageTitle): string
    {
        $skipMessage = '';

        // Get list of configurations
        // NOTE(review): $skipMessage is presumably filled by reference by getUrlsForPageRow() - confirm
        $configurations = $this->getUrlsForPageRow($pageRow, $skipMessage);

        if (! empty($this->incomingConfigurationSelection)) {
            // remove configuration that does not match the current selection
            foreach (array_keys($configurations) as $confKey) {
                if (! in_array($confKey, $this->incomingConfigurationSelection, true)) {
                    unset($configurations[$confKey]);
                }
            }
        }

        // Traverse parameter combinations:
        $isFirstRow = true;
        $content = '';
        if (! empty($configurations)) {
            foreach ($configurations as $confKey => $confArray) {
                // Optional settings may be missing, read them defensively
                $subCfg = $confArray['subCfg'] ?? [];

                // Title column: only the first row carries it, spanning all rows of this page
                if ($isFirstRow) {
                    $titleClm = '<td rowspan="' . count($configurations) . '">' . $pageTitle . '</td>';
                } else {
                    $titleClm = '';
                }
                $isFirstRow = false;

                // Normalise both sides to int so the strict comparison does not depend on
                // whether uids arrive as strings or integers
                $excludedPageIds = array_map('intval', $this->expandExcludeString($subCfg['exclude'] ?? ''));

                if (! in_array((int) $pageRow['uid'], $excludedPageIds, true)) {
                    // URL list:
                    $urlList = $this->urlListFromUrlArray(
                        $confArray,
                        $pageRow,
                        $this->scheduledTime,
                        $this->reqMinute,
                        $this->submitCrawlUrls,
                        $this->downloadCrawlUrls,
                        $this->duplicateTrack,
                        $this->downloadUrls,
                        $this->incomingProcInstructions // if empty the urls won't be filtered by processing instructions
                    );

                    // Expanded parameters:
                    $paramExpanded = '';
                    $calcAccu = [];
                    $calcRes = 1;
                    foreach ($confArray['paramExpanded'] ?? [] as $gVar => $gVal) {
                        $valueCount = count($gVal);
                        $paramExpanded .= '
                            <tr>
                                <td>' . htmlspecialchars('&' . $gVar . '=') . '<br/>' .
                            '(' . $valueCount . ')' .
                            '</td>
                                <td nowrap="nowrap">' . nl2br(htmlspecialchars(implode(chr(10), $gVal))) . '</td>
                            </tr>
                        ';
                        $calcRes *= $valueCount;
                        $calcAccu[] = $valueCount;
                    }
                    $paramExpanded = '<table>' . $paramExpanded . '</table>';
                    $paramExpanded .= 'Comb: ' . implode('*', $calcAccu) . '=' . $calcRes;

                    // Options (configuration values are escaped before they are put into the markup)
                    $optionValues = '';
                    if (! empty($subCfg['userGroups'])) {
                        $optionValues .= 'User Groups: ' . htmlspecialchars((string) $subCfg['userGroups']) . '<br/>';
                    }
                    if (! empty($subCfg['procInstrFilter'])) {
                        $optionValues .= 'ProcInstr: ' . htmlspecialchars((string) $subCfg['procInstrFilter']) . '<br/>';
                    }

                    // One parameter per line for readability
                    $parsedParameters = trim(str_replace(
                        '&',
                        chr(10) . '&',
                        GeneralUtility::implodeArrayForUrl('', $confArray['paramParsed'] ?? [])
                    ));

                    // Compile row:
                    // (string) cast: numeric configuration keys are integers, which htmlspecialchars()
                    // rejects under strict_types
                    $content .= '
                        <tr>
                            ' . $titleClm . '
                            <td>' . htmlspecialchars((string) $confKey) . '</td>
                            <td>' . nl2br(htmlspecialchars(rawurldecode($parsedParameters))) . '</td>
                            <td>' . $paramExpanded . '</td>
                            <td nowrap="nowrap">' . $urlList . '</td>
                            <td nowrap="nowrap">' . $optionValues . '</td>
                            <td nowrap="nowrap">' . DebugUtility::viewArray($subCfg['procInstrParams.'] ?? null) . '</td>
                        </tr>';
                } else {
                    $content .= '<tr>
                            ' . $titleClm . '
                            <td>' . htmlspecialchars((string) $confKey) . '</td>
                            <td colspan="5"><em>No entries</em> (Page is excluded in this configuration)</td>
                        </tr>';
                }
            }
        } else {
            $message = ! empty($skipMessage) ? ' (' . $skipMessage . ')' : '';

            // Compile row:
            $content .= '
                <tr>
                    <td>' . $pageTitle . '</td>
                    <td colspan="6"><em>No entries</em>' . $message . '</td>
                </tr>';
        }

        return $content;
    }
1523
1524
    /*****************************
1525
     *
1526
     * CLI functions
1527
     *
1528
     *****************************/
1529
1530
    /**
     * Processes one batch of queue entries from the command line.
     *
     * Runs the CLI hooks, cleans up the queue, fetches up to $countInARun entries, assigns
     * them to the current process id and reads their URLs one after another.
     *
     * @param int $countInARun Number of queue entries requested for this run
     * @param int $sleepTime Pause after each processed entry, handed to usleep()
     * @param int $sleepAfterFinish Pause after the batch, handed to sleep()
     * @return int Bitmask combining the readUrl() results with the self::CLI_STATUS_* flags
     */
    public function CLI_run(int $countInARun, int $sleepTime, int $sleepAfterFinish): int
    {
        $status = 0;
        $processedCount = 0;

        // Hooks first, then queue housekeeping, then the selection of entries
        $this->CLI_runHooks();
        $this->queueRepository->cleanupQueue();
        $queueEntries = $this->queueRepository->fetchRecordsToBeCrawled($countInARun);

        if (empty($queueEntries)) {
            $this->CLI_debug('Nothing within queue which needs to be processed (' . $this->CLI_buildProcessId() . ')');
            return $status;
        }

        $queueIds = array_column($queueEntries, 'qid');
        $processId = $this->CLI_buildProcessId();

        // Remember how many entries were assigned, to determine later how many have been processed
        $assignedCount = $this->queueRepository->updateProcessIdAndSchedulerForQueueIds($queueIds, $processId);
        $this->processRepository->updateProcessAssignItemsCount($assignedCount, $processId);

        // Fewer rows updated than requested: another process grabbed some of the entries
        if ($assignedCount !== count($queueIds)) {
            $this->CLI_debug('Nothing processed due to multi-process collision (' . $this->CLI_buildProcessId() . ')');
            return $status | self::CLI_STATUS_ABORTED;
        }

        foreach ($queueEntries as $queueEntry) {
            $status |= $this->readUrl($queueEntry['qid']);
            $processedCount++;

            // Just to relax the system
            usleep((int) $sleepTime);

            // The CLI has been disabled while running: leave right away,
            // the process is NOT marked as ended
            if ($this->getDisabled()) {
                return $status | self::CLI_STATUS_ABORTED;
            }

            // possible timeout
            if (! $this->processRepository->isProcessActive($this->CLI_buildProcessId())) {
                $this->CLI_debug('conflict / timeout (' . $this->CLI_buildProcessId() . ')');
                $status |= self::CLI_STATUS_ABORTED;
                break;
            }
        }

        sleep((int) $sleepAfterFinish);

        $this->CLI_debug('Rows: ' . $processedCount . ' (' . $this->CLI_buildProcessId() . ')');

        if ($processedCount > 0) {
            $status |= self::CLI_STATUS_PROCESSED;
        }

        return $status;
    }
1598
1599
    /**
     * Instantiates every class registered in
     * $GLOBALS['TYPO3_CONF_VARS']['EXTCONF']['crawler']['cli_hooks'] and calls its
     * crawler_init() method with this controller.
     */
    public function CLI_runHooks(): void
    {
        $hookClassNames = $GLOBALS['TYPO3_CONF_VARS']['EXTCONF']['crawler']['cli_hooks'] ?? [];

        foreach ($hookClassNames as $hookClassName) {
            $hook = GeneralUtility::makeInstance($hookClassName);
            if (! is_object($hook)) {
                continue;
            }

            $hook->crawler_init($this);
        }
    }
1611
1612
    /**
1613
     * Try to acquire a new process with the given id
1614
     * also performs some auto-cleanup for orphan processes
1615
     * @param string $id identification string for the process
1616
     * @return boolean
1617
     * @todo preemption might not be the most elegant way to clean up
1618
     */
1619
    public function CLI_checkAndAcquireNewProcess($id)
1620
    {
1621
        $queryBuilder = GeneralUtility::makeInstance(ConnectionPool::class)->getQueryBuilderForTable($this->tableName);
1622
        $ret = true;
1623
1624
        $systemProcessId = getmypid();
1625
        if ($systemProcessId < 1) {
1626
            return false;
1627
        }
1628
1629
        $processCount = 0;
1630
        $orphanProcesses = [];
1631
1632
        $statement = $queryBuilder
1633
            ->select('process_id', 'ttl')
1634
            ->from('tx_crawler_process')
1635
            ->where(
1636
                'active = 1 AND deleted = 0'
1637
            )
1638
            ->execute();
1639
1640
        $currentTime = $this->getCurrentTime();
1641
1642
        while ($row = $statement->fetch()) {
1643
            if ($row['ttl'] < $currentTime) {
1644
                $orphanProcesses[] = $row['process_id'];
1645
            } else {
1646
                $processCount++;
1647
            }
1648
        }
1649
1650
        // if there are less than allowed active processes then add a new one
1651
        if ($processCount < (int) $this->extensionSettings['processLimit']) {
1652
            $this->CLI_debug('add process ' . $this->CLI_buildProcessId() . ' (' . ($processCount + 1) . '/' . (int) $this->extensionSettings['processLimit'] . ')');
1653
1654
            GeneralUtility::makeInstance(ConnectionPool::class)->getConnectionForTable('tx_crawler_process')->insert(
1655
                'tx_crawler_process',
1656
                [
1657
                    'process_id' => $id,
1658
                    'active' => 1,
1659
                    'ttl' => $currentTime + (int) $this->extensionSettings['processMaxRunTime'],
1660
                    'system_process_id' => $systemProcessId,
1661
                ]
1662
            );
1663
        } else {
1664
            $this->CLI_debug('Processlimit reached (' . ($processCount) . '/' . (int) $this->extensionSettings['processLimit'] . ')');
1665
            $ret = false;
1666
        }
1667
1668
        $this->processRepository->deleteProcessesMarkedAsDeleted();
1669
        $this->CLI_releaseProcesses($orphanProcesses);
1670
1671
        return $ret;
1672
    }
1673
1674
    /**
     * Release a process and the required resources
     *
     * Besides releasing the given ids, every call also cleans up after processes that are
     * already flagged as inactive (see the inline comments).
     *
     * @param mixed $releaseIds string with a single process-id or array with multiple process-ids
     * @return boolean false when there was nothing to release, true otherwise
     */
    public function CLI_releaseProcesses($releaseIds)
    {
        if (! is_array($releaseIds)) {
            $releaseIds = [$releaseIds];
        }

        if (empty($releaseIds)) {
            return false;   //nothing to release
        }

        // some kind of 2nd chance algo - this way you need at least 2 processes to have a real cleanup
        // this ensures that a single process can't mess up the entire process table

        $connectionPool = GeneralUtility::makeInstance(ConnectionPool::class);

        // Each statement gets its own query builder, so no alias, condition or bound parameter
        // of the first UPDATE can leak into the second one.

        // Detach queue entries from processes that are no longer active
        $queueQueryBuilder = $connectionPool->getQueryBuilderForTable($this->tableName);
        $queueQueryBuilder
            ->update($this->tableName, 'q')
            ->where(
                'q.process_id IN(SELECT p.process_id FROM tx_crawler_process as p WHERE p.active = 0)'
            )
            ->set('q.process_scheduled', 0)
            ->set('q.process_id', '')
            ->execute();

        // Reset the system process id of inactive processes that still own unprocessed queue entries
        $processQueryBuilder = $connectionPool->getQueryBuilderForTable('tx_crawler_process');
        $processQueryBuilder
            ->update('tx_crawler_process')
            ->where(
                $processQueryBuilder->expr()->eq('active', 0),
                'process_id IN(SELECT q.process_id FROM tx_crawler_queue as q WHERE q.exec_time = 0)'
            )
            ->set('system_process_id', 0)
            ->execute();

        // Finally flag the requested processes as inactive and free their queue entries
        $this->processRepository->markRequestedProcessesAsNotActive($releaseIds);
        $this->queueRepository->unsetProcessScheduledAndProcessIdForQueueEntries($releaseIds);

        return true;
    }
1723
1724
    /**
     * Create a unique Id for the current process
     *
     * The id is generated on the first call and stored in $this->processID; later calls
     * return the stored value.
     *
     * @return string  the ID
     */
    public function CLI_buildProcessId()
    {
        if (! $this->processID) {
            // The timestamp alone is a weak seed: converted to a string, microtime(true) keeps
            // only about four decimals, so processes started at nearly the same moment could end
            // up with the same id. The system process id and random bytes make the seed distinct.
            $seed = sprintf('%.6F-%d-%s', microtime(true), (int) getmypid(), bin2hex(random_bytes(8)));
            $this->processID = GeneralUtility::shortMD5($seed);
        }

        return $this->processID;
    }
1736
1737
    /**
     * Echoes a message followed by a newline and flushes the output.
     * Does nothing unless the "processDebug" extension setting is a non-zero integer.
     *
     * @param string $msg the message
     */
    public function CLI_debug($msg): void
    {
        $debugEnabled = (int) $this->extensionSettings['processDebug'] !== 0;
        if (! $debugEnabled) {
            return;
        }

        echo $msg . "\n";
        flush();
    }
1749
1750
    /**
     * Cleans up entries that stayed for too long in the queue. These are:
     * - processed entries (exec_time set) older than the "cleanUpProcessedAge" setting
     * - entries scheduled longer ago than the "cleanUpScheduledAge" setting
     * Both settings are multiplied by 86400, i.e. they are given in days.
     *
     * @deprecated
     */
    public function cleanUpOldQueueEntries(): void
    {
        $secondsPerDay = 86400; // 24*60*60 Seconds in 24 hours
        $now = time();

        $processedBefore = $now - $this->extensionSettings['cleanUpProcessedAge'] * $secondsPerDay;
        $scheduledBefore = $now - $this->extensionSettings['cleanUpScheduledAge'] * $secondsPerDay;

        $this->flushQueue(
            '(exec_time<>0 AND exec_time<' . $processedBefore . ') OR scheduled<=' . $scheduledBefore
        );
    }
1766
1767
    /**
     * Removes queue entries
     *
     * When an observer is registered for the "queueEntryFlush" event, the affected entries
     * are posted to the event dispatcher, grouped by set_id, before they are deleted.
     *
     * @param string $where SQL related filter for the entries which should be removed;
     *                      it is used as raw SQL, so it must never contain untrusted input.
     *                      An empty filter removes all entries.
     *
     * @deprecated
     */
    protected function flushQueue($where = ''): void
    {
        $realWhere = (string) $where !== '' ? (string) $where : '1=1';

        $eventDispatcher = EventDispatcher::getInstance();
        if ($eventDispatcher->hasObserver('queueEntryFlush')) {
            // Every statement uses a fresh query builder so that no state is shared between them.
            // GROUP BY replaces "DISTINCT set_id", which is not a plain column name and therefore
            // does not survive the identifier quoting applied by select().
            $groups = $this->getQueryBuilder($this->tableName)
                ->select('set_id')
                ->from($this->tableName)
                ->where($realWhere)
                ->groupBy('set_id')
                ->execute()
                ->fetchAll();

            foreach ($groups as $group) {
                $subSetQueryBuilder = $this->getQueryBuilder($this->tableName);
                // NOTE(review): other queries against the queue table in this class select "qid";
                // confirm that the "uid" column requested here exists.
                $subSet = $subSetQueryBuilder
                    ->select('uid', 'set_id')
                    ->from($this->tableName)
                    ->where(
                        $realWhere,
                        $subSetQueryBuilder->expr()->eq(
                            'set_id',
                            $subSetQueryBuilder->createNamedParameter($group['set_id'], \PDO::PARAM_INT)
                        )
                    )
                    ->execute()
                    ->fetchAll();

                $eventDispatcher->post('queueEntryFlush', $group['set_id'], $subSet);
            }
        }

        $this->getQueryBuilder($this->tableName)
            ->delete($this->tableName)
            ->where($realWhere)
            ->execute();
    }
1808
1809
    /**
     * This method determines duplicates for a queue entry with the same parameters and this timestamp.
     * If the timestamp is not in the future, every unprocessed entry that is already due counts as duplicate
     * (with the "enableTimeslot" setting also entries scheduled up to 100 seconds ahead).
     * If the timestamp is in the future, only entries with exactly the same timestamp count.
     *
     * Only entries that have not been executed and are not assigned to a process are considered.
     *
     * @param int $tstamp
     * @param array $fieldArray Needs the keys "page_id" and "parameters_hash"
     *
     * @return array The qid values of the duplicates
     */
    protected function getDuplicateRowsIfExist($tstamp, $fieldArray)
    {
        $tstamp = (int) $tstamp;
        $currentTime = $this->getCurrentTime();

        $queryBuilder = GeneralUtility::makeInstance(ConnectionPool::class)->getQueryBuilderForTable($this->tableName);
        $queryBuilder
            ->select('qid')
            ->from('tx_crawler_queue');

        if ($tstamp <= $currentTime) {
            // this entry is scheduled with "now"
            if ($this->extensionSettings['enableTimeslot']) {
                $timeBegin = $currentTime - 100;
                $timeEnd = $currentTime + 100;
                $queryBuilder
                    ->where(
                        'scheduled BETWEEN ' . $timeBegin . ' AND ' . $timeEnd . ''
                    )
                    ->orWhere(
                        $queryBuilder->expr()->lte('scheduled', $currentTime)
                    );
            } else {
                $queryBuilder
                    ->where(
                        $queryBuilder->expr()->lte('scheduled', $currentTime)
                    );
            }
        } else {
            // entry with a timestamp in the future need to have the same schedule time
            $queryBuilder
                ->where(
                    $queryBuilder->expr()->eq('scheduled', $tstamp)
                );
        }

        // Explicit comparisons instead of "NOT exec_time" / "NOT process_id": negating the string
        // column process_id depends on how the database converts the id to a number, so assigned
        // entries were not excluded reliably.
        $queryBuilder
            ->andWhere($queryBuilder->expr()->eq('exec_time', 0))
            ->andWhere($queryBuilder->expr()->eq('process_id', $queryBuilder->createNamedParameter('', \PDO::PARAM_STR)))
            ->andWhere($queryBuilder->expr()->eq('page_id', $queryBuilder->createNamedParameter($fieldArray['page_id'], \PDO::PARAM_INT)))
            ->andWhere($queryBuilder->expr()->eq('parameters_hash', $queryBuilder->createNamedParameter($fieldArray['parameters_hash'], \PDO::PARAM_STR)));

        return array_column($queryBuilder->execute()->fetchAll(), 'qid');
    }
1869
1870
    /**
     * Returns the md5 hash of the serialized configuration array.
     * The keys "paramExpanded" and "URLs" are left out of the hash.
     *
     * @return string
     */
    protected function getConfigurationHash(array $configuration)
    {
        $ignoredKeys = ['paramExpanded' => true, 'URLs' => true];
        $hashRelevantConfiguration = array_diff_key($configuration, $ignoredKeys);

        return md5(serialize($hashRelevantConfiguration));
    }
1881
1882
    /**
     * Build a URL from a Page and the Query String. If the page has a Site configuration, it can be built by using
     * the Site instance.
     *
     * @param int $pageId Page the URL is built for
     * @param string $queryString Query string; leading "?" and "&" are stripped when a Site is used
     * @param string|null $alternativeBaseUrl When given, its host, scheme, port and user info replace those of the generated URL
     * @param int $httpsOrHttp see tx_crawler_configuration.force_ssl (-1 forces http, 1 forces https, other values keep the scheme)
     * @throws \TYPO3\CMS\Core\Exception\SiteNotFoundException
     * @throws \TYPO3\CMS\Core\Routing\InvalidRouteArgumentsException
     */
    protected function getUrlFromPageAndQueryParameters(int $pageId, string $queryString, ?string $alternativeBaseUrl, int $httpsOrHttp): UriInterface
    {
        $site = GeneralUtility::makeInstance(SiteMatcher::class)->matchByPageId($pageId);
        if ($site instanceof Site) {
            $queryParts = [];
            parse_str(ltrim($queryString, '?&'), $queryParts);
            unset($queryParts['id']);

            // workaround as long as we don't have native language support in crawler configurations
            // NOTE(review): the SiteLanguage that used to be looked up here was never used and has
            // been dropped; that lookup may have thrown for unknown language ids - confirm that
            // the router still rejects them.
            if (isset($queryParts['L'])) {
                $queryParts['_language'] = $queryParts['L'];
                unset($queryParts['L']);
            }

            $url = $site->getRouter()->generateUri($pageId, $queryParts);

            if (! empty($alternativeBaseUrl)) {
                // Separate variable instead of overwriting the ?string parameter with an object
                $alternativeBaseUri = new Uri($alternativeBaseUrl);
                $url = $url
                    ->withHost($alternativeBaseUri->getHost())
                    ->withScheme($alternativeBaseUri->getScheme())
                    ->withPort($alternativeBaseUri->getPort());

                $userInfo = $alternativeBaseUri->getUserInfo();
                if ($userInfo) {
                    $url = $url->withUserInfo($userInfo);
                }
            }
        } else {
            // Technically this is not possible with site handling, but kept for backwards-compatibility reasons
            // Once EXT:crawler is v10-only compatible, this should be removed completely
            $baseUrl = ($alternativeBaseUrl ?: GeneralUtility::getIndpEnv('TYPO3_SITE_URL'));
            $cacheHashCalculator = GeneralUtility::makeInstance(CacheHashCalculator::class);
            $queryString .= '&cHash=' . $cacheHashCalculator->generateForParameters($queryString);
            $url = new Uri(rtrim($baseUrl, '/') . '/index.php' . $queryString);
        }

        if ($httpsOrHttp === -1) {
            $url = $url->withScheme('http');
        } elseif ($httpsOrHttp === 1) {
            $url = $url->withScheme('https');
        }

        return $url;
    }
1934
1935 1
    /**
     * Makes sure the value at index 1 is not larger than the value at index 2
     * by exchanging the two when necessary. Other entries are left untouched.
     */
    protected function swapIfFirstIsLargerThanSecond(array $reg): array
    {
        if ($reg[1] > $reg[2]) {
            [$reg[1], $reg[2]] = [$reg[2], $reg[1]];
        }

        return $reg;
    }
1946
1947
    /**
     * Returns the backend user, taken from $GLOBALS['BE_USER'] on first use and
     * kept in $this->backendUser afterwards.
     *
     * @return BackendUserAuthentication
     */
    private function getBackendUser()
    {
        // Make sure the _cli_ user is loaded (done on every call, also when the user is already stored)
        Bootstrap::initializeBackendAuthentication();

        if ($this->backendUser !== null) {
            return $this->backendUser;
        }

        $this->backendUser = $GLOBALS['BE_USER'];

        return $this->backendUser;
    }
1959
1960
    /**
     * Fetches a query builder for the given table from the connection pool.
     *
     * @return \TYPO3\CMS\Core\Database\Query\QueryBuilder
     */
    private function getQueryBuilder(string $table)
    {
        $connectionPool = GeneralUtility::makeInstance(ConnectionPool::class);

        return $connectionPool->getQueryBuilderForTable($table);
    }
1969
}
1970