Completed
Push — typo3v9 ( 73c725...c8617d )
by Tomas Norre
05:22
created

CrawlerController::setAccessMode()   A

Complexity

Conditions 1
Paths 1

Size

Total Lines 4

Duplication

Lines 0
Ratio 0 %

Code Coverage

Tests 3
CRAP Score 1

Importance

Changes 0
Metric Value
cc 1
nc 1
nop 1
dl 0
loc 4
rs 10
c 0
b 0
f 0
ccs 3
cts 3
cp 1
crap 1
1
<?php
2
namespace AOE\Crawler\Controller;
3
4
/***************************************************************
5
 *  Copyright notice
6
 *
7
 *  (c) 2019 AOE GmbH <[email protected]>
8
 *
9
 *  All rights reserved
10
 *
11
 *  This script is part of the TYPO3 project. The TYPO3 project is
12
 *  free software; you can redistribute it and/or modify
13
 *  it under the terms of the GNU General Public License as published by
14
 *  the Free Software Foundation; either version 3 of the License, or
15
 *  (at your option) any later version.
16
 *
17
 *  The GNU General Public License can be found at
18
 *  http://www.gnu.org/copyleft/gpl.html.
19
 *
20
 *  This script is distributed in the hope that it will be useful,
21
 *  but WITHOUT ANY WARRANTY; without even the implied warranty of
22
 *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
23
 *  GNU General Public License for more details.
24
 *
25
 *  This copyright notice MUST APPEAR in all copies of the script!
26
 ***************************************************************/
27
28
use AOE\Crawler\QueueExecutor;
29
use AOE\Crawler\Configuration\ExtensionConfigurationProvider;
30
use AOE\Crawler\Domain\Repository\ConfigurationRepository;
31
use AOE\Crawler\Domain\Repository\ProcessRepository;
32
use AOE\Crawler\Domain\Repository\QueueRepository;
33
use AOE\Crawler\Event\EventDispatcher;
34
use AOE\Crawler\Utility\SignalSlotUtility;
35
use Psr\Http\Message\UriInterface;
36
use Psr\Log\LoggerAwareInterface;
37
use Psr\Log\LoggerAwareTrait;
38
use TYPO3\CMS\Backend\Tree\View\PageTreeView;
39
use TYPO3\CMS\Backend\Utility\BackendUtility;
40
use TYPO3\CMS\Core\Authentication\BackendUserAuthentication;
41
use TYPO3\CMS\Core\Core\Environment;
42
use TYPO3\CMS\Core\Database\Connection;
43
use TYPO3\CMS\Core\Database\ConnectionPool;
44
use TYPO3\CMS\Core\Database\Query\Restriction\DeletedRestriction;
45
use TYPO3\CMS\Core\Http\Uri;
46
use TYPO3\CMS\Core\Imaging\Icon;
47
use TYPO3\CMS\Core\Imaging\IconFactory;
48
use TYPO3\CMS\Core\Routing\SiteMatcher;
49
use TYPO3\CMS\Core\Site\Entity\Site;
50
use TYPO3\CMS\Core\TypoScript\Parser\TypoScriptParser;
51
use TYPO3\CMS\Core\Utility\DebugUtility;
52
use TYPO3\CMS\Core\Utility\GeneralUtility;
53
use TYPO3\CMS\Core\Utility\MathUtility;
54
use TYPO3\CMS\Extbase\Object\ObjectManager;
55
use TYPO3\CMS\Frontend\Page\CacheHashCalculator;
56
use TYPO3\CMS\Frontend\Page\PageRepository;
57
58
/**
59
 * Class CrawlerController
60
 *
61
 * @package AOE\Crawler\Controller
62
 */
63
class CrawlerController implements LoggerAwareInterface
64
{
65
    use LoggerAwareTrait;
66
67
    const CLI_STATUS_NOTHING_PROCCESSED = 0;
68
    const CLI_STATUS_REMAIN = 1; //queue not empty
69
    const CLI_STATUS_PROCESSED = 2; //(some) queue items where processed
70
    const CLI_STATUS_ABORTED = 4; //instance didn't finish
71
    const CLI_STATUS_POLLABLE_PROCESSED = 8;
72
73
    /**
74
     * @var integer
75
     */
76
    public $setID = 0;
77
78
    /**
79
     * @var string
80
     */
81
    public $processID = '';
82
83
    /**
84
     * @var array
85
     */
86
    public $duplicateTrack = [];
87
88
    /**
89
     * @var array
90
     */
91
    public $downloadUrls = [];
92
93
    /**
94
     * @var array
95
     */
96
    public $incomingProcInstructions = [];
97
98
    /**
99
     * @var array
100
     */
101
    public $incomingConfigurationSelection = [];
102
103
    /**
104
     * @var bool
105
     */
106
    public $registerQueueEntriesInternallyOnly = false;
107
108
    /**
109
     * @var array
110
     */
111
    public $queueEntries = [];
112
113
    /**
114
     * @var array
115
     */
116
    public $urlList = [];
117
118
    /**
119
     * @var array
120
     */
121
    public $extensionSettings = [];
122
123
    /**
124
     * Mount Point
125
     *
126
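     * @var string|bool False when no mount point is set; otherwise the MP value, which getPageTSconfigForId() splits on "-" and getUrlsForPageId() appends as "&MP=" to the base URL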
127
     */
128
    public $MP = false;
129
130
    /**
131
     * @var string
132
     */
133
    protected $processFilename;
134
135
    /**
136
     * Holds the internal access mode can be 'gui','cli' or 'cli_im'
137
     *
138
     * @var string
139
     */
140
    protected $accessMode;
141
142
    /**
143
     * @var BackendUserAuthentication
144
     */
145
    private $backendUser;
146
147
    /**
148
     * @var integer
149
     */
150
    private $scheduledTime = 0;
151
152
    /**
153
     * @var integer
154
     */
155
    private $reqMinute = 0;
156
157
    /**
158
     * @var bool
159
     */
160
    private $submitCrawlUrls = false;
161
162
    /**
163
     * @var bool
164
     */
165
    private $downloadCrawlUrls = false;
166
167
    /**
168
     * @var QueueRepository
169
     */
170
    protected $queueRepository;
171
172
    /**
173
     * @var ProcessRepository
174
     */
175
    protected $processRepository;
176
177
    /**
178
     * @var ConfigurationRepository
179
     */
180
    protected $configurationRepository;
181
182
    /**
183
     * @var string
184
     */
185
    protected $tableName = 'tx_crawler_queue';
186
187
    /**
188
     * @var QueueExecutor
189
     */
190
    protected $queueExecutor;
191
192
    /**
193
     * @var int
194
     */
195
    protected $maximumUrlsToCompile = 10000;
196
197
    /**
198
     * @var IconFactory
199
     */
200
    protected $iconFactory;
201
202
    /**
     * Returns the access mode this controller is currently running in.
     *
     * @return string|null One of 'gui', 'cli' or 'cli_im'; null as long as no mode has been set
     */
    public function getAccessMode()
    {
        return $this->accessMode;
    }
211
212
    /**
     * Stores the access mode of this controller.
     *
     * @param string $accessMode Expected to be 'gui', 'cli' or 'cli_im'; the value is not validated here
     * @return void
     */
    public function setAccessMode($accessMode)
    {
        $this->accessMode = $accessMode;
    }
219
220
    /**
     * Switches the "disabled" marker on or off.
     *
     * The marker is the (empty) file at $this->processFilename: it is written
     * when disabling and removed again when enabling.
     *
     * @param bool $disabled true (default) creates the marker file, false deletes it if present
     * @return void
     */
    public function setDisabled($disabled = true)
    {
        if ($disabled) {
            GeneralUtility::writeFile($this->processFilename, '');
            return;
        }

        // Enabling: only remove the marker when it actually exists.
        if (is_file($this->processFilename)) {
            unlink($this->processFilename);
        }
    }
236
237
    /**
     * Tells whether the "disabled" marker file exists.
     *
     * @return bool true if the file at $this->processFilename exists, i.e. processing is disabled
     */
    public function getDisabled()
    {
        $markerFile = $this->processFilename;

        return is_file($markerFile);
    }
246
247
    /**
     * Overrides the location of the marker file used by setDisabled() / getDisabled().
     *
     * @param string $filenameWithPath Path including the file name
     * @return void
     */
    public function setProcessFilename($filenameWithPath)
    {
        $this->processFilename = $filenameWithPath;
    }
256
257
    /**
     * Returns the path of the marker file used by setDisabled() / getDisabled().
     *
     * @return string
     */
    public function getProcessFilename()
    {
        return $this->processFilename;
    }
264
265
    /************************************
266
     *
267
     * Getting URLs based on Page TSconfig
268
     *
269
     ************************************/
270
271 26
    /**
     * Wires up repositories and helpers, and loads the extension settings
     * with sane defaults for the numeric limits.
     */
    public function __construct()
    {
        $objectManager = GeneralUtility::makeInstance(ObjectManager::class);
        $this->queueRepository = $objectManager->get(QueueRepository::class);
        $this->processRepository = $objectManager->get(ProcessRepository::class);
        $this->configurationRepository = $objectManager->get(ConfigurationRepository::class);
        $this->queueExecutor = $objectManager->get(QueueExecutor::class);
        $this->iconFactory = GeneralUtility::makeInstance(IconFactory::class);

        // BE_USER may not be initialised in every context; avoid an undefined-index notice.
        $this->backendUser = $GLOBALS['BE_USER'] ?? null;
        $this->processFilename = Environment::getVarPath() . '/locks/tx_crawler.proc';

        /** @var ExtensionConfigurationProvider $configurationProvider */
        $configurationProvider = GeneralUtility::makeInstance(ExtensionConfigurationProvider::class);
        $settings = $configurationProvider->getExtensionConfiguration();
        $this->extensionSettings = is_array($settings) ? $settings : [];

        // Set defaults. The settings array may be empty (see fallback above), so every
        // key is read through "??"; a missing key is passed on as 0 so that the
        // default handling of MathUtility still applies exactly as before.
        if (MathUtility::convertToPositiveInteger($this->extensionSettings['countInARun'] ?? 0) === 0) {
            $this->extensionSettings['countInARun'] = 100;
        }

        $this->extensionSettings['processLimit'] = MathUtility::forceIntegerInRange(
            $this->extensionSettings['processLimit'] ?? 0,
            1,
            99,
            1
        );
        $this->maximumUrlsToCompile = MathUtility::forceIntegerInRange(
            $this->extensionSettings['maxCompileUrls'] ?? 0,
            1,
            1000000000,
            10000
        );
    }
296
297
    /**
     * Replaces the extension settings held by this controller
     * (unserialized pendant of $TYPO3_CONF_VARS['EXT']['extConf']['crawler']).
     *
     * The array is stored as given; the defaults applied in the constructor are not re-applied.
     *
     * @param array $extensionSettings
     * @return void
     */
    public function setExtensionSettings(array $extensionSettings)
    {
        $this->extensionSettings = $extensionSettings;
    }
307
308
    /**
     * Check if the given page should be crawled.
     *
     * The checks run in this order and the first one that matches wins:
     * hidden page, built-in doktype restriction, "excludeDoktype" configuration,
     * "pageVeto" hooks.
     *
     * @param array $pageRow Page record; 'hidden' and 'doktype' are evaluated
     * @return false|string false if the page should be crawled (not excluded), otherwise the skip message
     */
    public function checkIfPageShouldBeSkipped(array $pageRow)
    {
        // Hidden pages are only crawled when "crawlHiddenPages" is enabled.
        // empty() keeps the original truthiness semantics but tolerates missing keys.
        if (empty($this->extensionSettings['crawlHiddenPages']) && !empty($pageRow['hidden'])) {
            return 'Because page is hidden';
        }

        $doktype = $pageRow['doktype'] ?? 0;

        // Doktypes 3, 4 and everything from 199 upwards are never crawled.
        if (GeneralUtility::inList('3,4', $doktype) || $doktype >= 199) {
            return 'Because doktype is not allowed';
        }

        // Doktype lists registered by other extensions.
        foreach ($GLOBALS['TYPO3_CONF_VARS']['EXTCONF']['crawler']['excludeDoktype'] ?? [] as $key => $doktypeList) {
            if (GeneralUtility::inList($doktypeList, $doktype)) {
                return 'Doktype was excluded by "' . $key . '"';
            }
        }

        // Veto hooks: each one returns false if the page is ok, or true / a skip message
        // if the page should _not_ be crawled. The first veto ends the evaluation.
        foreach ($GLOBALS['TYPO3_CONF_VARS']['EXTCONF']['crawler']['pageVeto'] ?? [] as $key => $func) {
            $params = [
                'pageRow' => $pageRow
            ];
            $veto = GeneralUtility::callUserFunction($func, $params, $this);
            if ($veto === false) {
                continue;
            }

            return is_string($veto) ? $veto : 'Veto from hook "' . htmlspecialchars($key) . '"';
        }

        return false;
    }
367
368
    /**
     * Wrapper around getUrlsForPageId() that first asks checkIfPageShouldBeSkipped().
     * It returns an array of configurations and no urls!
     *
     * @param array $pageRow Page record with at least dok-type and uid columns.
     * @param string $skipMessage Filled by reference: the skip reason, or '' when the page is not skipped
     * @return array Configurations for the page, or an empty array when the page is skipped
     * @see getUrlsForPageId()
     */
    public function getUrlsForPageRow(array $pageRow, &$skipMessage = '')
    {
        $skipReason = $this->checkIfPageShouldBeSkipped($pageRow);

        if ($skipReason !== false) {
            $skipMessage = $skipReason;

            return [];
        }

        $configurations = $this->getUrlsForPageId($pageRow['uid']);
        $skipMessage = '';

        return $configurations;
    }
391
392
    /**
     * Counts the queue entries of a page that match the given configuration hash
     * and have exec_time = 0, and reports whether there are none.
     * Callers can use a "true" result to skip a more detailed per-entry check.
     *
     * @param  int $uid Page id
     * @param  string $configurationHash
     * @return boolean true if no such entry exists, false otherwise
     */
    protected function noUnprocessedQueueEntriesForPageWithConfigurationHashExist($uid, $configurationHash)
    {
        $queryBuilder = GeneralUtility::makeInstance(ConnectionPool::class)->getQueryBuilderForTable($this->tableName);

        $pendingEntries = $queryBuilder
            ->count('*')
            ->from($this->tableName)
            ->where(
                $queryBuilder->expr()->eq('page_id', intval($uid)),
                $queryBuilder->expr()->eq('configuration_hash', $queryBuilder->createNamedParameter($configurationHash)),
                $queryBuilder->expr()->eq('exec_time', 0)
            )
            ->execute()
            ->fetchColumn();

        // Any truthy count means at least one matching entry is still waiting.
        return !$pendingEntries;
    }
423
424
    /**
     * Creates a list of URLs from input array (and submits them to queue if asked for)
     * See Web > Info module script + "indexed_search"'s crawler hook-client using this!
     *
     * @param array $vv Information about URLs from pageRow to crawl ('URLs' and 'subCfg' are read)
     * @param array $pageRow Page row
     * @param integer $scheduledTime Unix time to schedule indexing to, typically time()
     * @param integer $reqMinute Number of requests per minute (creates the interleave between requests);
     *                           values <= 0 result in no interleave
     * @param boolean $submitCrawlUrls If set, submits the URLs to queue
     * @param boolean $downloadCrawlUrls If set (and $submitCrawlUrls is false) will fill $downloadUrls with entries
     * @param array $duplicateTrack Passed by reference; contains one key per url so duplicates are not crawled twice
     * @param array $downloadUrls Passed by reference; filled with URLs for download if the flag is set
     * @param array $incomingProcInstructions Array of processing instructions
     * @return string List of URLs (meant for display in backend module)
     */
    public function urlListFromUrlArray(
        array $vv,
        array $pageRow,
        $scheduledTime,
        $reqMinute,
        $submitCrawlUrls,
        $downloadCrawlUrls,
        array &$duplicateTrack,
        array &$downloadUrls,
        array $incomingProcInstructions
    ) {
        if (!is_array($vv['URLs'] ?? null)) {
            return 'ERROR - no URL generated';
        }

        $subCfg = $vv['subCfg'] ?? [];
        $urlLog = [];
        $pageId = (int)$pageRow['uid'];
        $configurationHash = $this->getConfigurationHash($vv);
        $skipInnerCheck = $this->noUnprocessedQueueEntriesForPageWithConfigurationHashExist($pageId, $configurationHash);

        // The processing-instruction filter does not depend on the individual URL,
        // so it is evaluated once: if it rejects the configuration, no URL is listed.
        $procInstrFilter = $subCfg['procInstrFilter'] ?? '';
        if (!$this->drawURLs_PIfilter($procInstrFilter, $incomingProcInstructions)) {
            return '';
        }

        // Seconds between two requests; guards against a division by zero.
        $secondsPerRequest = $reqMinute > 0 ? 60 / $reqMinute : 0;

        foreach ($vv['URLs'] as $urlQuery) {
            $url = (string)$this->getUrlFromPageAndQueryParameters(
                $pageId,
                $urlQuery,
                $subCfg['baseUrl'] ?? null,
                $subCfg['force_ssl'] ?? 0
            );

            // Create key by which to determine unique-ness:
            $uKey = $url . '|' . ($subCfg['userGroups'] ?? '') . '|' . $procInstrFilter;

            if (isset($duplicateTrack[$uKey])) {
                // The url key is already registered: just display it and do not resubmit it.
                $urlLog[] = '<em><span class="text-muted">' . htmlspecialchars($url) . '</span></em>';
            } else {
                // Scheduled time, rounded down to the full minute:
                $schTime = $scheduledTime + round(count($duplicateTrack) * $secondsPerRequest);
                $schTime = (int)(floor($schTime / 60) * 60);
                $formattedDate = BackendUtility::datetime($schTime);
                $this->urlList[] = '[' . $formattedDate . '] ' . $url;
                $logLine = '[' . $formattedDate . '] ' . htmlspecialchars($url);

                // Submit for crawling!
                if ($submitCrawlUrls) {
                    // NOTE(review): the queue entry is added with $scheduledTime, not the
                    // interleaved $schTime shown in the log line - confirm this is intended.
                    $added = $this->addUrl(
                        $pageId,
                        $url,
                        $subCfg,
                        $scheduledTime,
                        $configurationHash,
                        $skipInnerCheck
                    );
                    if ($added === false) {
                        $logLine .= ' (URL already existed)';
                    }
                } elseif ($downloadCrawlUrls) {
                    $downloadUrls[$url] = $url;
                }
                $urlLog[] = $logLine;
            }
            $duplicateTrack[$uKey] = true;
        }

        return implode('<br>', $urlLog);
    }
508
509
    /**
     * Tests a comma-separated list of processing instructions against the incoming ones.
     *
     * @param string $piString Comma-separated list of processing instructions to test
     * @param array $incomingProcInstructions Processing instructions
     * @return boolean true if no incoming instructions are given, or if at least one of them is contained in $piString
     */
    public function drawURLs_PIfilter($piString, array $incomingProcInstructions)
    {
        // Without incoming instructions there is nothing to filter by.
        $accepted = count($incomingProcInstructions) === 0;

        foreach ($incomingProcInstructions as $instruction) {
            if (GeneralUtility::inList($piString, $instruction)) {
                $accepted = true;
                break;
            }
        }

        return $accepted;
    }
529
530 2
    /**
     * Loads the Page TSconfig for a page and lets registered hooks alter it.
     *
     * If $this->MP is set, the TSconfig is read for the id found after the "-"
     * in the MP value instead of for $id.
     *
     * @param int $id Page id
     * @return array Page TSconfig, possibly modified by the "getPageTSconfigForId" hooks
     */
    public function getPageTSconfigForId($id)
    {
        if (!$this->MP) {
            $pageTSconfig = BackendUtility::getPagesTSconfig($id);
        } else {
            // Second part of the MP value; stays null (as before) if MP contains no "-",
            // but without raising an undefined-offset notice.
            $mountPointId = explode('-', (string)$this->MP)[1] ?? null;
            $pageTSconfig = BackendUtility::getPagesTSconfig($mountPointId);
        }

        // Call a hook to alter configuration. The key may be absent when no hook is registered.
        $hooks = $GLOBALS['TYPO3_CONF_VARS']['EXTCONF']['crawler']['getPageTSconfigForId'] ?? [];
        if (is_array($hooks)) {
            $params = [
                'pageId' => $id,
                'pageTSConfig' => &$pageTSconfig
            ];
            foreach ($hooks as $userFunc) {
                GeneralUtility::callUserFunction($userFunc, $params, $this);
            }
        }

        return $pageTSconfig;
    }
551
552
    /**
     * This methods returns an array of configurations.
     * And no urls!
     *
     * Configurations are collected from two sources, in this order:
     *  1. Page TSconfig (tx_crawler.crawlerCfg.paramSets.*), origin "pagets"
     *  2. tx_crawler_configuration records up the rootline; a record never
     *     overwrites a configuration of the same name found earlier.
     * Finally the "processUrls" hooks may modify the result by reference.
     *
     * @param integer $pageId Page ID
     * @return array Configurations keyed by configuration name
     */
    public function getUrlsForPageId($pageId)
    {
        // Get page TSconfig for page ID
        $pageTSconfig = $this->getPageTSconfigForId($pageId);

        $res = [];

        // Fetch Crawler Configuration from pageTSconfig
        $crawlerCfg = $pageTSconfig['tx_crawler.']['crawlerCfg.']['paramSets.'] ?? [];
        foreach ($crawlerCfg as $key => $values) {
            if (!is_array($values)) {
                continue;
            }
            $key = str_replace('.', '', $key);
            // Sub configuration for a single configuration string:
            $subCfg = (array)($crawlerCfg[$key . '.'] ?? []);
            $subCfg['key'] = $key;

            // Normalise the filter list (trim the items); only touched when non-empty.
            $procInstrFilter = (string)($subCfg['procInstrFilter'] ?? '');
            if ($procInstrFilter !== '') {
                $subCfg['procInstrFilter'] = implode(',', GeneralUtility::trimExplode(',', $procInstrFilter));
            }

            // Process configuration only if it is not page-specific or if the specific page is the current page:
            $pidsOnly = (string)($subCfg['pidsOnly'] ?? '');
            $pidOnlyList = implode(',', GeneralUtility::trimExplode(',', $pidsOnly, true));
            if ($pidsOnly !== '' && !GeneralUtility::inList($pidOnlyList, $pageId)) {
                continue;
            }

            // Explode, process etc.:
            $res[$key] = [];
            $res[$key]['subCfg'] = $subCfg;
            $res[$key]['paramParsed'] = GeneralUtility::explodeUrl2Array($crawlerCfg[$key] ?? '');
            $res[$key]['paramExpanded'] = $this->expandParameters($res[$key]['paramParsed'], $pageId);
            $res[$key]['origin'] = 'pagets';

            // recognize MP value
            $baseQuery = '?id=' . $pageId;
            if ($this->MP) {
                $baseQuery .= '&MP=' . $this->MP;
            }
            $res[$key]['URLs'] = $this->compileUrls($res[$key]['paramExpanded'], [$baseQuery]);
        }

        // Get configuration from tx_crawler_configuration records up the rootline
        $crawlerConfigurations = $this->configurationRepository->getCrawlerConfigurationRecordsFromRootLine($pageId);
        foreach ($crawlerConfigurations as $configurationRecord) {
            // check access to the configuration record
            $hasAccess = empty($configurationRecord['begroups'])
                || $GLOBALS['BE_USER']->isAdmin()
                || $this->hasGroupAccess($GLOBALS['BE_USER']->user['usergroup_cached_list'], $configurationRecord['begroups']);
            if (!$hasAccess) {
                continue;
            }

            // Process configuration only if it is not page-specific or if the specific page is the current page:
            $pidsOnly = (string)($configurationRecord['pidsonly'] ?? '');
            $pidOnlyList = implode(',', GeneralUtility::trimExplode(',', $pidsOnly, true));
            if ($pidsOnly !== '' && !GeneralUtility::inList($pidOnlyList, $pageId)) {
                continue;
            }

            $key = $configurationRecord['name'];

            // don't overwrite previously defined paramSets
            if (isset($res[$key])) {
                continue;
            }

            /* @var $TSparserObject \TYPO3\CMS\Core\TypoScript\Parser\TypoScriptParser */
            $TSparserObject = GeneralUtility::makeInstance(TypoScriptParser::class);
            $TSparserObject->parse($configurationRecord['processing_instruction_parameters_ts']);

            $subCfg = [
                'procInstrFilter' => $configurationRecord['processing_instruction_filter'],
                'procInstrParams.' => $TSparserObject->setup,
                'baseUrl' => $configurationRecord['base_url'],
                'force_ssl' => (int)$configurationRecord['force_ssl'],
                'userGroups' => $configurationRecord['fegroups'],
                'exclude' => $configurationRecord['exclude'],
                'key' => $key
            ];

            // Pages listed in the record's exclude string get no configuration from this record.
            if (in_array($pageId, $this->expandExcludeString($subCfg['exclude']))) {
                continue;
            }

            $res[$key] = [];
            $res[$key]['subCfg'] = $subCfg;
            $res[$key]['paramParsed'] = GeneralUtility::explodeUrl2Array($configurationRecord['configuration']);
            $res[$key]['paramExpanded'] = $this->expandParameters($res[$key]['paramParsed'], $pageId);
            $res[$key]['URLs'] = $this->compileUrls($res[$key]['paramExpanded'], ['?id=' . $pageId]);
            $res[$key]['origin'] = 'tx_crawler_configuration_' . $configurationRecord['uid'];
        }

        // Hooks receive the result by reference and may alter it.
        foreach ($GLOBALS['TYPO3_CONF_VARS']['EXTCONF']['crawler']['processUrls'] ?? [] as $func) {
            $params = [
                'res' => &$res,
            ];
            GeneralUtility::callUserFunction($func, $params, $this);
        }

        return $res;
    }
651
652
    /**
     * Collects the names of all crawler configurations that apply to a page branch.
     *
     * Names come from the Page TSconfig paramSets of the root page and from the
     * tx_crawler_configuration records stored on the pages of the rootline and
     * of the page tree below the root page (down to $depth levels).
     *
     * @param int $rootid Uid of the branch root page
     * @param int $depth Depth handed to PageTreeView::getTree()
     * @return array List of configuration names (may contain duplicates)
     *
     * TODO: Write Functional Tests
     */
    public function getConfigurationsForBranch(int $rootid, $depth)
    {
        $configurationNames = [];

        // 1) Names of the paramSets defined in Page TSconfig; a single trailing dot is stripped.
        $pageTSconfig = $this->getPageTSconfigForId($rootid);
        $paramSets = $pageTSconfig['tx_crawler.']['crawlerCfg.']['paramSets.'] ?? [];
        foreach ($paramSets as $setName => $setConfiguration) {
            if (is_array($setConfiguration)) {
                $configurationNames[] = substr($setName, -1) == '.' ? substr($setName, 0, -1) : $setName;
            }
        }

        // 2) Page ids of the rootline ...
        $pageIds = [];
        foreach (BackendUtility::BEgetRootLine($rootid) as $rootLinePage) {
            $pageIds[] = $rootLinePage['uid'];
        }

        // ... and of the pages below the root the backend user has permission (bit 1) for.
        /* @var PageTreeView $pageTree */
        $pageTree = GeneralUtility::makeInstance(PageTreeView::class);
        $pageTree->init('AND ' . $GLOBALS['BE_USER']->getPagePermsClause(1));
        $pageTree->getTree($rootid, $depth, '');
        foreach ($pageTree->tree as $treeNode) {
            $pageIds[] = $treeNode['row']['uid'];
        }

        // 3) Names of the non-deleted configuration records stored on those pages.
        $queryBuilder = $this->getQueryBuilder('tx_crawler_configuration');
        $queryBuilder->getRestrictions()
            ->removeAll()
            ->add(GeneralUtility::makeInstance(DeletedRestriction::class));

        $pidConstraint = $queryBuilder->expr()->in(
            'pid',
            $queryBuilder->createNamedParameter($pageIds, Connection::PARAM_INT_ARRAY)
        );
        $statement = $queryBuilder
            ->select('name')
            ->from('tx_crawler_configuration')
            ->where($pidConstraint)
            ->execute();

        while ($record = $statement->fetch()) {
            $configurationNames[] = $record['name'];
        }

        return $configurationNames;
    }
705
706
    /**
     * Fetches a query builder for the given table from the connection pool.
     *
     * @param string $table Table name
     * @return \TYPO3\CMS\Core\Database\Query\QueryBuilder
     */
    private function getQueryBuilder(string $table)
    {
        $connectionPool = GeneralUtility::makeInstance(ConnectionPool::class);

        return $connectionPool->getQueryBuilderForTable($table);
    }
716
717
    /**
     * Check if a user has access to an item by comparing two group lists
     * (e.g. get the group list of the current logged in user from $GLOBALS['TSFE']->gr_list)
     *
     * @see \TYPO3\CMS\Frontend\Page\PageRepository::getMultipleGroupsWhereClause()
     * @param  string $groupList    Comma-separated list of group UIDs from a user
     * @param  string $accessList   Comma-separated list of group UIDs of the item to access
     * @return bool                 TRUE if the access list is empty or at least one of the user's group UIDs is in it
     */
    public function hasGroupAccess($groupList, $accessList)
    {
        // An empty access list means the item is not restricted.
        $granted = empty($accessList);

        if (!$granted) {
            foreach (GeneralUtility::intExplode(',', $groupList) as $userGroupUid) {
                if (GeneralUtility::inList($accessList, $userGroupUid)) {
                    $granted = true;
                    break;
                }
            }
        }

        return $granted;
    }
738
739
    /**
740
     * Will expand the parameters configuration to individual values. This follows a certain syntax of the value of each parameter.
741
     * Syntax of values:
742
     * - Basically: If the value is wrapped in [...] it will be expanded according to the following syntax, otherwise the value is taken literally
743
     * - Configuration is splitted by "|" and the parts are processed individually and finally added together
744
     * - For each configuration part:
745
     *         - "[int]-[int]" = Integer range, will be expanded to all values in between, values included, starting from low to high (max. 1000). Example "1-34" or "-40--30"
746
     *         - "_TABLE:[TCA table name];[_PID:[optional page id, default is current page]];[_ENABLELANG:1]" = Look up of table records from PID, filtering out deleted records. Example "_TABLE:tt_content; _PID:123"
747
     *        _ENABLELANG:1 picks only original records without their language overlays
748
     *         - Default: Literal value
749
     *
750
     * @param array $paramArray Array with key (GET var name) and values (value of GET var which is configuration for expansion)
751
     * @param integer $pid Current page ID
752
     * @return array
753
     *
754
     * TODO: Write Functional Tests
755
     */
756 2
    public function expandParameters($paramArray, $pid)
    {
        // Every key of $paramArray ends up holding a list of values: either the single literal
        // value, or the expansion of a "[...]" expression (ranges, table look-ups, literals).
        foreach ($paramArray as $p => $v) {
            $v = trim($v);

            // Values that are not wrapped in square brackets are literal and become the only value.
            if (substr($v, 0, 1) !== '[' || substr($v, -1) !== ']') {
                $paramArray[$p] = [$v];
                continue;
            }

            // Strip the brackets and start with an empty value list for this key.
            $v = substr($v, 1, -1);
            $paramArray[$p] = [];

            // The configuration parts are separated by "|" and processed one by one.
            foreach (explode('|', $v) as $pV) {
                if (preg_match('/^(-?[0-9]+)\s*-\s*(-?[0-9]+)$/', trim($pV), $reg)) {
                    // Integer range (e.g. "1-34" or "-40--30"): swap the bounds if the first is larger.
                    if ($reg[1] > $reg[2]) {
                        $temp = $reg[2];
                        $reg[2] = $reg[1];
                        $reg[1] = $temp;
                    }

                    // Add every value of the range, but never more than 1000 values.
                    $runAwayBrake = 1000;
                    for ($a = $reg[1]; $a <= $reg[2]; $a++) {
                        $paramArray[$p][] = $a;
                        $runAwayBrake--;
                        if ($runAwayBrake <= 0) {
                            break;
                        }
                    }
                } elseif (substr(trim($pV), 0, 7) == '_TABLE:') {
                    // Table look-up: parse the ";"-separated "KEY:value" sub parameters.
                    $subpartParams = [];
                    foreach (GeneralUtility::trimExplode(';', $pV) as $spV) {
                        // Pad so that a sub part without ":" yields a NULL value instead of an undefined offset.
                        list($pKey, $pVal) = array_pad(GeneralUtility::trimExplode(':', $spV), 2, null);
                        $subpartParams[$pKey] = $pVal;
                    }
                    $tableName = $subpartParams['_TABLE'];

                    // Only tables registered in TCA are looked up.
                    if (isset($GLOBALS['TCA'][$tableName])) {
                        $lookUpPid = isset($subpartParams['_PID']) ? intval($subpartParams['_PID']) : $pid;
                        $pidField = isset($subpartParams['_PIDFIELD']) ? trim($subpartParams['_PIDFIELD']) : 'pid';
                        $where = isset($subpartParams['_WHERE']) ? $subpartParams['_WHERE'] : '';
                        $addTable = isset($subpartParams['_ADDTABLE']) ? $subpartParams['_ADDTABLE'] : '';
                        $fieldName = !empty($subpartParams['_FIELD']) ? $subpartParams['_FIELD'] : 'uid';

                        // The selected field must be "uid" or a column configured in TCA.
                        if ($fieldName === 'uid' || !empty($GLOBALS['TCA'][$tableName]['columns'][$fieldName])) {
                            $queryBuilder = $this->getQueryBuilder($tableName);

                            // Only deleted records are filtered out; all other default restrictions are dropped.
                            $queryBuilder->getRestrictions()
                                ->removeAll()
                                ->add(GeneralUtility::makeInstance(DeletedRestriction::class));

                            $queryBuilder
                                ->select($fieldName)
                                ->from($tableName);

                            // Only touch the FROM part when an additional table was actually configured;
                            // add() without the append flag replaces the part set by from() above.
                            // TODO: Check if this works as intended!
                            if (!empty($addTable)) {
                                $queryBuilder->add('from', $addTable);
                            }

                            $queryBuilder->where(
                                $queryBuilder->expr()->eq($queryBuilder->quoteIdentifier($pidField), $queryBuilder->createNamedParameter($lookUpPid, \PDO::PARAM_INT)),
                                $where
                            );

                            $transOrigPointerField = isset($GLOBALS['TCA'][$tableName]['ctrl']['transOrigPointerField'])
                                ? $GLOBALS['TCA'][$tableName]['ctrl']['transOrigPointerField']
                                : null;

                            // _ENABLELANG: restrict to records whose translation-origin pointer is <= 0.
                            if (!empty($subpartParams['_ENABLELANG']) && $transOrigPointerField) {
                                $queryBuilder->andWhere(
                                    $queryBuilder->expr()->lte(
                                        $queryBuilder->quoteIdentifier($transOrigPointerField),
                                        0
                                    )
                                );
                            }

                            $statement = $queryBuilder->execute();

                            // Index the rows by the value of the selected field, so that the keys
                            // are the (de-duplicated) values to expand to.
                            $rows = [];
                            while ($row = $statement->fetch()) {
                                $rows[$row[$fieldName]] = $row;
                            }

                            $paramArray[$p] = array_merge($paramArray[$p], array_keys($rows));
                        }
                    }
                } else {
                    // Anything else is added as a literal value.
                    $paramArray[$p][] = $pV;
                }

                // Hook for processing own expandParameters place holder
                $hooks = isset($GLOBALS['TYPO3_CONF_VARS']['SC_OPTIONS']['crawler/class.tx_crawler_lib.php']['expandParameters'])
                    ? $GLOBALS['TYPO3_CONF_VARS']['SC_OPTIONS']['crawler/class.tx_crawler_lib.php']['expandParameters']
                    : null;
                if (is_array($hooks)) {
                    // paramArray is handed over by reference so that hooks can change the expansion result.
                    $_params = [
                        'pObj' => &$this,
                        'paramArray' => &$paramArray,
                        'currentKey' => $p,
                        'currentValue' => $pV,
                        'pid' => $pid
                    ];
                    foreach ($hooks as $_funcRef) {
                        GeneralUtility::callUserFunction($_funcRef, $_params, $this);
                    }
                }
            }

            // Make the value set unique and sort the parameter array by key (GET var name).
            $paramArray[$p] = array_unique($paramArray[$p]);
            ksort($paramArray);
        }

        return $paramArray;
    }
877
878
    /**
     * Builds URLs for the combinations of parameter values (output of expandParameters()).
     *
     * Takes the first GET var off $paramArray, appends each of its values to each URL in $urls
     * and recurses with the remaining GET vars until none are left. Empty values add nothing
     * to the URL. As soon as more than $this->maximumUrlsToCompile URLs have been collected,
     * the value loop for the current URL is left (the loop over the URLs itself continues).
     *
     * @param array $paramArray Output of expandParameters(): Array with keys (GET var names) and for each an array of values
     * @param array $urls URLs accumulated in this array (for recursion)
     * @return array
     */
    public function compileUrls($paramArray, array $urls)
    {
        // Nothing left to expand: the accumulated URLs are the result.
        if (empty($paramArray)) {
            return $urls;
        }

        // Take the first GET var name together with its value set off the stack.
        reset($paramArray);
        $getVarName = key($paramArray);
        $values = array_shift($paramArray);

        $compiled = [];
        foreach ($urls as $baseUrl) {
            foreach ($values as $value) {
                // Only non-empty values contribute a "&name=value" pair.
                $suffix = '';
                if (strcmp($value, '')) {
                    $suffix = '&' . rawurlencode($getVarName) . '=' . rawurlencode($value);
                }
                $compiled[] = $baseUrl . $suffix;

                if (count($compiled) > $this->maximumUrlsToCompile) {
                    break;
                }
            }
        }

        // Continue with the remaining GET vars.
        return $this->compileUrls($paramArray, $compiled);
    }
909
910
    /************************************
911
     *
912
     * Crawler log
913
     *
914
     ************************************/
915
916
    /**
     * Return array of records from crawler queue for input page ID
     *
     * With $doFlush set, nothing is selected: the matching entries are deleted via flushQueue()
     * and an empty array is returned.
     *
     * @param integer $id Page ID for which to look up log entries.
     * @param string $filter Filter: "pending" => all that is not yet run, "finished" => all complete ones, anything else => all entries
     * @param boolean $doFlush If TRUE, then the entries are DELETED(!) instead of selected!
     * @param boolean $doFullFlush If TRUE (together with $doFlush), all queue entries are deleted regardless of page ID and filter
     * @param integer $itemsPerPage Limit the amount of entries per page default is 10
     * @return array
     */
    public function getLogEntriesForPageId($id, $filter = '', $doFlush = false, $doFullFlush = false, $itemsPerPage = 10)
    {
        $queryBuilder = GeneralUtility::makeInstance(ConnectionPool::class)->getQueryBuilderForTable($this->tableName);
        $queryBuilder
            ->select('*')
            ->from($this->tableName)
            ->where(
                $queryBuilder->expr()->eq('page_id', $queryBuilder->createNamedParameter($id, \PDO::PARAM_INT))
            )
            ->orderBy('scheduled', 'DESC');

        // The flush (DELETE) condition is collected separately from the SELECT above;
        // the AND-composite joins its parts with AND when it is turned into SQL.
        $expressionBuilder = GeneralUtility::makeInstance(ConnectionPool::class)
            ->getConnectionForTable($this->tableName)
            ->getExpressionBuilder();
        $flushCondition = $expressionBuilder->andX();

        switch ($filter) {
            case 'pending':
                $queryBuilder->andWhere($queryBuilder->expr()->eq('exec_time', 0));
                $flushCondition->add($expressionBuilder->eq('exec_time', 0));
                break;
            case 'finished':
                $queryBuilder->andWhere($queryBuilder->expr()->gt('exec_time', 0));
                $flushCondition->add($expressionBuilder->gt('exec_time', 0));
                break;
        }

        // FIXME: Write unit test that ensures that the right records are deleted.
        if ($doFlush) {
            $flushCondition->add($expressionBuilder->eq('page_id', intval($id)));
            // flushQueue() expects the filter as an SQL string.
            $this->flushQueue($doFullFlush ? '1=1' : (string)$flushCondition);
            return [];
        }

        if ($itemsPerPage > 0) {
            $queryBuilder->setMaxResults((int)$itemsPerPage);
        }

        return $queryBuilder->execute()->fetchAll();
    }
970
971
    /**
     * Return array of records from crawler queue for input set ID
     *
     * With $doFlush set, nothing is selected: the matching entries are deleted via flushQueue()
     * and an empty array is returned.
     *
     * @param integer $set_id Set ID for which to look up log entries.
     * @param string $filter Filter: "pending" => all that is not yet run, "finished" => all complete ones, anything else => all entries
     * @param boolean $doFlush If TRUE, then the entries are DELETED(!) instead of selected!
     * @param boolean $doFullFlush If TRUE (together with $doFlush), all queue entries are deleted regardless of set ID and filter
     * @param integer $itemsPerPage Limit the amount of entries per page default is 10
     * @return array
     */
    public function getLogEntriesForSetId($set_id, $filter = '', $doFlush = false, $doFullFlush = false, $itemsPerPage = 10)
    {
        $queryBuilder = GeneralUtility::makeInstance(ConnectionPool::class)->getQueryBuilderForTable($this->tableName);
        $queryBuilder
            ->select('*')
            ->from($this->tableName)
            ->where(
                $queryBuilder->expr()->eq('set_id', $queryBuilder->createNamedParameter($set_id, \PDO::PARAM_INT))
            )
            ->orderBy('scheduled', 'DESC');

        // The flush (DELETE) condition is collected separately from the SELECT above;
        // the AND-composite joins its parts with AND when it is turned into SQL.
        $expressionBuilder = GeneralUtility::makeInstance(ConnectionPool::class)
            ->getConnectionForTable($this->tableName)
            ->getExpressionBuilder();
        $flushCondition = $expressionBuilder->andX();

        // FIXME: Write Unit tests for Filters
        switch ($filter) {
            case 'pending':
                $queryBuilder->andWhere($queryBuilder->expr()->eq('exec_time', 0));
                $flushCondition->add($expressionBuilder->eq('exec_time', 0));
                break;
            case 'finished':
                $queryBuilder->andWhere($queryBuilder->expr()->gt('exec_time', 0));
                $flushCondition->add($expressionBuilder->gt('exec_time', 0));
                break;
        }

        // FIXME: Write unit test that ensures that the right records are deleted.
        if ($doFlush) {
            $flushCondition->add($expressionBuilder->eq('set_id', intval($set_id)));
            // An empty filter makes flushQueue() remove every entry; otherwise it gets the SQL string.
            $this->flushQueue($doFullFlush ? '' : (string)$flushCondition);
            return [];
        }

        if ($itemsPerPage > 0) {
            $queryBuilder->setMaxResults((int)$itemsPerPage);
        }

        return $queryBuilder->execute()->fetchAll();
    }
1024
1025
    /**
     * Removes queue entries
     *
     * If an observer is registered for the "queueEntryFlush" event, the affected entries are
     * looked up per set_id and posted to the event dispatcher before they are deleted.
     *
     * @param string $where SQL related filter for the entries which should be removed; an empty filter removes all entries
     * @return void
     */
    protected function flushQueue($where = '')
    {
        // An empty filter means "everything".
        $realWhere = strlen((string)$where) > 0 ? $where : '1=1';

        if (EventDispatcher::getInstance()->hasObserver('queueEntryFlush')) {
            // Every statement gets its own query builder: from() appends to the FROM part,
            // so re-using one builder would accumulate the table once per statement.
            // selectLiteral() is needed because select() treats its arguments as identifiers.
            $groups = $this->getQueryBuilder($this->tableName)
                ->selectLiteral('DISTINCT set_id')
                ->from($this->tableName)
                ->where($realWhere)
                ->execute()
                ->fetchAll();
            if (is_array($groups)) {
                foreach ($groups as $group) {
                    $subSetQueryBuilder = $this->getQueryBuilder($this->tableName);
                    // NOTE(review): other code in this class addresses queue rows by "qid";
                    // confirm that the queue table really has a "uid" column.
                    $subSet = $subSetQueryBuilder
                        ->select('uid', 'set_id')
                        ->from($this->tableName)
                        ->where(
                            $realWhere,
                            $subSetQueryBuilder->expr()->eq('set_id', (int)$group['set_id'])
                        )
                        ->execute()
                        ->fetchAll();
                    EventDispatcher::getInstance()->post('queueEntryFlush', $group['set_id'], $subSet);
                }
            }
        }

        $this->getQueryBuilder($this->tableName)
            ->delete($this->tableName)
            ->where($realWhere)
            ->execute();
    }
1065
1066
    /**
     * Stores a call back entry in the crawler queue (typically called from hooks,
     * see indexed search class "class.crawler.php").
     *
     * @param integer $setId Set ID
     * @param array $params Parameters to pass to call back function; anything but an array is replaced by an empty array
     * @param string $callBack Call back object reference, eg. 'EXT:indexed_search/class.crawler.php:&tx_indexedsearch_crawler'
     * @param integer $page_id Page ID to attach it to
     * @param integer $schedule Time at which to activate; 0 means the current time
     * @return void
     */
    public function addQueueEntry_callBack($setId, $params, $callBack, $page_id = 0, $schedule = 0)
    {
        $parameters = is_array($params) ? $params : [];
        $parameters['_CALLBACKOBJ'] = $callBack;

        // Fall back to "now" when no schedule time was given.
        $scheduledAt = (int)$schedule;
        if (!$scheduledAt) {
            $scheduledAt = $this->getCurrentTime();
        }

        $queueRow = [
            'page_id' => (int)$page_id,
            'parameters' => serialize($parameters),
            'scheduled' => $scheduledAt,
            'exec_time' => 0,
            'set_id' => (int)$setId,
            'result_data' => '',
        ];

        $connection = GeneralUtility::makeInstance(ConnectionPool::class)->getConnectionForTable('tx_crawler_queue');
        $connection->insert('tx_crawler_queue', $queueRow);
    }
1096
1097
    /************************************
1098
     *
1099
     * URL setting
1100
     *
1101
     ************************************/
1102
1103
    /**
     * Puts a URL into the crawler queue.
     *
     * When $this->registerQueueEntriesInternallyOnly is set, the entry is only collected in
     * $this->queueEntries and nothing is written to the database. Otherwise the entry is inserted
     * unless the duplication check finds an equal entry; in both cases an event is posted.
     *
     * @param integer $id Page ID
     * @param string $url Complete URL
     * @param array $subCfg Sub configuration array (from TS config)
     * @param integer $tstamp Scheduled-time
     * @param string $configurationHash (optional) configuration hash
     * @param bool $skipInnerDuplicationCheck (optional) skip inner duplication check
     * @return bool TRUE if the URL was inserted into the queue table, FALSE otherwise
     */
    public function addUrl(
        $id,
        $url,
        array $subCfg,
        $tstamp,
        $configurationHash = '',
        $skipInnerDuplicationCheck = false
    ) {
        // Parameters stored with the queue entry, starting with the URL itself.
        $parameters = ['url' => $url];

        // Frontend user group simulation: unique integer list, only stored when non-empty.
        $feUserGroups = implode(',', array_unique(GeneralUtility::intExplode(',', $subCfg['userGroups'], true)));
        if ($feUserGroups) {
            $parameters['feUserGroupList'] = $feUserGroups;
        }

        // Processing instructions and their optional parameters.
        $parameters['procInstructions'] = GeneralUtility::trimExplode(',', $subCfg['procInstrFilter']);
        if (is_array($subCfg['procInstrParams.'])) {
            $parameters['procInstrParams'] = $subCfg['procInstrParams.'];
        }

        // The queue row; the hash is calculated over the serialized parameters.
        $serializedParameters = serialize($parameters);
        $fieldArray = [
            'page_id' => (int)$id,
            'parameters' => $serializedParameters,
            'parameters_hash' => GeneralUtility::shortMD5($serializedParameters),
            'configuration_hash' => $configurationHash,
            'scheduled' => $tstamp,
            'exec_time' => 0,
            'set_id' => intval($this->setID),
            'result_data' => '',
            'configuration' => $subCfg['key'],
        ];

        // Internal registration only: keep the entry in memory, nothing is stored in the database.
        if ($this->registerQueueEntriesInternallyOnly) {
            $this->queueEntries[] = $fieldArray;
            return false;
        }

        // Look for an already queued, equal entry unless the check is skipped.
        $duplicates = [];
        if (!$skipInnerDuplicationCheck) {
            $duplicates = $this->getDuplicateRowsIfExist($tstamp, $fieldArray);
        }

        if (!empty($duplicates)) {
            EventDispatcher::getInstance()->post('duplicateUrlInQueue', $this->setID, ['rows' => $duplicates, 'fieldArray' => $fieldArray]);
            return false;
        }

        $queueConnection = GeneralUtility::makeInstance(ConnectionPool::class)->getConnectionForTable('tx_crawler_queue');
        $queueConnection->insert('tx_crawler_queue', $fieldArray);
        $uid = $queueConnection->lastInsertId('tx_crawler_queue', 'qid');
        EventDispatcher::getInstance()->post('urlAddedToQueue', $this->setID, ['uid' => $uid, 'fieldArray' => $fieldArray]);

        return true;
    }
1182
1183
    /**
     * This method determines duplicates for a queue entry with the same parameters and this timestamp.
     * If the timestamp is in the past, it will check if there is any unprocessed queue entry in the past.
     * If the timestamp is in the future it will check, if the queued entry has exactly the same timestamp
     *
     * @param int $tstamp Scheduled time of the entry to be added
     * @param array $fieldArray Queue row to be added; "page_id" and "parameters_hash" are used for the comparison
     *
     * @return array List of "qid" values of the duplicate entries, empty if there are none
     *
     * TODO: Write Functional Tests
     */
    protected function getDuplicateRowsIfExist($tstamp, $fieldArray)
    {
        $rows = [];

        $currentTime = $this->getCurrentTime();

        $queryBuilder = GeneralUtility::makeInstance(ConnectionPool::class)->getQueryBuilderForTable($this->tableName);
        $queryBuilder
            ->select('qid')
            ->from('tx_crawler_queue');

        if ($tstamp <= $currentTime) {
            // Entry is scheduled with "now" (or earlier): match entries that are due already.
            if (!empty($this->extensionSettings['enableTimeslot'])) {
                // With the timeslot setting, entries scheduled within +/- 100 seconds of now match as well.
                $timeBegin = $currentTime - 100;
                $timeEnd = $currentTime + 100;
                $queryBuilder
                    ->where(
                        'scheduled BETWEEN ' . $timeBegin . ' AND ' . $timeEnd . ''
                    )
                    ->orWhere(
                        $queryBuilder->expr()->lte('scheduled', $currentTime)
                    );
            } else {
                $queryBuilder
                    ->where(
                        $queryBuilder->expr()->lte('scheduled', $currentTime)
                    );
            }
        } elseif ($tstamp > $currentTime) {
            // Entries with a timestamp in the future need to have the same schedule time.
            $queryBuilder
                ->where(
                    $queryBuilder->expr()->eq('scheduled', $queryBuilder->createNamedParameter($tstamp, \PDO::PARAM_INT))
                );
        }

        // Only entries that have not been processed yet count as duplicates: exec_time is
        // still 0 for those (readUrl() sets it when it picks an entry up).
        // NOTE(review): process_id is assumed to be 0/empty as long as no process has
        // claimed the entry - confirm against the queue table definition.
        $statement = $queryBuilder
            ->andWhere('exec_time = 0')
            ->andWhere('process_id = 0')
            ->andWhere($queryBuilder->expr()->eq('page_id', $queryBuilder->createNamedParameter($fieldArray['page_id'], \PDO::PARAM_INT)))
            ->andWhere($queryBuilder->expr()->eq('parameters_hash', $queryBuilder->createNamedParameter($fieldArray['parameters_hash'], \PDO::PARAM_STR)))
            ->execute();

        while ($row = $statement->fetch()) {
            $rows[] = $row['qid'];
        }

        return $rows;
    }
1244
1245
    /**
     * Provides the current Unix timestamp as reported by time().
     *
     * @return int
     */
    public function getCurrentTime()
    {
        $now = time();

        return $now;
    }
1254
1255
    /************************************
1256
     *
1257
     * URL reading
1258
     *
1259
     ************************************/
1260
1261
    /**
     * Read URL for single queue entry
     *
     * Loads the queue entry, marks it as taken by setting exec_time, executes it through the
     * queue executor and stores the serialized result in result_data. Signals are emitted before
     * the execution and before the result is stored.
     *
     * @param integer $queueId
     * @param boolean $force If set, will process even if exec_time has been set!
     * @return integer Bit mask; CLI_STATUS_POLLABLE_PROCESSED is set when a registered pollable
     *                 instruction reported success. 0 if the entry was not found or nothing was polled.
     */
    public function readUrl($queueId, $force = false)
    {
        $ret = 0;
        $this->logger->debug('crawler-readurl start ' . microtime(true));

        // Get entry:
        $queryBuilder = GeneralUtility::makeInstance(ConnectionPool::class)->getQueryBuilderForTable($this->tableName);
        $queryBuilder
            ->select('*')
            ->from('tx_crawler_queue')
            ->where(
                $queryBuilder->expr()->eq('qid', $queryBuilder->createNamedParameter($queueId, \PDO::PARAM_INT))
            );
        if (!$force) {
            // Without force only entries that were not executed yet and are assigned to a process run.
            $queryBuilder
                ->andWhere('exec_time = 0')
                ->andWhere('process_scheduled > 0');
        }
        $queueRec = $queryBuilder->execute()->fetch();

        if (!is_array($queueRec)) {
            // No matching entry: nothing processed. Return the (empty) status mask as documented.
            return $ret;
        }

        SignalSlotUtility::emitSignal(
            __CLASS__,
            SignalSlotUtility::SIGNNAL_QUEUEITEM_PREPROCESS,
            [$queueId, &$queueRec]
        );

        // Set exec_time to lock record:
        $field_array = ['exec_time' => $this->getCurrentTime()];

        if (isset($this->processID)) {
            // If multiprocessing is used we need to store the id of the process which has handled this entry
            $field_array['process_id_completed'] = $this->processID;
        }

        $queueConnection = GeneralUtility::makeInstance(ConnectionPool::class)->getConnectionForTable('tx_crawler_queue');
        $queueConnection->update(
            'tx_crawler_queue',
            $field_array,
            [ 'qid' => (int)$queueId ]
        );

        $result = $this->queueExecutor->executeQueueItem($queueRec, $this);

        // NOTE(review): the content handed to unserialize() comes from the executed queue item;
        // if it can contain data from outside the system, unserialize() is unsafe here.
        $resultData = isset($result['content']) ? unserialize($result['content']) : false;

        // Atm there's no need to point to specific pollable extensions
        $pollables = isset($GLOBALS['TYPO3_CONF_VARS']['EXTCONF']['crawler']['pollSuccess'])
            ? $GLOBALS['TYPO3_CONF_VARS']['EXTCONF']['crawler']['pollSuccess']
            : null;
        // isset() also covers a failed unserialize() (FALSE) and results without processing instructions.
        $procInstructions = isset($resultData['parameters']['procInstructions'])
            ? $resultData['parameters']['procInstructions']
            : null;

        if (is_array($pollables) && is_array($procInstructions)) {
            foreach ($pollables as $pollable) {
                // Only check the success value if the instruction is running;
                // it is important to name the pollSuccess key same as the procInstructions key
                if (in_array($pollable, $procInstructions) && !empty($resultData['success'][$pollable])) {
                    $ret |= self::CLI_STATUS_POLLABLE_PROCESSED;
                }
            }
        }

        // Set result in log which also denotes the end of the processing of this entry.
        $field_array = ['result_data' => serialize($result)];

        SignalSlotUtility::emitSignal(
            __CLASS__,
            SignalSlotUtility::SIGNNAL_QUEUEITEM_POSTPROCESS,
            [$queueId, &$field_array]
        );

        $queueConnection->update(
            'tx_crawler_queue',
            $field_array,
            [ 'qid' => (int)$queueId ]
        );

        $this->logger->debug('crawler-readurl stop ' . microtime(true));
        return $ret;
    }
1351
1352
    /**
     * Executes a queue entry that has not been stored yet.
     *
     * The entry is inserted with exec_time already set, executed through the queue executor,
     * and its serialized result is written to result_data afterwards. The post-process signal
     * is emitted before the result is stored.
     *
     * @param array $field_array Queue field array,
     *
     * @return string Result of the queue executor for this entry
     */
    public function readUrlFromArray($field_array)
    {
        $queueConnection = GeneralUtility::makeInstance(ConnectionPool::class)->getConnectionForTable('tx_crawler_queue');

        // Setting exec_time right away marks the record as taken.
        $field_array['exec_time'] = $this->getCurrentTime();
        $queueConnection->insert('tx_crawler_queue', $field_array);

        // The executor gets the row including its freshly assigned queue id.
        $queueId = $queueConnection->lastInsertId('tx_crawler_queue', 'qid');
        $field_array['qid'] = $queueId;

        $result = $this->queueExecutor->executeQueueItem($field_array, $this);

        // Storing the result also denotes the end of the processing of this entry.
        $resultFields = ['result_data' => serialize($result)];

        SignalSlotUtility::emitSignal(
            __CLASS__,
            SignalSlotUtility::SIGNNAL_QUEUEITEM_POSTPROCESS,
            [$queueId, &$resultFields]
        );

        $queueConnection->update('tx_crawler_queue', $resultFields, ['qid' => $queueId]);

        return $result;
    }
1389
1390
    /*****************************
1391
     *
1392
     * Compiling URLs to crawl - tools
1393
     *
1394
     *****************************/
1395
1396
    /**
     * Walks the page tree below a root page and compiles the crawler rows for every page found.
     *
     * The passed options are stored on the instance because drawURLs_addRowsForPage() reads them
     * from there. Mount point pages additionally get the rows of the mounted tree, with $this->MP
     * set to "<mount_pid>-<mount point uid>" while those rows are built.
     *
     * @param int $id Root page id to start from.
     * @param int $depth Depth of tree, 0=only id-page, 1= on sublevel, 99 = infinite
     * @param int $scheduledTime Unix Time when the URL is timed to be visited when put in queue
     * @param int $reqMinute Number of requests per minute (creates the interleave between requests)
     * @param bool $submitCrawlUrls If set, submits the URLs to queue in database (real crawling)
     * @param bool $downloadCrawlUrls If set (and submitcrawlUrls is false) will fill $downloadUrls with entries)
     * @param array $incomingProcInstructions Array of processing instructions
     * @param array $configurationSelection Array of configuration keys
     * @return string Concatenated output of drawURLs_addRowsForPage() for all traversed pages
     */
    public function getPageTreeAndUrls(
        $id,
        $depth,
        $scheduledTime,
        $reqMinute,
        $submitCrawlUrls,
        $downloadCrawlUrls,
        array $incomingProcInstructions,
        array $configurationSelection
    ) {
        $this->scheduledTime = $scheduledTime;
        $this->reqMinute = $reqMinute;
        $this->submitCrawlUrls = $submitCrawlUrls;
        $this->downloadCrawlUrls = $downloadCrawlUrls;
        $this->incomingProcInstructions = $incomingProcInstructions;
        $this->incomingConfigurationSelection = $configurationSelection;

        $this->duplicateTrack = [];
        $this->downloadUrls = [];

        // Drawing tree:
        /* @var PageTreeView $tree */
        $tree = GeneralUtility::makeInstance(PageTreeView::class);
        $perms_clause = $GLOBALS['BE_USER']->getPagePermsClause(1);
        $tree->init('AND ' . $perms_clause);

        $pageInfo = BackendUtility::readPageAccess($id, $perms_clause);
        if (is_array($pageInfo)) {
            // Set root row:
            $tree->tree[] = [
                'row' => $pageInfo,
                'HTML' => $this->iconFactory->getIconForRecord('pages', $pageInfo, Icon::SIZE_SMALL)
            ];
        }

        // Get branch beneath:
        if ($depth) {
            $tree->getTree($id, $depth, '');
        }

        // Traverse page tree:
        $code = '';

        foreach ($tree->tree as $data) {
            // $this->MP is either false (no mount point context) or the string "<mount_pid>-<mount point uid>".
            $this->MP = false;

            // recognize mount points
            if ($data['row']['doktype'] == PageRepository::DOKTYPE_MOUNTPOINT) {
                // The record is read from "pages", so the query builder is requested for that table.
                $queryBuilder = GeneralUtility::makeInstance(ConnectionPool::class)->getQueryBuilderForTable('pages');
                $queryBuilder->getRestrictions()->removeAll()->add(GeneralUtility::makeInstance(DeletedRestriction::class));
                $mountPage = $queryBuilder
                    ->select('*')
                    ->from('pages')
                    ->where(
                        $queryBuilder->expr()->eq('uid', $queryBuilder->createNamedParameter($data['row']['uid'], \PDO::PARAM_INT))
                    )
                    ->execute()
                    ->fetch();

                // Without a matching record there is nothing to mount; the page is then handled like a
                // regular page instead of reading fields from a missing row.
                if (is_array($mountPage)) {
                    // fetch mounted pages
                    $this->MP = $mountPage['mount_pid'] . '-' . $data['row']['uid'];

                    $mountTree = GeneralUtility::makeInstance(PageTreeView::class);
                    $mountTree->init('AND ' . $perms_clause);
                    $mountTree->getTree($mountPage['mount_pid'], $depth);

                    foreach ($mountTree->tree as $mountData) {
                        $code .= $this->drawURLs_addRowsForPage(
                            $mountData['row'],
                            $mountData['HTML'] . BackendUtility::getRecordTitle('pages', $mountData['row'], true)
                        );
                    }

                    // replace page when mount_pid_ol is enabled
                    if ($mountPage['mount_pid_ol']) {
                        $data['row']['uid'] = $mountPage['mount_pid'];
                    } else {
                        // if the mount_pid_ol is not set the MP must not be used for the mountpoint page
                        $this->MP = false;
                    }
                }
            }

            $code .= $this->drawURLs_addRowsForPage(
                $data['row'],
                $data['HTML'] . BackendUtility::getRecordTitle('pages', $data['row'], true)
            );
        }

        return $code;
    }
1498
1499
    /**
     * Expands an exclude string into the list of page ids it covers.
     *
     * The string is a comma separated list of parts. Each part is either "<pid>" (that page only),
     * "<pid>+<depth>" (the page plus <depth> levels beneath it) or "<pid>+" (the page plus 99 levels
     * beneath it). Results are kept in static caches, i.e. they are shared by all calls in the
     * current PHP process.
     *
     * @param string $excludeString Exclude string
     * @return array Unique page ids; empty array for an empty exclude string
     */
    public function expandExcludeString($excludeString)
    {
        // internal static caches;
        static $expandedExcludeStringCache = [];
        static $treeCache = [];

        // Callers may pass null for "no exclude configured"; normalise it for use as array key.
        $cacheKey = (string)$excludeString;

        // isset() instead of empty(): an empty result is a valid cache entry, too.
        if (!isset($expandedExcludeStringCache[$cacheKey])) {
            $pidList = [];

            if (!empty($excludeString)) {
                /** @var PageTreeView $tree */
                $tree = GeneralUtility::makeInstance(PageTreeView::class);
                $tree->init('AND ' . $this->backendUser->getPagePermsClause(1));

                $excludeParts = GeneralUtility::trimExplode(',', $excludeString);

                foreach ($excludeParts as $excludePart) {
                    // Pad to two elements so that a part without "+" does not read a missing offset.
                    list($pid, $depth) = array_pad(GeneralUtility::trimExplode('+', $excludePart), 2, '');

                    // default is "page only" = "depth=0"; a "+" without a usable number means 99 levels
                    if (empty($depth)) {
                        $depth = strpos($excludePart, '+') !== false ? 99 : 0;
                    }
                    $depth = (int)$depth;

                    $pidList[] = $pid;

                    if ($depth > 0) {
                        if (!isset($treeCache[$pid][$depth])) {
                            $tree->reset();
                            $tree->getTree($pid, $depth);
                            $treeCache[$pid][$depth] = $tree->tree;
                        }

                        foreach ($treeCache[$pid][$depth] as $data) {
                            $pidList[] = $data['row']['uid'];
                        }
                    }
                }
            }

            $expandedExcludeStringCache[$cacheKey] = array_unique($pidList);
        }

        return $expandedExcludeStringCache[$cacheKey];
    }
1550
1551
    /**
     * Create the rows for display of the page tree
     * For each page a number of rows are shown displaying GET variable configuration
     *
     * Only configurations contained in $this->incomingConfigurationSelection are shown when that
     * selection is not empty. A page that is listed in a configuration's "exclude" setting gets a
     * "No entries" row for that configuration.
     *
     * @param array $pageRow Page row
     * @param string $pageTitleAndIcon Page icon and title for row (inserted as HTML, not escaped here)
     * @return string HTML <tr> content (one or more)
     */
    public function drawURLs_addRowsForPage(array $pageRow, $pageTitleAndIcon)
    {
        $skipMessage = '';

        // Get list of configurations
        $configurations = $this->getUrlsForPageRow($pageRow, $skipMessage);

        if (!empty($this->incomingConfigurationSelection)) {
            // remove configuration that does not match the current selection
            foreach ($configurations as $confKey => $confArray) {
                if (!in_array($confKey, $this->incomingConfigurationSelection)) {
                    unset($configurations[$confKey]);
                }
            }
        }

        // Traverse parameter combinations:
        $c = 0;
        $content = '';
        if (!empty($configurations)) {
            foreach ($configurations as $confKey => $confArray) {
                // Title column: only the first row carries it, spanning all configuration rows
                $titleClm = !$c
                    ? '<td rowspan="' . count($configurations) . '">' . $pageTitleAndIcon . '</td>'
                    : '';
                $rowClass = 'bgColor' . ($c % 2 ? '-20' : '-10');

                // The sub configuration keys read below are treated as optional.
                $subCfg = $confArray['subCfg'] ?? [];

                // Loose comparison on purpose: the exclude list mixes the ids taken from the exclude
                // string with the uids read from page rows.
                if (!in_array($pageRow['uid'], $this->expandExcludeString($subCfg['exclude'] ?? ''))) {
                    // URL list:
                    $urlList = $this->urlListFromUrlArray(
                        $confArray,
                        $pageRow,
                        $this->scheduledTime,
                        $this->reqMinute,
                        $this->submitCrawlUrls,
                        $this->downloadCrawlUrls,
                        $this->duplicateTrack,
                        $this->downloadUrls,
                        $this->incomingProcInstructions // if empty the urls won't be filtered by processing instructions
                    );

                    // Expanded parameters:
                    $paramExpanded = '';
                    $calcAccu = [];
                    $calcRes = 1;
                    foreach ($confArray['paramExpanded'] as $gVar => $gVal) {
                        $paramExpanded .= '
                            <tr>
                                <td class="bgColor4-20">' . htmlspecialchars('&' . $gVar . '=') . '<br/>' .
                                                '(' . count($gVal) . ')' .
                                                '</td>
                                <td class="bgColor4" nowrap="nowrap">' . nl2br(htmlspecialchars(implode(chr(10), $gVal))) . '</td>
                            </tr>
                        ';
                        $calcRes *= count($gVal);
                        $calcAccu[] = count($gVal);
                    }
                    $paramExpanded = '<table class="lrPadding c-list param-expanded">' . $paramExpanded . '</table>';
                    $paramExpanded .= 'Comb: ' . implode('*', $calcAccu) . '=' . $calcRes;

                    // Options (configuration values are escaped like the other cells of the row)
                    $optionValues = '';
                    if (!empty($subCfg['userGroups'])) {
                        $optionValues .= 'User Groups: ' . htmlspecialchars((string)$subCfg['userGroups']) . '<br/>';
                    }
                    if (!empty($subCfg['procInstrFilter'])) {
                        $optionValues .= 'ProcInstr: ' . htmlspecialchars((string)$subCfg['procInstrFilter']) . '<br/>';
                    }

                    // Compile row:
                    $content .= '
                        <tr class="' . $rowClass . '">
                            ' . $titleClm . '
                            <td>' . htmlspecialchars($confKey) . '</td>
                            <td>' . nl2br(htmlspecialchars(rawurldecode(trim(str_replace('&', chr(10) . '&', GeneralUtility::implodeArrayForUrl('', $confArray['paramParsed'])))))) . '</td>
                            <td>' . $paramExpanded . '</td>
                            <td nowrap="nowrap">' . $urlList . '</td>
                            <td nowrap="nowrap">' . $optionValues . '</td>
                            <td nowrap="nowrap">' . DebugUtility::viewArray($subCfg['procInstrParams.'] ?? null) . '</td>
                        </tr>';
                } else {
                    $content .= '<tr class="' . $rowClass . '">
                            ' . $titleClm . '
                            <td>' . htmlspecialchars($confKey) . '</td>
                            <td colspan="5"><em>No entries</em> (Page is excluded in this configuration)</td>
                        </tr>';
                }

                $c++;
            }
        } else {
            $message = !empty($skipMessage) ? ' (' . $skipMessage . ')' : '';

            // Compile row:
            $content .= '
                <tr class="bgColor-20" style="border-bottom: 1px solid black;">
                    <td>' . $pageTitleAndIcon . '</td>
                    <td colspan="6"><em>No entries</em>' . $message . '</td>
                </tr>';
        }

        return $content;
    }
1665
1666
    /*****************************
1667
     *
1668
     * CLI functions
1669
     *
1670
     *****************************/
1671
1672
    /**
     * Running the functionality of the CLI (crawling URLs from queue)
     *
     * Runs the CLI hooks, purges old executed entries (when "purgeQueueDays" is configured), picks up
     * to $countInARun due queue entries, assigns them to the current process and reads them one by one.
     *
     * @param int $countInARun Maximum number of queue entries to pick up
     * @param int $sleepTime Pause after each entry, passed to usleep()
     * @param int $sleepAfterFinish Pause after the batch, passed to sleep()
     * @return int Bitmask built from the readUrl() results and the self::CLI_STATUS_* flags set here
     */
    public function CLI_run($countInARun, $sleepTime, $sleepAfterFinish)
    {
        $result = 0;
        $counter = 0;

        // First, run hooks:
        $this->CLI_runHooks();

        // Clean up the queue
        $purgeQueueDays = intval($this->extensionSettings['purgeQueueDays']);
        if ($purgeQueueDays > 0) {
            $purgeDate = $this->getCurrentTime() - 24 * 60 * 60 * $purgeQueueDays;

            $deleted = GeneralUtility::makeInstance(ConnectionPool::class)
                ->getQueryBuilderForTable($this->tableName)
                ->delete($this->tableName)
                ->where('exec_time != 0 AND exec_time < ' . $purgeDate)
                ->execute();

            if ($deleted === false) {
                $this->logger->info('Records could not be deleted.');
            }
        }

        // Select entries:
        //TODO Shouldn't this reside within the transaction?
        $selectBuilder = GeneralUtility::makeInstance(ConnectionPool::class)->getQueryBuilderForTable($this->tableName);
        $rows = $selectBuilder
            ->select('qid', 'scheduled')
            ->from('tx_crawler_queue')
            ->where(
                $selectBuilder->expr()->eq('exec_time', 0),
                $selectBuilder->expr()->eq('process_scheduled', 0),
                $selectBuilder->expr()->lte('scheduled', $this->getCurrentTime())
            )
            ->orderBy('scheduled')
            ->addOrderBy('qid')
            ->setMaxResults($countInARun)
            ->execute()
            ->fetchAll();

        if (empty($rows)) {
            $this->CLI_debug("Nothing within queue which needs to be processed (" . $this->CLI_buildProcessId() . ")");

            return $result;
        }

        $qidList = array_column($rows, 'qid');
        $processId = $this->CLI_buildProcessId();

        // Reserve the queue entries for this process. The transaction handling (BEGIN/COMMIT/ROLLBACK)
        // that used to surround this is currently disabled.
        //TODO make sure we're not taking assigned queue-entries

        // save the number of assigned queue entries to determine how many have been processed later
        $updateBuilder = GeneralUtility::makeInstance(ConnectionPool::class)->getQueryBuilderForTable($this->tableName);
        $numberOfAffectedRows = $updateBuilder
            ->update('tx_crawler_queue')
            ->where(
                $updateBuilder->expr()->in('qid', $qidList)
            )
            ->set('process_scheduled', $this->getCurrentTime())
            ->set('process_id', $processId)
            ->execute();

        GeneralUtility::makeInstance(ConnectionPool::class)
            ->getConnectionForTable('tx_crawler_process')
            ->update(
                'tx_crawler_process',
                ['assigned_items_count' => (int)$numberOfAffectedRows],
                ['process_id' => $processId]
            );

        // Fewer (or more) rows updated than selected is treated as a collision with another process.
        if ($numberOfAffectedRows != count($qidList)) {
            $this->CLI_debug("Nothing processed due to multi-process collision (" . $this->CLI_buildProcessId() . ")");

            return ($result | self::CLI_STATUS_ABORTED);
        }

        foreach ($rows as $row) {
            $result |= $this->readUrl($row['qid']);

            $counter++;
            usleep(intval($sleepTime)); // Just to relax the system

            // If the CLI has been disabled since the run started, return right away;
            // the process is NOT marked as ended in that case.
            if ($this->getDisabled()) {
                return ($result | self::CLI_STATUS_ABORTED);
            }

            if (!$this->CLI_checkIfProcessIsActive($this->CLI_buildProcessId())) {
                $this->CLI_debug("conflict / timeout (" . $this->CLI_buildProcessId() . ")");

                //TODO might need an additional returncode
                $result |= self::CLI_STATUS_ABORTED;
                break; //possible timeout
            }
        }

        sleep(intval($sleepAfterFinish));

        $this->CLI_debug('Rows: ' . $counter . " (" . $this->CLI_buildProcessId() . ")");

        if ($counter > 0) {
            $result |= self::CLI_STATUS_PROCESSED;
        }

        return $result;
    }
1798
1799
    /**
     * Activate hooks
     *
     * Instantiates every entry registered in
     * $GLOBALS['TYPO3_CONF_VARS']['EXTCONF']['crawler']['cli_hooks'] and calls crawler_init()
     * on it, passing this controller.
     *
     * @return void
     */
    public function CLI_runHooks()
    {
        $registeredHooks = $GLOBALS['TYPO3_CONF_VARS']['EXTCONF']['crawler']['cli_hooks'] ?? [];

        foreach ($registeredHooks as $hookReference) {
            $hook = GeneralUtility::makeInstance($hookReference);
            if (!is_object($hook)) {
                continue;
            }

            $hook->crawler_init($this);
        }
    }
1813
1814
    /**
     * Try to acquire a new process with the given id
     * also performs some auto-cleanup for orphan processes
     * @todo preemption might not be the most elegant way to clean up
     *
     * Active, non-deleted processes whose ttl lies in the past count as orphans and are released;
     * all other active processes count against the configured "processLimit".
     *
     * @param string $id identification string for the process
     * @return bool true when a process record was inserted, false when the limit is reached
     *              or no system process id could be determined
     */
    public function CLI_checkAndAcquireNewProcess($id)
    {
        $queryBuilder = GeneralUtility::makeInstance(ConnectionPool::class)->getQueryBuilderForTable($this->tableName);

        $systemProcessId = getmypid();
        if ($systemProcessId < 1) {
            return false;
        }

        // The transaction (BEGIN) that used to start here is currently disabled.
        $activeProcesses = $queryBuilder
            ->select('process_id', 'ttl')
            ->from('tx_crawler_process')
            ->where(
                'active = 1 AND deleted = 0'
            )
            ->execute();

        $currentTime = $this->getCurrentTime();

        $runningCount = 0;
        $orphanProcesses = [];
        while ($process = $activeProcesses->fetch()) {
            if ($process['ttl'] < $currentTime) {
                $orphanProcesses[] = $process['process_id'];
                continue;
            }
            $runningCount++;
        }

        // if there are less than allowed active processes then add a new one
        $processLimit = intval($this->extensionSettings['processLimit']);
        $acquired = $runningCount < $processLimit;

        if ($acquired) {
            $this->CLI_debug("add process " . $this->CLI_buildProcessId() . " (" . ($runningCount + 1) . "/" . $processLimit . ")");

            GeneralUtility::makeInstance(ConnectionPool::class)
                ->getConnectionForTable('tx_crawler_process')
                ->insert(
                    'tx_crawler_process',
                    [
                        'process_id' => $id,
                        'active' => 1,
                        'ttl' => $currentTime + (int)$this->extensionSettings['processMaxRunTime'],
                        'system_process_id' => $systemProcessId
                    ]
                );
        } else {
            $this->CLI_debug("Processlimit reached (" . ($runningCount) . "/" . $processLimit . ")");
        }

        $this->processRepository->deleteProcessesMarkedAsDeleted();
        $this->CLI_releaseProcesses($orphanProcesses, true); // maybe this should be somehow included into the current lock

        return $acquired;
    }
1878
1879
    /**
     * Release a process and the required resources
     *
     * Runs four UPDATE statements in a fixed order on one shared query builder:
     *  1. detach queue entries that belong to inactive processes,
     *  2. reset system_process_id of inactive processes that still have unexecuted queue entries,
     *  3. mark the requested processes as inactive when they have no unexecuted queue entries left,
     *  4. detach the unexecuted queue entries of the requested processes.
     *
     * @param  mixed    $releaseIds   string with a single process-id or array with multiple process-ids
     * @param  boolean  $withinLock   show whether the DB-actions are included within an existing lock;
     *                                currently without effect because the BEGIN/COMMIT statements it
     *                                guarded are disabled
     * @return boolean false when there was nothing to release, true otherwise
     */
    public function CLI_releaseProcesses($releaseIds, $withinLock = false)
    {
        $builder = GeneralUtility::makeInstance(ConnectionPool::class)->getQueryBuilderForTable($this->tableName);

        $processIds = is_array($releaseIds) ? $releaseIds : [$releaseIds];

        if (empty($processIds)) {
            return false;   //nothing to release
        }

        // some kind of 2nd chance algo - this way you need at least 2 processes to have a real cleanup
        // this ensures that a single process can't mess up the entire process table

        // 1. free all queue entries that are still assigned to a process which is not active
        $builder
            ->update('tx_crawler_queue', 'q')
            ->where(
                'q.process_id IN(SELECT p.process_id FROM tx_crawler_process as p WHERE p.active = 0)'
            )
            ->set('q.process_scheduled', 0)
            ->set('q.process_id', '')
            ->execute();

        // FIXME: Not entirely sure that this is equivalent to the previous version.
        // The previous implementation updated tx_crawler_process rows matching
        // "active=0 AND deleted=0 AND NOT EXISTS (queue entry with the same process_id and exec_time = 0)"
        // and set deleted = 1 and system_process_id = 0 on them.
        $builder->resetQueryPart('set');

        // 2.
        $builder
            ->update('tx_crawler_process')
            ->where(
                $builder->expr()->eq('active', 0),
                'process_id IN(SELECT q.process_id FROM tx_crawler_queue as q WHERE q.exec_time = 0)'
            )
            ->set('system_process_id', 0)
            ->execute();

        // 3. mark all requested processes as non-active
        // NOTE(review): the "set" part is not reset before this statement. Assuming set() appends to
        // the existing part (as in Doctrine DBAL), system_process_id = 0 is written here as well -
        // confirm that this is intended.
        $builder
            ->update('tx_crawler_process')
            ->where(
                'NOT EXISTS (
                SELECT * FROM tx_crawler_queue
                    WHERE tx_crawler_queue.process_id = tx_crawler_process.process_id
                    AND tx_crawler_queue.exec_time = 0
                )',
                $builder->expr()->in('process_id', $builder->createNamedParameter($processIds, Connection::PARAM_STR_ARRAY)),
                $builder->expr()->eq('deleted', 0)
            )
            ->set('active', 0)
            ->execute();

        $builder->resetQueryPart('set');

        // 4.
        $builder
            ->update('tx_crawler_queue')
            ->where(
                $builder->expr()->eq('exec_time', 0),
                $builder->expr()->in('process_id', $builder->createNamedParameter($processIds, Connection::PARAM_STR_ARRAY))
            )
            ->set('process_scheduled', 0)
            ->set('process_id', '')
            ->execute();

        return true;
    }
1973
1974
    /**
     * Check if there are still resources left for the process with the given id
     * Used to determine timeouts and to ensure a proper cleanup if there's a timeout
     *
     * The process id is a string identifier (see CLI_buildProcessId()), so it is compared as a
     * bound string parameter. Casting it to an integer would compare against a different value.
     *
     * @param  string $pid identification string for the process
     * @return boolean determines if the process is still active / has resources;
     *                 false when no process record with this id exists
     *
     * TODO: Please consider moving this to Domain Model for Process or in ProcessRepository
     */
    public function CLI_checkIfProcessIsActive($pid)
    {
        $queryBuilder = GeneralUtility::makeInstance(ConnectionPool::class)->getQueryBuilderForTable($this->tableName);

        $row = $queryBuilder
            ->select('active')
            ->from('tx_crawler_process')
            ->where(
                $queryBuilder->expr()->eq('process_id', $queryBuilder->createNamedParameter((string)$pid))
            )
            ->orderBy('ttl')
            ->execute()
            ->fetch();

        // Only the first record (lowest ttl) is evaluated.
        return is_array($row) && (int)$row['active'] === 1;
    }
2003
2004
    /**
     * Create a unique Id for the current process
     *
     * The id is derived from the current microtime on first use and then kept in
     * $this->processID, so repeated calls return the same value.
     *
     * @return string  the ID
     */
    public function CLI_buildProcessId()
    {
        if ($this->processID) {
            return $this->processID;
        }

        $this->processID = GeneralUtility::shortMD5($this->microtime(true));

        return $this->processID;
    }
2016
2017
    /**
     * Returns the result of PHP's microtime() for the given mode.
     *
     * @param bool $get_as_float Passed through to microtime()
     *
     * @return mixed Whatever microtime() returns for the given argument
     */
    protected function microtime($get_as_float = false)
    {
        $currentMicrotime = microtime($get_as_float);

        return $currentMicrotime;
    }
2026
2027
    /**
     * Prints a message to the stdout (only if debug-mode is enabled)
     *
     * Debug mode is enabled when the extension setting "processDebug" evaluates to a non-zero integer.
     *
     * @param  string $msg  the message
     */
    public function CLI_debug($msg)
    {
        if (!intval($this->extensionSettings['processDebug'])) {
            return;
        }

        echo $msg . "\n";
        flush();
    }
2039
2040
    /**
     * Cleans up entries that stayed for too long in the queue. These are:
     * - processed entries older than the "cleanUpProcessedAge" setting (in days)
     * - entries scheduled longer ago than the "cleanUpScheduledAge" setting (in days)
     *
     * NOTE(review): earlier documentation named 1.5 and 7 days - presumably the default settings; confirm.
     *
     * @return void
     */
    public function cleanUpOldQueueEntries()
    {
        $secondsPerDay = 86400; // 24*60*60 Seconds in 24 hours
        $now = time();

        $processedBefore = $now - $this->extensionSettings['cleanUpProcessedAge'] * $secondsPerDay;
        $scheduledBefore = $now - $this->extensionSettings['cleanUpScheduledAge'] * $secondsPerDay;

        $this->flushQueue(
            '(exec_time<>0 AND exec_time<' . $processedBefore . ') OR scheduled<=' . $scheduledBefore
        );
    }
2056
2057
    /**
     * Returns a md5 hash generated from a serialized configuration array.
     *
     * The keys "paramExpanded" and "URLs" are removed before hashing and therefore do not
     * influence the result.
     *
     * @param array $configuration
     *
     * @return string
     */
    protected function getConfigurationHash(array $configuration)
    {
        unset($configuration['paramExpanded'], $configuration['URLs']);

        $serializedConfiguration = serialize($configuration);

        return md5($serializedConfiguration);
    }
2070
2071
    /**
2072
     * Build a URL from a Page and the Query String. If the page has a Site configuration, it can be built by using
2073
     * the Site instance.
2074
     *
2075
     * @param int $pageId
2076
     * @param string $queryString
2077
     * @param string|null $alternativeBaseUrl
2078
     * @param int $httpsOrHttp see tx_crawler_configuration.force_ssl
2079
     * @return UriInterface
2080
     * @throws \TYPO3\CMS\Core\Exception\SiteNotFoundException
2081
     * @throws \TYPO3\CMS\Core\Routing\InvalidRouteArgumentsException
2082
     */
2083 2
    protected function getUrlFromPageAndQueryParameters(int $pageId, string $queryString, ?string $alternativeBaseUrl, int $httpsOrHttp): UriInterface
2084
    {
2085 2
        $site = GeneralUtility::makeInstance(SiteMatcher::class)->matchByPageId((int)$pageId);
2086 2
        if ($site instanceof Site) {
0 ignored issues
show
Bug introduced by
The class TYPO3\CMS\Core\Site\Entity\Site does not exist. Did you forget a USE statement, or did you not list all dependencies?

This error could be the result of:

1. Missing dependencies

PHP Analyzer uses your composer.json file (if available) to determine the dependencies of your project and to determine all the available classes and functions. It expects the composer.json to be in the root folder of your repository.

Are you sure this class is defined by one of your dependencies, or did you maybe not list a dependency in either the require or require-dev section?

2. Missing use statement

PHP does not complain about undefined classes in instanceof checks. For example, the following PHP code will work perfectly fine:

if ($x instanceof DoesNotExist) {
    // Do something.
}

If you have not tested against this specific condition, such errors might go unnoticed.

Loading history...
2087
            $queryString = ltrim($queryString, '?&');
2088
            $queryParts = [];
2089
            parse_str($queryString, $queryParts);
2090
            unset($queryParts['id']);
2091
            // workaround as long as we don't have native language support in crawler configurations
2092
            if (isset($queryParts['L'])) {
2093
                $queryParts['_language'] = $queryParts['L'];
2094
                unset($queryParts['L']);
2095
                $siteLanguage = $site->getLanguageById((int)$queryParts['_language']);
0 ignored issues
show
Unused Code introduced by
$siteLanguage is not used, you could remove the assignment.

This check looks for variable assignments that are either overwritten by other assignments or where the variable is not used subsequently.

$myVar = 'Value';
$higher = false;

if (rand(1, 6) > 3) {
    $higher = true;
} else {
    $higher = false;
}

Both the $myVar assignment in line 1 and the $higher assignment in line 2 are dead. The first because $myVar is never used and the second because $higher is always overwritten for every possible time line.

Loading history...
2096
            } else {
2097
                $siteLanguage = $site->getDefaultLanguage();
0 ignored issues
show
Unused Code introduced by
$siteLanguage is not used, you could remove the assignment.

This check looks for variable assignments that are either overwritten by other assignments or where the variable is not used subsequently.

$myVar = 'Value';
$higher = false;

if (rand(1, 6) > 3) {
    $higher = true;
} else {
    $higher = false;
}

Both the $myVar assignment in line 1 and the $higher assignment in line 2 are dead. The first is dead because $myVar is never used, and the second because $higher is always overwritten on every possible execution path.

Loading history...
2098
            }
2099
            $url = $site->getRouter()->generateUri($pageId, $queryParts);
2100
            if (!empty($alternativeBaseUrl)) {
2101
                $alternativeBaseUrl = new Uri($alternativeBaseUrl);
2102
                $url = $url->withHost($alternativeBaseUrl->getHost());
2103
                $url = $url->withScheme($alternativeBaseUrl->getScheme());
2104
                $url = $url->withPort($alternativeBaseUrl->getPort());
2105
            }
2106
        } else {
2107
            // Technically this is not possible with site handling, but kept for backwards-compatibility reasons
2108
            // Once EXT:crawler is v10-only compatible, this should be removed completely
2109 2
            $baseUrl = ($alternativeBaseUrl ?: GeneralUtility::getIndpEnv('TYPO3_SITE_URL'));
2110 2
            $cacheHashCalculator = GeneralUtility::makeInstance(CacheHashCalculator::class);
2111 2
            $queryString .= '&cHash=' . $cacheHashCalculator->generateForParameters($queryString);
2112 2
            $url = rtrim($baseUrl, '/') . '/index.php' . $queryString;
2113 2
            $url = new Uri($url);
2114
        }
2115
2116 2
        if ($httpsOrHttp === -1) {
2117
            $url = $url->withScheme('http');
2118 2
        } elseif ($httpsOrHttp === 1) {
2119
            $url = $url->withScheme('https');
2120
        }
2121
2122 2
        return $url;
2123
    }
2124
}
2125