Completed
Push — typo3v9 ( d49016...44c1fe )
by Tomas Norre
05:35 queued 53s
created

CrawlerController::getConfigurationsForBranch()   B

Complexity

Conditions 7
Paths 32

Size

Total Lines 44

Duplication

Lines 0
Ratio 0 %

Code Coverage

Tests 0
CRAP Score 56

Importance

Changes 0
Metric Value
cc 7
nc 32
nop 2
dl 0
loc 44
ccs 0
cts 30
cp 0
crap 56
rs 8.2826
c 0
b 0
f 0
1
<?php
2
namespace AOE\Crawler\Controller;
3
4
/***************************************************************
5
 *  Copyright notice
6
 *
7
 *  (c) 2019 AOE GmbH <[email protected]>
8
 *
9
 *  All rights reserved
10
 *
11
 *  This script is part of the TYPO3 project. The TYPO3 project is
12
 *  free software; you can redistribute it and/or modify
13
 *  it under the terms of the GNU General Public License as published by
14
 *  the Free Software Foundation; either version 3 of the License, or
15
 *  (at your option) any later version.
16
 *
17
 *  The GNU General Public License can be found at
18
 *  http://www.gnu.org/copyleft/gpl.html.
19
 *
20
 *  This script is distributed in the hope that it will be useful,
21
 *  but WITHOUT ANY WARRANTY; without even the implied warranty of
22
 *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
23
 *  GNU General Public License for more details.
24
 *
25
 *  This copyright notice MUST APPEAR in all copies of the script!
26
 ***************************************************************/
27
28
use AOE\Crawler\QueueExecutor;
29
use AOE\Crawler\Configuration\ExtensionConfigurationProvider;
30
use AOE\Crawler\Domain\Repository\ConfigurationRepository;
31
use AOE\Crawler\Domain\Repository\ProcessRepository;
32
use AOE\Crawler\Domain\Repository\QueueRepository;
33
use AOE\Crawler\Event\EventDispatcher;
34
use AOE\Crawler\Utility\SignalSlotUtility;
35
use Psr\Http\Message\UriInterface;
36
use Psr\Log\LoggerAwareInterface;
37
use Psr\Log\LoggerAwareTrait;
38
use TYPO3\CMS\Backend\Tree\View\PageTreeView;
39
use TYPO3\CMS\Backend\Utility\BackendUtility;
40
use TYPO3\CMS\Core\Authentication\BackendUserAuthentication;
41
use TYPO3\CMS\Core\Core\Environment;
42
use TYPO3\CMS\Core\Database\Connection;
43
use TYPO3\CMS\Core\Database\ConnectionPool;
44
use TYPO3\CMS\Core\Database\Query\Restriction\DeletedRestriction;
45
use TYPO3\CMS\Core\Http\Uri;
46
use TYPO3\CMS\Core\Imaging\Icon;
47
use TYPO3\CMS\Core\Imaging\IconFactory;
48
use TYPO3\CMS\Core\Routing\SiteMatcher;
49
use TYPO3\CMS\Core\Site\Entity\Site;
50
use TYPO3\CMS\Core\Type\Bitmask\Permission;
51
use TYPO3\CMS\Core\TypoScript\Parser\TypoScriptParser;
52
use TYPO3\CMS\Core\Utility\DebugUtility;
53
use TYPO3\CMS\Core\Utility\GeneralUtility;
54
use TYPO3\CMS\Core\Utility\MathUtility;
55
use TYPO3\CMS\Extbase\Object\ObjectManager;
56
use TYPO3\CMS\Frontend\Page\CacheHashCalculator;
57
use TYPO3\CMS\Frontend\Page\PageRepository;
58
59
/**
60
 * Class CrawlerController
61
 *
62
 * @package AOE\Crawler\Controller
63
 */
64
class CrawlerController implements LoggerAwareInterface
65
{
66
    use LoggerAwareTrait;
67
68
    const CLI_STATUS_NOTHING_PROCCESSED = 0;
    const CLI_STATUS_REMAIN = 1; // queue not empty
    const CLI_STATUS_PROCESSED = 2; // (some) queue items were processed
    const CLI_STATUS_ABORTED = 4; // instance didn't finish
    const CLI_STATUS_POLLABLE_PROCESSED = 8;

    /**
     * @var int
     */
    public $setID = 0;

    /**
     * @var string
     */
    public $processID = '';

    /**
     * @var array
     */
    public $duplicateTrack = [];

    /**
     * @var array
     */
    public $downloadUrls = [];

    /**
     * @var array
     */
    public $incomingProcInstructions = [];

    /**
     * @var array
     */
    public $incomingConfigurationSelection = [];

    /**
     * @var bool
     */
    public $registerQueueEntriesInternallyOnly = false;

    /**
     * @var array
     */
    public $queueEntries = [];

    /**
     * Plain-text "[date] url" entries appended by urlListFromUrlArray().
     *
     * @var array
     */
    public $urlList = [];

    /**
     * Extension configuration; loaded and normalised in the constructor.
     *
     * @var array
     */
    public $extensionSettings = [];

    /**
     * Mount Point
     *
     * False while no mount point is active. Otherwise a string that
     * getPageTSconfigForId() splits on "-" and getUrlsForPageId() appends
     * to the generated URLs as "&MP=...".
     *
     * @var bool|string
     */
    public $MP = false;

    /**
     * Path of the lock file; the crawler counts as disabled while this file
     * exists (see setDisabled() / getDisabled()).
     *
     * @var string
     */
    protected $processFilename;

    /**
     * Holds the internal access mode; can be 'gui', 'cli' or 'cli_im'
     *
     * @var string
     */
    protected $accessMode;

    /**
     * Backend user taken from $GLOBALS['BE_USER'] in the constructor.
     *
     * @var BackendUserAuthentication
     */
    private $backendUser;

    /**
     * @var int
     */
    private $scheduledTime = 0;

    /**
     * @var int
     */
    private $reqMinute = 0;

    /**
     * @var bool
     */
    private $submitCrawlUrls = false;

    /**
     * @var bool
     */
    private $downloadCrawlUrls = false;

    /**
     * @var QueueRepository
     */
    protected $queueRepository;

    /**
     * @var ProcessRepository
     */
    protected $processRepository;

    /**
     * @var ConfigurationRepository
     */
    protected $configurationRepository;

    /**
     * Name of the queue table queried by this controller.
     *
     * @var string
     */
    protected $tableName = 'tx_crawler_queue';

    /**
     * @var QueueExecutor
     */
    protected $queueExecutor;

    /**
     * Upper limit taken from the "maxCompileUrls" extension setting in the
     * constructor (default 10000).
     *
     * @var int
     */
    protected $maximumUrlsToCompile = 10000;

    /**
     * @var IconFactory
     */
    protected $iconFactory;
202
203
    /**
204
     * Method to set the accessMode can be gui, cli or cli_im
205
     *
206
     * @return string
207
     */
208 1
    public function getAccessMode()
209
    {
210 1
        return $this->accessMode;
211
    }
212
213
    /**
     * Stores the access mode.
     *
     * @param string $accessMode Documented values are 'gui', 'cli' and 'cli_im'; the value is stored as given
     * @return void
     */
    public function setAccessMode($accessMode)
    {
        $this->accessMode = $accessMode;
    }
220
221
    /**
     * Switches the "disabled" marker on or off.
     *
     * Disabling writes an empty file at the process filename; enabling removes
     * that file when it exists.
     *
     * @param  bool $disabled (optional, defaults to true)
     * @return void
     */
    public function setDisabled($disabled = true)
    {
        if ($disabled) {
            GeneralUtility::writeFile($this->processFilename, '');
            return;
        }

        // Enabling: only unlink when the marker is actually present.
        if (is_file($this->processFilename)) {
            unlink($this->processFilename);
        }
    }
237
238
    /**
     * Tells whether the crawler is currently disabled.
     *
     * @return bool true when the file at the process filename exists
     */
    public function getDisabled()
    {
        $markerFile = $this->processFilename;

        return is_file($markerFile);
    }
247
248
    /**
     * Overrides the path of the file used as "disabled" marker.
     *
     * @param string $filenameWithPath File name including its path
     *
     * @return void
     */
    public function setProcessFilename($filenameWithPath)
    {
        $this->processFilename = $filenameWithPath;
    }
257
258
    /**
     * Returns the path of the file used as "disabled" marker.
     *
     * @return string
     */
    public function getProcessFilename()
    {
        return $this->processFilename;
    }
265
266
    /************************************
267
     *
268
     * Getting URLs based on Page TSconfig
269
     *
270
     ************************************/
271
272 25
    /**
     * Wires up repositories and helpers and loads the extension settings.
     *
     * The backend user is read from $GLOBALS['BE_USER'] and the lock file path
     * defaults to "<var path>/locks/tx_crawler.proc". Settings are normalised:
     * "countInARun" falls back to 100 when it is not a positive integer,
     * "processLimit" is forced into 1..99 (default 1) and "maxCompileUrls"
     * into 1..1000000000 (default 10000).
     */
    public function __construct()
    {
        $objectManager = GeneralUtility::makeInstance(ObjectManager::class);
        $this->queueRepository = $objectManager->get(QueueRepository::class);
        $this->processRepository = $objectManager->get(ProcessRepository::class);
        $this->configurationRepository = $objectManager->get(ConfigurationRepository::class);
        $this->queueExecutor = $objectManager->get(QueueExecutor::class);
        $this->iconFactory = GeneralUtility::makeInstance(IconFactory::class);

        // The global may be absent (no notice is raised for that any more).
        $this->backendUser = $GLOBALS['BE_USER'] ?? null;
        $this->processFilename = Environment::getVarPath() . '/locks/tx_crawler.proc';

        /** @var ExtensionConfigurationProvider $configurationProvider */
        $configurationProvider = GeneralUtility::makeInstance(ExtensionConfigurationProvider::class);
        $settings = $configurationProvider->getExtensionConfiguration();
        $this->extensionSettings = is_array($settings) ? $settings : [];

        // Set defaults. Missing keys are read as 0 so that an incomplete
        // configuration silently falls back to the defaults instead of
        // raising "undefined index" notices.
        if (MathUtility::convertToPositiveInteger($this->extensionSettings['countInARun'] ?? 0) === 0) {
            $this->extensionSettings['countInARun'] = 100;
        }

        $this->extensionSettings['processLimit'] = MathUtility::forceIntegerInRange(
            $this->extensionSettings['processLimit'] ?? 0,
            1,
            99,
            1
        );
        $this->maximumUrlsToCompile = MathUtility::forceIntegerInRange(
            $this->extensionSettings['maxCompileUrls'] ?? 0,
            1,
            1000000000,
            10000
        );
    }
297
298
    /**
     * Replaces the extension settings (unserialized pendant of
     * $TYPO3_CONF_VARS['EXT']['extConf']['crawler']).
     *
     * The array is stored as given; the defaults applied in the constructor
     * are not re-applied here.
     *
     * @param array $extensionSettings
     * @return void
     */
    public function setExtensionSettings(array $extensionSettings)
    {
        $this->extensionSettings = $extensionSettings;
    }
308
309
    /**
     * Decides whether a page must be left out of crawling.
     *
     * The checks run in this order and the first one that matches wins:
     * hidden page (unless "crawlHiddenPages" is set), built-in doktype
     * restriction (3, 4 or >= 199), doktypes registered under "excludeDoktype",
     * and finally the "pageVeto" hooks.
     *
     * @param array $pageRow
     * @return false|string false if the page should be crawled, otherwise the message explaining why it is skipped
     */
    public function checkIfPageShouldBeSkipped(array $pageRow)
    {
        // Hidden pages are only crawled when explicitly enabled.
        if (!$this->extensionSettings['crawlHiddenPages'] && $pageRow['hidden']) {
            return 'Because page is hidden';
        }

        if (GeneralUtility::inList('3,4', $pageRow['doktype']) || $pageRow['doktype'] >= 199) {
            return 'Because doktype is not allowed';
        }

        $excludedDoktypes = $GLOBALS['TYPO3_CONF_VARS']['EXTCONF']['crawler']['excludeDoktype'] ?? [];
        foreach ($excludedDoktypes as $registrationKey => $doktypeList) {
            if (GeneralUtility::inList($doktypeList, $pageRow['doktype'])) {
                return 'Doktype was excluded by "' . $registrationKey . '"';
            }
        }

        // Veto hooks: each one returns false when the page is fine, anything
        // else (true or a message) when the page must not be crawled. The
        // first veto ends the evaluation.
        $vetoHooks = $GLOBALS['TYPO3_CONF_VARS']['EXTCONF']['crawler']['pageVeto'] ?? [];
        foreach ($vetoHooks as $hookKey => $hookFunction) {
            $params = [
                'pageRow' => $pageRow
            ];
            $veto = GeneralUtility::callUserFunction($hookFunction, $params, $this);
            if ($veto === false) {
                continue;
            }

            return is_string($veto)
                ? $veto
                : 'Veto from hook "' . htmlspecialchars($hookKey) . '"';
        }

        return false;
    }
368
369
    /**
     * Wrapper around getUrlsForPageId() that first applies the skip checks.
     * It returns an array of configurations and no urls!
     *
     * @param array $pageRow Page record with at least dok-type and uid columns.
     * @param string $skipMessage Passed by reference: receives the skip reason, or '' when the page is not skipped
     * @return array Configurations for the page, or an empty array when the page is skipped
     * @see getUrlsForPageId()
     */
    public function getUrlsForPageRow(array $pageRow, &$skipMessage = '')
    {
        $skipReason = $this->checkIfPageShouldBeSkipped($pageRow);

        if ($skipReason !== false) {
            $skipMessage = $skipReason;

            return [];
        }

        $configurations = $this->getUrlsForPageId($pageRow['uid']);
        $skipMessage = '';

        return $configurations;
    }
392
393
    /**
     * Tells whether the queue holds NO unprocessed entry (exec_time = 0) for
     * the given page id and configuration hash.
     * If there is none, callers can skip an inner detail check.
     *
     * @param  int $uid Page id
     * @param  string $configurationHash
     * @return bool true when no matching unprocessed entry exists
     */
    protected function noUnprocessedQueueEntriesForPageWithConfigurationHashExist($uid, $configurationHash)
    {
        $queryBuilder = GeneralUtility::makeInstance(ConnectionPool::class)->getQueryBuilderForTable($this->tableName);

        $constraints = [
            $queryBuilder->expr()->eq('page_id', (int)$uid),
            $queryBuilder->expr()->eq('configuration_hash', $queryBuilder->createNamedParameter($configurationHash)),
            $queryBuilder->expr()->eq('exec_time', 0),
        ];

        $unprocessedCount = $queryBuilder
            ->count('*')
            ->from($this->tableName)
            ->where(...$constraints)
            ->execute()
            ->fetchColumn();

        // Any truthy count means at least one unprocessed entry was found.
        return $unprocessedCount ? false : true;
    }
424
425
    /**
     * Creates a list of URLs from input array (and submits them to queue if asked for)
     * See Web > Info module script + "indexed_search"'s crawler hook-client using this!
     *
     * @param array $vv Information about URLs from pageRow to crawl (reads the keys "URLs" and "subCfg")
     * @param array $pageRow Page row (only "uid" is read)
     * @param int $scheduledTime Unix time to schedule indexing to, typically time()
     * @param int $reqMinute Number of requests per minute (creates the interleave between requests); 0 disables the interleave
     * @param bool $submitCrawlUrls If set, submits the URLs to queue
     * @param bool $downloadCrawlUrls If set (and $submitCrawlUrls is false) will fill $downloadUrls with entries
     * @param array $duplicateTrack Passed by reference; holds one key per url to make sure duplicates are not crawled
     * @param array $downloadUrls Passed by reference; filled with URLs for download if the flag is set
     * @param array $incomingProcInstructions Array of processing instructions
     * @return string List of URLs joined by "<br>" (meant for display in backend module), or an error text when no URLs were generated
     */
    public function urlListFromUrlArray(
        array $vv,
        array $pageRow,
        $scheduledTime,
        $reqMinute,
        $submitCrawlUrls,
        $downloadCrawlUrls,
        array &$duplicateTrack,
        array &$downloadUrls,
        array $incomingProcInstructions
    ) {
        if (!is_array($vv['URLs'] ?? null)) {
            return 'ERROR - no URL generated';
        }

        // Sub configuration values are optional (page TSconfig based sets do
        // not define "userGroups", for example), so read them defensively.
        $subCfg = $vv['subCfg'] ?? [];
        $procInstrFilter = $subCfg['procInstrFilter'] ?? '';
        $userGroups = $subCfg['userGroups'] ?? '';

        $urlLog = [];
        $pageId = (int)$pageRow['uid'];
        $configurationHash = $this->getConfigurationHash($vv);

        // The processing instruction filter is the same for every URL of this
        // configuration: when it does not match, nothing would be listed.
        if (!$this->drawURLs_PIfilter($procInstrFilter, $incomingProcInstructions)) {
            return '';
        }

        $skipInnerCheck = $this->noUnprocessedQueueEntriesForPageWithConfigurationHashExist($pageId, $configurationHash);

        // Seconds between two scheduled requests. A rate of 0 would divide by
        // zero, so it is treated as "no interleave".
        $secondsPerRequest = (float)$reqMinute === 0.0 ? 0 : 60 / $reqMinute;

        foreach ($vv['URLs'] as $urlQuery) {
            $url = (string)$this->getUrlFromPageAndQueryParameters(
                $pageId,
                $urlQuery,
                $subCfg['baseUrl'] ?? null,
                $subCfg['force_ssl'] ?? 0
            );

            // Create key by which to determine unique-ness:
            $uKey = $url . '|' . $userGroups . '|' . $procInstrFilter;

            if (isset($duplicateTrack[$uKey])) {
                // If the url key is registered just display it and do not resubmit it
                $urlLog[] = '<em><span class="text-muted">' . htmlspecialchars($url) . '</span></em>';
            } else {
                // Scheduled time, rounded down to the full minute:
                $schTime = $scheduledTime + round(count($duplicateTrack) * $secondsPerRequest);
                $schTime = floor($schTime / 60) * 60;
                $formattedDate = BackendUtility::datetime($schTime);
                $this->urlList[] = '[' . $formattedDate . '] ' . $url;
                $urlList = '[' . $formattedDate . '] ' . htmlspecialchars($url);

                // Submit for crawling!
                if ($submitCrawlUrls) {
                    // NOTE(review): the entry is queued with $scheduledTime while the
                    // displayed date uses the interleaved $schTime - confirm whether
                    // that is intended before changing it.
                    $added = $this->addUrl(
                        $pageId,
                        $url,
                        $subCfg,
                        $scheduledTime,
                        $configurationHash,
                        $skipInnerCheck
                    );
                    if ($added === false) {
                        $urlList .= ' (URL already existed)';
                    }
                } elseif ($downloadCrawlUrls) {
                    $downloadUrls[$url] = $url;
                }
                $urlLog[] = $urlList;
            }
            $duplicateTrack[$uKey] = true;
        }

        return implode('<br>', $urlLog);
    }
509
510
    /**
     * Checks a list of processing instructions against the incoming ones.
     *
     * Returns true when no incoming processing instructions are given at all,
     * or when at least one of them is contained in the comma-separated list.
     *
     * @param string $piString Comma-separated list of processing instructions to test against
     * @param array $incomingProcInstructions Processing instructions
     * @return bool
     */
    public function drawURLs_PIfilter($piString, array $incomingProcInstructions)
    {
        foreach ($incomingProcInstructions as $instruction) {
            if (GeneralUtility::inList($piString, $instruction)) {
                return true;
            }
        }

        // Nothing matched: that only counts as a pass when there was nothing to match.
        return $incomingProcInstructions === [];
    }
530
531 2
    /**
     * Returns the page TSconfig for a page.
     *
     * While a mount point is set in $this->MP, the TSconfig is read for the
     * page id found after the "-" of that value instead of for $id. Functions
     * registered under
     * $GLOBALS['TYPO3_CONF_VARS']['EXTCONF']['crawler']['getPageTSconfigForId']
     * are called afterwards and may alter the configuration by reference.
     *
     * @param int $id Page ID
     * @return array Page TSconfig, possibly altered by the hooks
     */
    public function getPageTSconfigForId($id)
    {
        if (!$this->MP) {
            $pageTSconfig = BackendUtility::getPagesTSconfig($id);
        } else {
            // Take the part after the "-". A value without "-" no longer
            // triggers an undefined offset; it resolves to page id 0.
            $mountPointParts = explode('-', (string)$this->MP);
            $mountPointId = (int)($mountPointParts[1] ?? 0);
            $pageTSconfig = BackendUtility::getPagesTSconfig($mountPointId);
        }

        // Call a hook to alter configuration
        $hooks = $GLOBALS['TYPO3_CONF_VARS']['EXTCONF']['crawler']['getPageTSconfigForId'] ?? [];
        if (is_array($hooks)) {
            $params = [
                'pageId' => $id,
                'pageTSConfig' => &$pageTSconfig
            ];
            foreach ($hooks as $userFunc) {
                GeneralUtility::callUserFunction($userFunc, $params, $this);
            }
        }

        return $pageTSconfig;
    }
552
553
    /**
554
     * This methods returns an array of configurations.
555
     * And no urls!
556
     *
557
     * @param integer $id Page ID
0 ignored issues
show
Bug introduced by
There is no parameter named $id. Was it maybe removed?

This check looks for PHPDoc comments describing methods or function parameters that do not exist on the corresponding method or function.

Consider the following example. The parameter $italy is not defined by the method finale(...).

/**
 * @param array $germany
 * @param array $island
 * @param array $italy
 */
function finale($germany, $island) {
    return "2:1";
}

The most likely cause is that the parameter was removed, but the annotation was not.

Loading history...
558
     * @return array
559
     */
560 2
    public function getUrlsForPageId($pageId)
561
    {
562
        // Get page TSconfig for page ID
563 2
        $pageTSconfig = $this->getPageTSconfigForId($pageId);
564
565 2
        $res = [];
566
567
        // Fetch Crawler Configuration from pageTSconfig
568 2
        $crawlerCfg = $pageTSconfig['tx_crawler.']['crawlerCfg.']['paramSets.'] ?? [];
569 2
        foreach ($crawlerCfg as $key => $values) {
570 1
            if (!is_array($values)) {
571 1
                continue;
572
            }
573 1
            $key = str_replace('.', '', $key);
574
            // Sub configuration for a single configuration string:
575 1
            $subCfg = (array)$crawlerCfg[$key . '.'];
576 1
            $subCfg['key'] = $key;
577
578 1
            if (strcmp($subCfg['procInstrFilter'], '')) {
579 1
                $subCfg['procInstrFilter'] = implode(',', GeneralUtility::trimExplode(',', $subCfg['procInstrFilter']));
580
            }
581 1
            $pidOnlyList = implode(',', GeneralUtility::trimExplode(',', $subCfg['pidsOnly'], true));
582
583
            // process configuration if it is not page-specific or if the specific page is the current page:
584 1
            if (!strcmp($subCfg['pidsOnly'], '') || GeneralUtility::inList($pidOnlyList, $pageId)) {
585
586
                // Explode, process etc.:
587 1
                $res[$key] = [];
588 1
                $res[$key]['subCfg'] = $subCfg;
589 1
                $res[$key]['paramParsed'] = GeneralUtility::explodeUrl2Array($crawlerCfg[$key]);
590 1
                $res[$key]['paramExpanded'] = $this->expandParameters($res[$key]['paramParsed'], $pageId);
591 1
                $res[$key]['origin'] = 'pagets';
592
593
                // recognize MP value
594 1
                if (!$this->MP) {
595 1
                    $res[$key]['URLs'] = $this->compileUrls($res[$key]['paramExpanded'], ['?id=' . $pageId]);
596
                } else {
597
                    $res[$key]['URLs'] = $this->compileUrls($res[$key]['paramExpanded'], ['?id=' . $pageId . '&MP=' . $this->MP]);
598
                }
599
            }
600
        }
601
602
        // Get configuration from tx_crawler_configuration records up the rootline
603 2
        $crawlerConfigurations = $this->configurationRepository->getCrawlerConfigurationRecordsFromRootLine($pageId);
604 2
        foreach ($crawlerConfigurations as $configurationRecord) {
605
606
                // check access to the configuration record
607 1
            if (empty($configurationRecord['begroups']) || $this->backendUser->isAdmin() || $this->hasGroupAccess($this->backendUser->user['usergroup_cached_list'], $configurationRecord['begroups'])) {
608 1
                $pidOnlyList = implode(',', GeneralUtility::trimExplode(',', $configurationRecord['pidsonly'], true));
609
610
                // process configuration if it is not page-specific or if the specific page is the current page:
611 1
                if (!strcmp($configurationRecord['pidsonly'], '') || GeneralUtility::inList($pidOnlyList, $pageId)) {
612 1
                    $key = $configurationRecord['name'];
613
614
                    // don't overwrite previously defined paramSets
615 1
                    if (!isset($res[$key])) {
616
617
                            /* @var $TSparserObject \TYPO3\CMS\Core\TypoScript\Parser\TypoScriptParser */
618 1
                        $TSparserObject = GeneralUtility::makeInstance(TypoScriptParser::class);
619 1
                        $TSparserObject->parse($configurationRecord['processing_instruction_parameters_ts']);
620
621
                        $subCfg = [
622 1
                            'procInstrFilter' => $configurationRecord['processing_instruction_filter'],
623 1
                            'procInstrParams.' => $TSparserObject->setup,
624 1
                            'baseUrl' => $configurationRecord['base_url'],
625 1
                            'force_ssl' => (int)$configurationRecord['force_ssl'],
626 1
                            'userGroups' => $configurationRecord['fegroups'],
627 1
                            'exclude' => $configurationRecord['exclude'],
628 1
                            'key' => $key
629
                        ];
630
631 1
                        if (!in_array($pageId, $this->expandExcludeString($subCfg['exclude']))) {
632 1
                            $res[$key] = [];
633 1
                            $res[$key]['subCfg'] = $subCfg;
634 1
                            $res[$key]['paramParsed'] = GeneralUtility::explodeUrl2Array($configurationRecord['configuration']);
635 1
                            $res[$key]['paramExpanded'] = $this->expandParameters($res[$key]['paramParsed'], $pageId);
636 1
                            $res[$key]['URLs'] = $this->compileUrls($res[$key]['paramExpanded'], ['?id=' . $pageId]);
637 1
                            $res[$key]['origin'] = 'tx_crawler_configuration_' . $configurationRecord['uid'];
638
                        }
639
                    }
640
                }
641
            }
642
        }
643
644 2
        foreach ($GLOBALS['TYPO3_CONF_VARS']['EXTCONF']['crawler']['processUrls'] ?? [] as $func) {
645
            $params = [
646
                'res' => &$res,
647
            ];
648
            GeneralUtility::callUserFunction($func, $params, $this);
649
        }
650 2
        return $res;
651
    }
652
653
    /**
654
     * Find all configurations of subpages of a page
655
     *
656
     * @param int $rootid
657
     * @param $depth
658
     * @return array
659
     *
660
     * TODO: Write Functional Tests
661
     */
662
    public function getConfigurationsForBranch(int $rootid, $depth)
663
    {
664
        $configurationsForBranch = [];
665
        $pageTSconfig = $this->getPageTSconfigForId($rootid);
666
        $sets = $pageTSconfig['tx_crawler.']['crawlerCfg.']['paramSets.'] ?? [];
667
        foreach ($sets as $key => $value) {
668
            if (!is_array($value)) {
669
                continue;
670
            }
671
            $configurationsForBranch[] = substr($key, -1) == '.' ? substr($key, 0, -1) : $key;
672
        }
673
        $pids = [];
674
        $rootLine = BackendUtility::BEgetRootLine($rootid);
675
        foreach ($rootLine as $node) {
676
            $pids[] = $node['uid'];
677
        }
678
        /* @var PageTreeView $tree */
679
        $tree = GeneralUtility::makeInstance(PageTreeView::class);
680
        $perms_clause = $this->backendUser->getPagePermsClause(Permission::PAGE_SHOW);
681
        $tree->init('AND ' . $perms_clause);
682
        $tree->getTree($rootid, $depth, '');
683
        foreach ($tree->tree as $node) {
684
            $pids[] = $node['row']['uid'];
685
        }
686
687
        $queryBuilder = $this->getQueryBuilder('tx_crawler_configuration');
688
689
        $queryBuilder->getRestrictions()
690
            ->removeAll()
691
            ->add(GeneralUtility::makeInstance(DeletedRestriction::class));
692
693
        $statement = $queryBuilder
694
            ->select('name')
695
            ->from('tx_crawler_configuration')
696
            ->where(
697
                $queryBuilder->expr()->in('pid', $queryBuilder->createNamedParameter($pids, Connection::PARAM_INT_ARRAY))
698
            )
699
            ->execute();
700
701
        while ($row = $statement->fetch()) {
702
            $configurationsForBranch[] = $row['name'];
703
        }
704
        return $configurationsForBranch;
705
    }
706
707
    /**
708
     * Get querybuilder for given table
709
     *
710
     * @param string $table
711
     * @return \TYPO3\CMS\Core\Database\Query\QueryBuilder
712
     */
713 9
    private function getQueryBuilder(string $table)
714
    {
715 9
        return GeneralUtility::makeInstance(ConnectionPool::class)->getQueryBuilderForTable($table);
716
    }
717
718
    /**
     * Check if a user has access to an item
     * (e.g. get the group list of the current logged in user from $GLOBALS['TSFE']->gr_list)
     *
     * @see \TYPO3\CMS\Frontend\Page\PageRepository::getMultipleGroupsWhereClause()
     * @param  string $groupList    Comma-separated list of group UIDs from a user
     * @param  string $accessList   Comma-separated list of group UIDs of the item to access
     * @return bool                 TRUE if the access list is empty or contains at least one of the user's group UIDs
     */
    public function hasGroupAccess($groupList, $accessList)
    {
        // An item without access restriction is open to everybody.
        if (empty($accessList)) {
            return true;
        }

        $userGroupUids = GeneralUtility::intExplode(',', $groupList);
        foreach ($userGroupUids as $userGroupUid) {
            if (GeneralUtility::inList($accessList, $userGroupUid)) {
                return true;
            }
        }

        return false;
    }
739
740
    /**
741
     * Will expand the parameters configuration to individual values. This follows a certain syntax of the value of each parameter.
742
     * Syntax of values:
743
     * - Basically: If the value is wrapped in [...] it will be expanded according to the following syntax, otherwise the value is taken literally
744
     * - Configuration is splitted by "|" and the parts are processed individually and finally added together
745
     * - For each configuration part:
746
     *         - "[int]-[int]" = Integer range, will be expanded to all values in between, values included, starting from low to high (max. 1000). Example "1-34" or "-40--30"
747
     *         - "_TABLE:[TCA table name];[_PID:[optional page id, default is current page]];[_ENABLELANG:1]" = Look up of table records from PID, filtering out deleted records. Example "_TABLE:tt_content; _PID:123"
748
     *        _ENABLELANG:1 picks only original records without their language overlays
749
     *         - Default: Literal value
750
     *
751
     * @param array $paramArray Array with key (GET var name) and values (value of GET var which is configuration for expansion)
752
     * @param integer $pid Current page ID
753
     * @return array
754
     *
755
     * TODO: Write Functional Tests
756
     */
757 2
    public function expandParameters($paramArray, $pid)
    {
        // Traverse parameter names:
        foreach ($paramArray as $p => $v) {
            $v = trim($v);

            // A value wrapped in [...] is an expansion expression; anything else is taken literally.
            if (substr($v, 0, 1) === '[' && substr($v, -1) === ']') {
                // Strip the brackets and collect the expanded values for this key in an array.
                $v = substr($v, 1, -1);
                $paramArray[$p] = [];

                // Every "|"-separated part is expanded on its own and appended to the value list:
                $parts = explode('|', $v);
                foreach ($parts as $pV) {
                    // Look for integer range: (fx. 1-34 or -40--30 // reads minus 40 to minus 30)
                    if (preg_match('/^(-?[0-9]+)\s*-\s*(-?[0-9]+)$/', trim($pV), $reg)) {
                        // Swap if first is larger than last:
                        if ($reg[1] > $reg[2]) {
                            $temp = $reg[2];
                            $reg[2] = $reg[1];
                            $reg[1] = $temp;
                        }

                        // Traverse range, add values:
                        $runAwayBrake = 1000; // Limit to size of range!
                        for ($a = $reg[1]; $a <= $reg[2]; $a++) {
                            $paramArray[$p][] = $a;
                            $runAwayBrake--;
                            if ($runAwayBrake <= 0) {
                                break;
                            }
                        }
                    } elseif (substr(trim($pV), 0, 7) == '_TABLE:') {
                        // Parse the ";"-separated "KEY:value" sub parameters:
                        $subparts = GeneralUtility::trimExplode(';', $pV);
                        $subpartParams = [];
                        foreach ($subparts as $spV) {
                            $keyAndValue = GeneralUtility::trimExplode(':', $spV);
                            // A sub parameter without ":" has no value; store null instead of reading a missing offset.
                            $subpartParams[$keyAndValue[0]] = $keyAndValue[1] ?? null;
                        }

                        // Table exists:
                        if (isset($GLOBALS['TCA'][$subpartParams['_TABLE']])) {
                            $lookUpPid = isset($subpartParams['_PID']) ? (int)$subpartParams['_PID'] : $pid;
                            $pidField = isset($subpartParams['_PIDFIELD']) ? trim($subpartParams['_PIDFIELD']) : 'pid';
                            $where = isset($subpartParams['_WHERE']) ? $subpartParams['_WHERE'] : '';
                            $addTable = isset($subpartParams['_ADDTABLE']) ? $subpartParams['_ADDTABLE'] : '';

                            $fieldName = !empty($subpartParams['_FIELD']) ? $subpartParams['_FIELD'] : 'uid';
                            if ($fieldName === 'uid' || !empty($GLOBALS['TCA'][$subpartParams['_TABLE']]['columns'][$fieldName])) {
                                $queryBuilder = $this->getQueryBuilder($subpartParams['_TABLE']);

                                // Only deleted records are filtered out; all other default restrictions are dropped.
                                $queryBuilder->getRestrictions()
                                    ->removeAll()
                                    ->add(GeneralUtility::makeInstance(DeletedRestriction::class));

                                $queryBuilder
                                    ->select($fieldName)
                                    ->from($subpartParams['_TABLE']);

                                // Only touch the FROM part when _ADDTABLE was actually configured, so that an
                                // empty value cannot overwrite the table that was just set.
                                // TODO: Check if this works as intended!
                                if ((string)$addTable !== '') {
                                    $queryBuilder->add('from', $addTable);
                                }

                                // The expression builder quotes the field name itself, so the raw name is passed in.
                                $constraints = [
                                    $queryBuilder->expr()->eq(
                                        $pidField,
                                        $queryBuilder->createNamedParameter($lookUpPid, \PDO::PARAM_INT)
                                    ),
                                ];
                                if ((string)$where !== '') {
                                    $constraints[] = $where;
                                }
                                $queryBuilder->where(...$constraints);

                                $transOrigPointerField = $GLOBALS['TCA'][$subpartParams['_TABLE']]['ctrl']['transOrigPointerField'] ?? '';

                                // _ENABLELANG restricts the result to records whose translation pointer is <= 0.
                                if (!empty($subpartParams['_ENABLELANG']) && $transOrigPointerField) {
                                    $queryBuilder->andWhere(
                                        $queryBuilder->expr()->lte($transOrigPointerField, 0)
                                    );
                                }

                                $statement = $queryBuilder->execute();

                                // Index the rows by the VALUE of the selected field, so that the array keys
                                // are the distinct values to expand to (not the field name itself).
                                $rows = [];
                                while ($row = $statement->fetch()) {
                                    $rows[$row[$fieldName]] = $row;
                                }

                                $paramArray[$p] = array_merge($paramArray[$p], array_keys($rows));
                            }
                        }
                    } else { // Just add value:
                        $paramArray[$p][] = $pV;
                    }

                    // Hook for processing own expandParameters place holder
                    $expandParametersHooks = $GLOBALS['TYPO3_CONF_VARS']['SC_OPTIONS']['crawler/class.tx_crawler_lib.php']['expandParameters'] ?? null;
                    if (is_array($expandParametersHooks)) {
                        $_params = [
                            'pObj' => &$this,
                            'paramArray' => &$paramArray,
                            'currentKey' => $p,
                            'currentValue' => $pV,
                            'pid' => $pid
                        ];
                        foreach ($expandParametersHooks as $_funcRef) {
                            GeneralUtility::callUserFunction($_funcRef, $_params, $this);
                        }
                    }
                }

                // Make unique set of values and sort array by key:
                $paramArray[$p] = array_unique($paramArray[$p]);
                ksort($paramArray);
            } else {
                // Set the literal value as only value in array:
                $paramArray[$p] = [$v];
            }
        }

        return $paramArray;
    }
878
879
    /**
     * Compiling URLs from parameter array (output of expandParameters())
     * The number of URLs will be the multiplication of the number of parameter values for each key,
     * limited by $this->maximumUrlsToCompile per recursion level.
     *
     * @param array $paramArray Output of expandParameters(): Array with keys (GET var names) and for each an array of values
     * @param array $urls URLs accumulated in this array (for recursion)
     * @return array The input URLs, each extended with every combination of the parameter values
     */
    public function compileUrls($paramArray, array $urls)
    {
        // Nothing left to append: the accumulated URLs are the result.
        if (empty($paramArray)) {
            return $urls;
        }

        // shift first off stack:
        reset($paramArray);
        $varName = key($paramArray);
        $valueSet = array_shift($paramArray);

        // Traverse value set:
        $newUrls = [];
        foreach ($urls as $url) {
            foreach ($valueSet as $val) {
                // An empty value adds no GET parameter at all.
                $newUrls[] = $url . (strcmp((string)$val, '') ? '&' . rawurlencode($varName) . '=' . rawurlencode((string)$val) : '');

                // Leave BOTH loops once the limit is exceeded; breaking only the inner loop
                // would let every remaining base URL add one more entry.
                if (count($newUrls) > $this->maximumUrlsToCompile) {
                    break 2;
                }
            }
        }

        return $this->compileUrls($paramArray, $newUrls);
    }
910
911
    /************************************
912
     *
913
     * Crawler log
914
     *
915
     ************************************/
916
917
    /**
     * Return array of records from crawler queue for input page ID, or delete them when flushing
     *
     * @param integer $id Page ID for which to look up log entries.
     * @param string $filter Filter: "pending" => entries with exec_time = 0, "finished" => entries with exec_time > 0, any other value => all entries
     * @param boolean $doFlush If TRUE, then the matching entries are DELETED(!) instead of selected!
     * @param boolean $doFullFlush If TRUE (together with $doFlush), the flush is not restricted to the page and filter
     * @param integer $itemsPerPage Limit the amount of entries per page, default is 10; values <= 0 disable the limit
     * @return array Queue rows, or an empty array when entries were flushed
     */
    public function getLogEntriesForPageId($id, $filter = '', $doFlush = false, $doFullFlush = false, $itemsPerPage = 10)
    {
        $queryBuilder = GeneralUtility::makeInstance(ConnectionPool::class)->getQueryBuilderForTable($this->tableName);
        $queryBuilder
            ->select('*')
            ->from($this->tableName)
            ->where(
                $queryBuilder->expr()->eq('page_id', $queryBuilder->createNamedParameter($id, \PDO::PARAM_INT))
            )
            ->orderBy('scheduled', 'DESC');

        // The constraints for a flush are collected separately in one AND-combined expression,
        // because flushQueue() takes a WHERE expression rather than a query builder.
        $expressionBuilder = GeneralUtility::makeInstance(ConnectionPool::class)
            ->getConnectionForTable($this->tableName)
            ->getExpressionBuilder();
        $flushConstraints = $expressionBuilder->andX();

        switch ($filter) {
            case 'pending':
                $queryBuilder->andWhere($queryBuilder->expr()->eq('exec_time', 0));
                $flushConstraints->add($expressionBuilder->eq('exec_time', 0));
                break;
            case 'finished':
                $queryBuilder->andWhere($queryBuilder->expr()->gt('exec_time', 0));
                $flushConstraints->add($expressionBuilder->gt('exec_time', 0));
                break;
        }

        // FIXME: Write unit test that ensures that the right records are deleted.
        if ($doFlush) {
            $flushConstraints->add($expressionBuilder->eq('page_id', (int)$id));
            // '1=1' matches every row, i.e. a full flush ignores page and filter.
            $this->flushQueue($doFullFlush ? '1=1' : $flushConstraints);
            return [];
        }

        if ($itemsPerPage > 0) {
            $queryBuilder->setMaxResults((int)$itemsPerPage);
        }

        return $queryBuilder->execute()->fetchAll();
    }
971
972
    /**
     * Return array of records from crawler queue for input set ID, or delete them when flushing
     *
     * @param integer $set_id Set ID for which to look up log entries.
     * @param string $filter Filter: "pending" => entries with exec_time = 0, "finished" => entries with exec_time > 0, any other value => all entries
     * @param boolean $doFlush If TRUE, then the matching entries are DELETED(!) instead of selected!
     * @param boolean $doFullFlush If TRUE (together with $doFlush), the flush is not restricted to the set and filter
     * @param integer $itemsPerPage Limit the amount of entries per page, default is 10; values <= 0 disable the limit
     * @return array Queue rows, or an empty array when entries were flushed
     */
    public function getLogEntriesForSetId($set_id, $filter = '', $doFlush = false, $doFullFlush = false, $itemsPerPage = 10)
    {
        $queryBuilder = GeneralUtility::makeInstance(ConnectionPool::class)->getQueryBuilderForTable($this->tableName);
        $queryBuilder
            ->select('*')
            ->from($this->tableName)
            ->where(
                $queryBuilder->expr()->eq('set_id', $queryBuilder->createNamedParameter($set_id, \PDO::PARAM_INT))
            )
            ->orderBy('scheduled', 'DESC');

        // The constraints for a flush are collected separately in one AND-combined expression,
        // because flushQueue() takes a WHERE expression rather than a query builder.
        $expressionBuilder = GeneralUtility::makeInstance(ConnectionPool::class)
            ->getConnectionForTable($this->tableName)
            ->getExpressionBuilder();
        $flushConstraints = $expressionBuilder->andX();

        // FIXME: Write Unit tests for Filters
        switch ($filter) {
            case 'pending':
                $queryBuilder->andWhere($queryBuilder->expr()->eq('exec_time', 0));
                $flushConstraints->add($expressionBuilder->eq('exec_time', 0));
                break;
            case 'finished':
                $queryBuilder->andWhere($queryBuilder->expr()->gt('exec_time', 0));
                $flushConstraints->add($expressionBuilder->gt('exec_time', 0));
                break;
        }

        // FIXME: Write unit test that ensures that the right records are deleted.
        if ($doFlush) {
            $flushConstraints->add($expressionBuilder->eq('set_id', (int)$set_id));
            // An empty filter makes flushQueue() fall back to '1=1', i.e. every row.
            $this->flushQueue($doFullFlush ? '' : $flushConstraints);
            return [];
        }

        if ($itemsPerPage > 0) {
            $queryBuilder->setMaxResults((int)$itemsPerPage);
        }

        return $queryBuilder->execute()->fetchAll();
    }
1025
1026
    /**
     * Removes queue entries
     *
     * When an observer is registered for "queueEntryFlush", one event per affected set_id is
     * posted (carrying the entries of that set matching the filter) before the entries are deleted.
     *
     * @param string $where SQL related filter for the entries which should be removed; an empty filter removes ALL entries
     * @return void
     */
    protected function flushQueue($where = '')
    {
        // Callers may pass an expression object; its string form decides whether a filter was given.
        $realWhere = strlen((string)$where) > 0 ? $where : '1=1';

        if (EventDispatcher::getInstance()->hasObserver('queueEntryFlush')) {
            // Every statement gets its own query builder: from() appends to the FROM part,
            // so reusing one builder would list the table several times.
            // selectLiteral() is used because select() would quote "DISTINCT set_id" as one identifier.
            $groups = $this->getQueryBuilder($this->tableName)
                ->selectLiteral('DISTINCT set_id')
                ->from($this->tableName)
                ->where($realWhere)
                ->execute()
                ->fetchAll();
            if (is_array($groups)) {
                foreach ($groups as $group) {
                    $subSetQueryBuilder = $this->getQueryBuilder($this->tableName);
                    // NOTE(review): other queries against this table use "qid" as the key column;
                    // confirm that a "uid" column really exists here.
                    $subSet = $subSetQueryBuilder
                        ->select('uid', 'set_id')
                        ->from($this->tableName)
                        ->where(
                            $realWhere,
                            $subSetQueryBuilder->expr()->eq(
                                'set_id',
                                $subSetQueryBuilder->createNamedParameter((int)$group['set_id'], \PDO::PARAM_INT)
                            )
                        )
                        ->execute()
                        ->fetchAll();
                    EventDispatcher::getInstance()->post('queueEntryFlush', $group['set_id'], $subSet);
                }
            }
        }

        $this->getQueryBuilder($this->tableName)
            ->delete($this->tableName)
            ->where($realWhere)
            ->execute();
    }
1066
1067
    /**
     * Adds a call back entry to the crawler queue (typically called from hooks, see indexed search class "class.crawler.php")
     *
     * The call back reference is stored in the serialized parameters under the key "_CALLBACKOBJ".
     *
     * @param integer $setId Set ID
     * @param array $params Parameters to pass to call back function; anything that is not an array is replaced by an empty array
     * @param string $callBack Call back object reference, eg. 'EXT:indexed_search/class.crawler.php:&tx_indexedsearch_crawler'
     * @param integer $page_id Page ID to attach it to
     * @param integer $schedule Time at which to activate; 0 means the current time
     * @return void
     */
    public function addQueueEntry_callBack($setId, $params, $callBack, $page_id = 0, $schedule = 0)
    {
        $callBackParameters = is_array($params) ? $params : [];
        $callBackParameters['_CALLBACKOBJ'] = $callBack;

        $queueConnection = GeneralUtility::makeInstance(ConnectionPool::class)
            ->getConnectionForTable('tx_crawler_queue');

        // Fall back to "now" when no (or a zero) schedule time was given.
        $scheduledAt = (int)$schedule;
        if (!$scheduledAt) {
            $scheduledAt = $this->getCurrentTime();
        }

        $queueRecord = [
            'page_id' => (int)$page_id,
            'parameters' => serialize($callBackParameters),
            'scheduled' => $scheduledAt,
            'exec_time' => 0,
            'set_id' => (int)$setId,
            'result_data' => '',
        ];

        $queueConnection->insert('tx_crawler_queue', $queueRecord);
    }
1097
1098
    /************************************
1099
     *
1100
     * URL setting
1101
     *
1102
     ************************************/
1103
1104
    /**
     * Puts a URL into the crawler queue.
     *
     * Depending on $this->registerQueueEntriesInternallyOnly the entry is either only remembered in
     * $this->queueEntries or inserted into the queue table. Before inserting, an equal entry is looked
     * up (unless the check is skipped); if one exists, nothing is inserted.
     *
     * @param integer $id Page ID
     * @param string $url Complete URL
     * @param array $subCfg Sub configuration array (from TS config)
     * @param integer $tstamp Scheduled-time
     * @param string $configurationHash (optional) configuration hash
     * @param bool $skipInnerDuplicationCheck (optional) skip inner duplication check
     * @return bool TRUE only if a new row was inserted into the queue table
     */
    public function addUrl(
        $id,
        $url,
        array $subCfg,
        $tstamp,
        $configurationHash = '',
        $skipInnerDuplicationCheck = false
    ) {
        // Creating parameters:
        $parameters = ['url' => $url];

        // fe user group simulation:
        $feUserGroupList = implode(',', array_unique(GeneralUtility::intExplode(',', $subCfg['userGroups'], true)));
        if ($feUserGroupList) {
            $parameters['feUserGroupList'] = $feUserGroupList;
        }

        // Setting processing instructions
        $parameters['procInstructions'] = GeneralUtility::trimExplode(',', $subCfg['procInstrFilter']);
        if (is_array($subCfg['procInstrParams.'])) {
            $parameters['procInstrParams'] = $subCfg['procInstrParams.'];
        }

        // Compile value array:
        $serializedParameters = serialize($parameters);
        $fieldArray = [
            'page_id' => (int)$id,
            'parameters' => $serializedParameters,
            'parameters_hash' => GeneralUtility::shortMD5($serializedParameters),
            'configuration_hash' => $configurationHash,
            'scheduled' => $tstamp,
            'exec_time' => 0,
            'set_id' => (int)$this->setID,
            'result_data' => '',
            'configuration' => $subCfg['key'],
        ];

        if ($this->registerQueueEntriesInternallyOnly) {
            // the entries will only be registered and not stored to the database
            $this->queueEntries[] = $fieldArray;
            return false;
        }

        // check if there is already an equal entry
        $duplicateRows = [];
        if (!$skipInnerDuplicationCheck) {
            $duplicateRows = $this->getDuplicateRowsIfExist($tstamp, $fieldArray);
        }

        if (!empty($duplicateRows)) {
            EventDispatcher::getInstance()->post(
                'duplicateUrlInQueue',
                $this->setID,
                ['rows' => $duplicateRows, 'fieldArray' => $fieldArray]
            );
            return false;
        }

        $connectionForCrawlerQueue = GeneralUtility::makeInstance(ConnectionPool::class)
            ->getConnectionForTable('tx_crawler_queue');
        $connectionForCrawlerQueue->insert('tx_crawler_queue', $fieldArray);
        $uid = $connectionForCrawlerQueue->lastInsertId('tx_crawler_queue', 'qid');

        EventDispatcher::getInstance()->post(
            'urlAddedToQueue',
            $this->setID,
            ['uid' => $uid, 'fieldArray' => $fieldArray]
        );

        return true;
    }
1183
1184
    /**
     * This method determines duplicates for a queue entry with the same parameters and this timestamp.
     * If the timestamp is in the past, it will check if there is any unprocessed queue entry in the past.
     * If the timestamp is in the future it will check, if the queued entry has exactly the same timestamp
     *
     * Only entries that have not been executed (exec_time = 0) and are not assigned to a process
     * are considered duplicates.
     *
     * @param int $tstamp Scheduled time of the entry that is about to be added
     * @param array $fieldArray Queue field array; "page_id" and "parameters_hash" are used for the comparison
     *
     * @return array List of qid values of the duplicate entries, empty if there are none
     *
     * TODO: Write Functional Tests
     */
    protected function getDuplicateRowsIfExist($tstamp, $fieldArray)
    {
        $rows = [];

        $currentTime = $this->getCurrentTime();

        $queryBuilder = GeneralUtility::makeInstance(ConnectionPool::class)->getQueryBuilderForTable($this->tableName);
        $queryBuilder
            ->select('qid')
            ->from('tx_crawler_queue');
        //if this entry is scheduled with "now"
        if ($tstamp <= $currentTime) {
            if ($this->extensionSettings['enableTimeslot']) {
                $timeBegin = $currentTime - 100;
                $timeEnd = $currentTime + 100;
                $queryBuilder
                    ->where(
                        'scheduled BETWEEN ' . $timeBegin . ' AND ' . $timeEnd . ''
                    )
                    ->orWhere(
                        $queryBuilder->expr()->lte('scheduled', $currentTime)
                    );
            } else {
                $queryBuilder
                    ->where(
                        $queryBuilder->expr()->lte('scheduled', $currentTime)
                    );
            }
        } elseif ($tstamp > $currentTime) {
            //entry with a timestamp in the future need to have the same schedule time
            $queryBuilder
                ->where(
                    $queryBuilder->expr()->eq('scheduled', $tstamp)
                );
        }

        // A duplicate is an entry that is still waiting: not executed yet and not picked up by a process.
        // (Matching on "!= 0" here would only find entries that were already processed.)
        // NOTE(review): process_id is compared against 0 like before, only with the inverted
        // operator; confirm this matches the column type.
        $statement = $queryBuilder
            ->andWhere($queryBuilder->expr()->eq('exec_time', 0))
            ->andWhere($queryBuilder->expr()->eq('process_id', 0))
            ->andWhere($queryBuilder->expr()->eq('page_id', $queryBuilder->createNamedParameter($fieldArray['page_id'], \PDO::PARAM_INT)))
            ->andWhere($queryBuilder->expr()->eq('parameters_hash', $queryBuilder->createNamedParameter($fieldArray['parameters_hash'], \PDO::PARAM_STR)))
            ->execute();

        while ($row = $statement->fetch()) {
            $rows[] = $row['qid'];
        }

        return $rows;
    }
1245
1246
    /**
     * Returns the current system time as Unix timestamp
     *
     * The queue code in this class obtains "now" through this method.
     *
     * @return int
     */
    public function getCurrentTime()
    {
        $now = time();

        return $now;
    }
1255
1256
    /************************************
1257
     *
1258
     * URL reading
1259
     *
1260
     ************************************/
1261
1262
    /**
     * Processes a single queue entry: locks it, executes it and stores the result
     *
     * The entry is locked by writing exec_time before execution; the serialized result is written
     * to result_data afterwards. Pre- and post-process signals are emitted around the execution.
     *
     * @param integer $queueId
     * @param boolean $force If set, will process even if exec_time has been set!
     * @return integer|null Bit mask with CLI_STATUS_POLLABLE_PROCESSED set if a registered pollable
     *                      instruction reported success, otherwise 0; nothing (null) is returned
     *                      when no matching queue entry was found
     */
    public function readUrl($queueId, $force = false)
    {
        $queryBuilder = GeneralUtility::makeInstance(ConnectionPool::class)->getQueryBuilderForTable($this->tableName);
        $ret = 0;
        $this->logger->debug('crawler-readurl start ' . microtime(true));

        // Get entry:
        $queryBuilder
            ->select('*')
            ->from('tx_crawler_queue')
            ->where(
                $queryBuilder->expr()->eq('qid', $queryBuilder->createNamedParameter($queueId, \PDO::PARAM_INT))
            );
        if (!$force) {
            // Without force only entries that were not run yet and are scheduled for a process qualify.
            $queryBuilder->andWhere('exec_time = 0');
            $queryBuilder->andWhere('process_scheduled > 0');
        }
        $queueRec = $queryBuilder->execute()->fetch();

        if (!is_array($queueRec)) {
            return;
        }

        SignalSlotUtility::emitSignal(
            __CLASS__,
            SignalSlotUtility::SIGNNAL_QUEUEITEM_PREPROCESS,
            [$queueId, &$queueRec]
        );

        // Set exec_time to lock record:
        $lockFields = ['exec_time' => $this->getCurrentTime()];
        if (isset($this->processID)) {
            // if multiprocessing is used we need to store the id of the process which has handled this entry
            $lockFields['process_id_completed'] = $this->processID;
        }

        GeneralUtility::makeInstance(ConnectionPool::class)
            ->getConnectionForTable('tx_crawler_queue')
            ->update('tx_crawler_queue', $lockFields, ['qid' => (int)$queueId]);

        $result = $this->queueExecutor->executeQueueItem($queueRec, $this);
        // NOTE(review): unserialize() runs on content produced by the executed request; make sure
        // that content can be trusted.
        $resultData = unserialize($result['content']);

        // atm there's no need to point to specific pollable extensions
        if (is_array($GLOBALS['TYPO3_CONF_VARS']['EXTCONF']['crawler']['pollSuccess'])) {
            foreach ($GLOBALS['TYPO3_CONF_VARS']['EXTCONF']['crawler']['pollSuccess'] as $pollable) {
                // only check the success value if the instruction is running
                // it is important to name the pollSuccess key same as the procInstructions key
                $instructionIsRunning = is_array($resultData['parameters']['procInstructions'])
                    && in_array($pollable, $resultData['parameters']['procInstructions']);
                if ($instructionIsRunning && !empty($resultData['success'][$pollable])) {
                    $ret |= self::CLI_STATUS_POLLABLE_PROCESSED;
                }
            }
        }

        // Set result in log which also denotes the end of the processing of this entry.
        $field_array = ['result_data' => serialize($result)];

        SignalSlotUtility::emitSignal(
            __CLASS__,
            SignalSlotUtility::SIGNNAL_QUEUEITEM_POSTPROCESS,
            [$queueId, &$field_array]
        );

        GeneralUtility::makeInstance(ConnectionPool::class)
            ->getConnectionForTable('tx_crawler_queue')
            ->update('tx_crawler_queue', $field_array, ['qid' => (int)$queueId]);

        $this->logger->debug('crawler-readurl stop ' . microtime(true));

        return $ret;
    }
1352
1353
    /**
     * Executes a not-yet-inserted queue entry
     *
     * The entry is inserted with exec_time set (which locks it), executed right away, and the
     * serialized result is written back to result_data after the post-process signal was emitted.
     *
     * @param array $field_array Queue field array,
     *
     * @return mixed The result as returned by the queue executor
     *               (NOTE(review): previously documented as string; confirm the actual type)
     */
    public function readUrlFromArray($field_array)
    {
        $queueConnection = null;

        // Set exec_time to lock record:
        $field_array['exec_time'] = $this->getCurrentTime();

        $queueConnection = GeneralUtility::makeInstance(ConnectionPool::class)->getConnectionForTable($this->tableName);
        $queueConnection->insert($this->tableName, $field_array);

        // The executor receives the record including its freshly generated queue id.
        $queueId = $queueConnection->lastInsertId($this->tableName, 'qid');
        $field_array['qid'] = $queueId;

        $result = $this->queueExecutor->executeQueueItem($field_array, $this);

        // Set result in log which also denotes the end of the processing of this entry.
        $field_array = ['result_data' => serialize($result)];

        SignalSlotUtility::emitSignal(
            __CLASS__,
            SignalSlotUtility::SIGNNAL_QUEUEITEM_POSTPROCESS,
            [$queueId, &$field_array]
        );

        $queueConnection->update($this->tableName, $field_array, ['qid' => $queueId]);

        return $result;
    }
1390
1391
    /*****************************
1392
     *
1393
     * Compiling URLs to crawl - tools
1394
     *
1395
     *****************************/
1396
1397
    /**
     * Build the page tree below a root page and compile the URL rows for every page in it.
     *
     * The arguments are stored on the controller (drawURLs_addRowsForPage() reads them back)
     * and the duplicate tracker and the download URL list are reset before the tree is walked.
     * Mount point pages additionally get the rows of the mounted branch.
     *
     * @param integer $id Root page id to start from.
     * @param integer $depth Depth of tree, 0=only id-page, 1= on sublevel, 99 = infinite
     * @param integer $scheduledTime Unix Time when the URL is timed to be visited when put in queue
     * @param integer $reqMinute Number of requests per minute (creates the interleave between requests)
     * @param boolean $submitCrawlUrls If set, submits the URLs to queue in database (real crawling)
     * @param boolean $downloadCrawlUrls If set (and submitcrawlUrls is false) will fill $downloadUrls with entries)
     * @param array $incomingProcInstructions Array of processing instructions
     * @param array $configurationSelection Array of configuration keys
     * @return string Concatenated output of drawURLs_addRowsForPage() for all pages of the tree
     */
    public function getPageTreeAndUrls(
        $id,
        $depth,
        $scheduledTime,
        $reqMinute,
        $submitCrawlUrls,
        $downloadCrawlUrls,
        array $incomingProcInstructions,
        array $configurationSelection
    ) {
        $this->scheduledTime = $scheduledTime;
        $this->reqMinute = $reqMinute;
        $this->submitCrawlUrls = $submitCrawlUrls;
        $this->downloadCrawlUrls = $downloadCrawlUrls;
        $this->incomingProcInstructions = $incomingProcInstructions;
        $this->incomingConfigurationSelection = $configurationSelection;

        $this->duplicateTrack = [];
        $this->downloadUrls = [];

        // Drawing tree:
        /* @var PageTreeView $tree */
        $tree = GeneralUtility::makeInstance(PageTreeView::class);
        $perms_clause = $this->backendUser->getPagePermsClause(Permission::PAGE_SHOW);
        $tree->init('AND ' . $perms_clause);

        $pageInfo = BackendUtility::readPageAccess($id, $perms_clause);
        if (is_array($pageInfo)) {
            // Set root row:
            $tree->tree[] = [
                'row' => $pageInfo,
                'HTML' => $this->iconFactory->getIconForRecord('pages', $pageInfo, Icon::SIZE_SMALL)
            ];
        }

        // Get branch beneath:
        if ($depth) {
            $tree->getTree($id, $depth, '');
        }

        // Traverse page tree:
        $code = '';

        foreach ($tree->tree as $data) {
            $this->MP = false;

            // recognize mount points
            if ($data['row']['doktype'] == PageRepository::DOKTYPE_MOUNTPOINT) {
                // The builder must be requested for the table that is queried ("pages"),
                // otherwise the connection configured for the queue table would be used.
                $queryBuilder = GeneralUtility::makeInstance(ConnectionPool::class)->getQueryBuilderForTable('pages');
                $queryBuilder->getRestrictions()->removeAll()->add(GeneralUtility::makeInstance(DeletedRestriction::class));
                $mountPage = $queryBuilder
                    ->select('*')
                    ->from('pages')
                    ->where(
                        $queryBuilder->expr()->eq('uid', $queryBuilder->createNamedParameter($data['row']['uid'], \PDO::PARAM_INT))
                    )
                    ->execute()
                    ->fetch();

                // Without a page row there is nothing to mount; the page is then handled like a regular page.
                if (is_array($mountPage)) {
                    // "<mount_pid>-<uid of the mount point page>", used while the mounted branch is processed
                    $this->MP = $mountPage['mount_pid'] . '-' . $data['row']['uid'];

                    // fetch mounted pages
                    $mountTree = GeneralUtility::makeInstance(PageTreeView::class);
                    $mountTree->init('AND ' . $perms_clause);
                    $mountTree->getTree($mountPage['mount_pid'], $depth);

                    foreach ($mountTree->tree as $mountData) {
                        $code .= $this->drawURLs_addRowsForPage(
                            $mountData['row'],
                            $mountData['HTML'] . BackendUtility::getRecordTitle('pages', $mountData['row'], true)
                        );
                    }

                    if ($mountPage['mount_pid_ol']) {
                        // replace page when mount_pid_ol is enabled
                        $data['row']['uid'] = $mountPage['mount_pid'];
                    } else {
                        // if the mount_pid_ol is not set the MP must not be used for the mountpoint page
                        $this->MP = false;
                    }
                }
            }

            $code .= $this->drawURLs_addRowsForPage(
                $data['row'],
                $data['HTML'] . BackendUtility::getRecordTitle('pages', $data['row'], true)
            );
        }

        return $code;
    }
1499
1500
    /**
     * Expands exclude string
     *
     * The exclude string is a comma separated list of "<pid>" or "<pid>+<depth>" parts.
     * A part without depth excludes only the page itself, "<pid>+" (or "<pid>+0") excludes
     * the page and its subtree down to 99 levels, "<pid>+<n>" limits the subtree to n levels.
     * Results are cached per exclude string for the lifetime of the PHP process.
     *
     * @param string $excludeString Exclude string
     * @return array List of excluded page ids (unique values)
     */
    public function expandExcludeString($excludeString)
    {
        // internal static caches;
        static $expandedExcludeStringCache = [];
        static $treeCache = [];

        $cacheKey = (string)$excludeString;

        // isset() instead of empty(): an empty result is a valid cache entry, too
        if (isset($expandedExcludeStringCache[$cacheKey])) {
            return $expandedExcludeStringCache[$cacheKey];
        }

        $pidList = [];

        if (!empty($excludeString)) {
            /** @var PageTreeView $tree */
            $tree = GeneralUtility::makeInstance(PageTreeView::class);
            $tree->init('AND ' . $this->backendUser->getPagePermsClause(Permission::PAGE_SHOW));

            // third argument drops empty parts (e.g. caused by a trailing comma)
            $excludeParts = GeneralUtility::trimExplode(',', $excludeString, true);

            foreach ($excludeParts as $excludePart) {
                $pidAndDepth = GeneralUtility::trimExplode('+', $excludePart);
                $pid = $pidAndDepth[0];
                // the depth is optional, so the second element may not exist
                $depth = $pidAndDepth[1] ?? '';

                // default is "page only" = "depth=0"
                if (empty($depth)) {
                    $depth = (strpos($excludePart, '+') !== false) ? 99 : 0;
                }
                $depth = (int)$depth;

                $pidList[] = $pid;

                if ($depth > 0) {
                    if (empty($treeCache[$pid][$depth])) {
                        $tree->reset();
                        $tree->getTree($pid, $depth);
                        $treeCache[$pid][$depth] = $tree->tree;
                    }

                    foreach ($treeCache[$pid][$depth] as $data) {
                        $pidList[] = $data['row']['uid'];
                    }
                }
            }
        }

        $expandedExcludeStringCache[$cacheKey] = array_unique($pidList);

        return $expandedExcludeStringCache[$cacheKey];
    }
1551
1552
    /**
     * Create the rows for display of the page tree
     * For each page a number of rows are shown displaying GET variable configuration
     *
     * Configurations that are not part of the incoming configuration selection are dropped.
     * For every remaining configuration the URL list is compiled via urlListFromUrlArray(),
     * unless the page is listed in the configuration's exclude string.
     *
     * @param array $pageRow Page row
     * @param string $pageTitleAndIcon Page icon and title for row (HTML, inserted as is)
     * @return string HTML <tr> content (one or more)
     */
    public function drawURLs_addRowsForPage(array $pageRow, $pageTitleAndIcon)
    {
        $skipMessage = '';

        // Get list of configurations
        $configurations = $this->getUrlsForPageRow($pageRow, $skipMessage);

        if (!empty($this->incomingConfigurationSelection)) {
            // remove configuration that does not match the current selection
            foreach ($configurations as $confKey => $confArray) {
                if (!in_array($confKey, $this->incomingConfigurationSelection)) {
                    unset($configurations[$confKey]);
                }
            }
        }

        if (empty($configurations)) {
            $message = !empty($skipMessage) ? ' (' . $skipMessage . ')' : '';

            // Compile row:
            return '
                <tr>
                    <td>' . $pageTitleAndIcon . '</td>
                    <td colspan="6"><em>No entries</em>' . $message . '</td>
                </tr>';
        }

        // Traverse parameter combinations:
        $c = 0;
        $content = '';
        foreach ($configurations as $confKey => $confArray) {
            // Optional sub configuration values are read with a fallback to avoid undefined index notices
            $subCfg = $confArray['subCfg'] ?? [];

            // Title column: only the first row carries it, spanning all configuration rows
            if (!$c) {
                $titleClm = '<td rowspan="' . count($configurations) . '">' . $pageTitleAndIcon . '</td>';
            } else {
                $titleClm = '';
            }

            // Loose comparison on purpose: the exclude list may hold page ids as strings
            if (!in_array($pageRow['uid'], $this->expandExcludeString($subCfg['exclude'] ?? ''))) {
                // URL list:
                $urlList = $this->urlListFromUrlArray(
                    $confArray,
                    $pageRow,
                    $this->scheduledTime,
                    $this->reqMinute,
                    $this->submitCrawlUrls,
                    $this->downloadCrawlUrls,
                    $this->duplicateTrack,
                    $this->downloadUrls,
                    $this->incomingProcInstructions // if empty the urls won't be filtered by processing instructions
                );

                // Expanded parameters:
                $paramExpanded = '';
                $calcAccu = [];
                $calcRes = 1;
                foreach ($confArray['paramExpanded'] ?? [] as $gVar => $gVal) {
                    $valueCount = count($gVal);
                    $paramExpanded .= '
                            <tr>
                                <td>' . htmlspecialchars('&' . $gVar . '=') . '<br/>' .
                                '(' . $valueCount . ')' .
                                '</td>
                                <td nowrap="nowrap">' . nl2br(htmlspecialchars(implode(chr(10), $gVal))) . '</td>
                            </tr>
                        ';
                    $calcRes *= $valueCount;
                    $calcAccu[] = $valueCount;
                }
                $paramExpanded = '<table>' . $paramExpanded . '</table>';
                $paramExpanded .= 'Comb: ' . implode('*', $calcAccu) . '=' . $calcRes;

                // Options (configuration values are escaped before they are put into the markup)
                $optionValues = '';
                if (!empty($subCfg['userGroups'])) {
                    $optionValues .= 'User Groups: ' . htmlspecialchars((string)$subCfg['userGroups']) . '<br/>';
                }
                if (!empty($subCfg['procInstrFilter'])) {
                    $optionValues .= 'ProcInstr: ' . htmlspecialchars((string)$subCfg['procInstrFilter']) . '<br/>';
                }

                // Compile row:
                $content .= '
                        <tr>
                            ' . $titleClm . '
                            <td>' . htmlspecialchars($confKey) . '</td>
                            <td>' . nl2br(htmlspecialchars(rawurldecode(trim(str_replace('&', chr(10) . '&', GeneralUtility::implodeArrayForUrl('', $confArray['paramParsed'] ?? [])))))) . '</td>
                            <td>' . $paramExpanded . '</td>
                            <td nowrap="nowrap">' . $urlList . '</td>
                            <td nowrap="nowrap">' . $optionValues . '</td>
                            <td nowrap="nowrap">' . DebugUtility::viewArray($subCfg['procInstrParams.'] ?? null) . '</td>
                        </tr>';
            } else {
                $content .= '<tr>
                            ' . $titleClm . '
                            <td>' . htmlspecialchars($confKey) . '</td>
                            <td colspan="5"><em>No entries</em> (Page is excluded in this configuration)</td>
                        </tr>';
            }

            $c++;
        }

        return $content;
    }
1666
1667
    /*****************************
1668
     *
1669
     * CLI functions
1670
     *
1671
     *****************************/
1672
1673
    /**
     * Running the functionality of the CLI (crawling URLs from queue)
     *
     * Steps: run the registered CLI hooks, purge old processed entries (if "purgeQueueDays"
     * is set), select up to $countInARun due and unassigned entries, assign them to the
     * current process and read them one by one.
     *
     * @param int $countInARun Maximum number of queue entries to select for this run
     * @param int $sleepTime Pause after each entry, passed to usleep() (microseconds)
     * @param int $sleepAfterFinish Pause after the batch, passed to sleep() (seconds)
     * @return int Bitmask combined from the readUrl() results and the CLI_STATUS_* flags, 0 if nothing was done
     */
    public function CLI_run($countInARun, $sleepTime, $sleepAfterFinish)
    {
        $result = 0;
        $counter = 0;

        // First, run hooks:
        $this->CLI_runHooks();

        // Clean up the queue
        $purgeQueueDays = (int)$this->extensionSettings['purgeQueueDays'];
        if ($purgeQueueDays > 0) {
            $purgeDate = (int)($this->getCurrentTime() - 24 * 60 * 60 * $purgeQueueDays);

            $queryBuilderDelete = GeneralUtility::makeInstance(ConnectionPool::class)->getQueryBuilderForTable($this->tableName);
            $del = $queryBuilderDelete
                ->delete($this->tableName)
                ->where(
                    $queryBuilderDelete->expr()->neq('exec_time', 0),
                    $queryBuilderDelete->expr()->lt('exec_time', $purgeDate)
                )->execute();

            if (false === $del) {
                $this->logger->info(
                    'Records could not be deleted.'
                );
            }
        }

        // Select entries:
        //TODO Shouldn't this reside within the transaction?
        $queryBuilderSelect = GeneralUtility::makeInstance(ConnectionPool::class)->getQueryBuilderForTable($this->tableName);
        $rows = $queryBuilderSelect
            ->select('qid', 'scheduled')
            ->from($this->tableName)
            ->where(
                $queryBuilderSelect->expr()->eq('exec_time', 0),
                $queryBuilderSelect->expr()->eq('process_scheduled', 0),
                $queryBuilderSelect->expr()->lte('scheduled', $this->getCurrentTime())
            )
            ->orderBy('scheduled')
            ->addOrderBy('qid')
            ->setMaxResults($countInARun)
            ->execute()
            ->fetchAll();

        if (!empty($rows)) {
            // The ids are put into the IN() list as plain SQL, so make sure they are integers
            $quidList = array_map('intval', array_column($rows, 'qid'));

            $processId = $this->CLI_buildProcessId();

            //reserve queue entries for process
            //TODO make sure we're not taking assigned queue-entires (the reservation is not wrapped in a transaction)

            //save the number of assigned queue entrys to determine who many have been processed later
            $queryBuilderUpdate = GeneralUtility::makeInstance(ConnectionPool::class)->getQueryBuilderForTable($this->tableName);
            $numberOfAffectedRows = $queryBuilderUpdate
                ->update($this->tableName)
                ->where(
                    $queryBuilderUpdate->expr()->in('qid', $quidList)
                )
                ->set('process_scheduled', $this->getCurrentTime())
                ->set('process_id', $processId)
                ->execute();

            GeneralUtility::makeInstance(ConnectionPool::class)->getConnectionForTable('tx_crawler_process')
                ->update(
                    'tx_crawler_process',
                    [ 'assigned_items_count' => (int)$numberOfAffectedRows ],
                    [ 'process_id' => $processId ]
                );

            // Fewer updated rows than selected ones means the selection changed in the meantime
            if ((int)$numberOfAffectedRows !== count($quidList)) {
                $this->CLI_debug("Nothing processed due to multi-process collision (" . $this->CLI_buildProcessId() . ")");
                return ($result | self::CLI_STATUS_ABORTED);
            }

            foreach ($rows as $r) {
                $result |= $this->readUrl($r['qid']);

                $counter++;
                usleep((int)$sleepTime); // Just to relax the system

                // if during the start and the current read url the cli has been disable we need to return from the function
                // mark the process NOT as ended.
                if ($this->getDisabled()) {
                    return ($result | self::CLI_STATUS_ABORTED);
                }

                if (!$this->processRepository->isProcessActive($this->CLI_buildProcessId())) {
                    $this->CLI_debug("conflict / timeout (" . $this->CLI_buildProcessId() . ")");

                    //TODO might need an additional returncode
                    $result |= self::CLI_STATUS_ABORTED;
                    break; //possible timeout
                }
            }

            sleep((int)$sleepAfterFinish);

            $msg = 'Rows: ' . $counter;
            $this->CLI_debug($msg . " (" . $this->CLI_buildProcessId() . ")");
        } else {
            $this->CLI_debug("Nothing within queue which needs to be processed (" . $this->CLI_buildProcessId() . ")");
        }

        if ($counter > 0) {
            $result |= self::CLI_STATUS_PROCESSED;
        }

        return $result;
    }
1799
1800
    /**
     * Activate hooks
     *
     * Instantiates every class registered in
     * $GLOBALS['TYPO3_CONF_VARS']['EXTCONF']['crawler']['cli_hooks'] and calls its
     * crawler_init() method with this controller. Nothing happens if no hook is registered.
     *
     * @return void
     */
    public function CLI_runHooks()
    {
        $hookClassNames = $GLOBALS['TYPO3_CONF_VARS']['EXTCONF']['crawler']['cli_hooks'] ?? [];

        foreach ($hookClassNames as $hookClassName) {
            $hook = GeneralUtility::makeInstance($hookClassName);
            if (!is_object($hook)) {
                continue;
            }
            $hook->crawler_init($this);
        }
    }
1814
1815
    /**
     * Try to acquire a new process with the given id
     * also performs some auto-cleanup for orphan processes
     * @todo preemption might not be the most elegant way to clean up
     *
     * Active processes whose ttl has passed are treated as orphans and released at the end.
     * A new process record is only inserted while the number of active, non-expired
     * processes is below the "processLimit" setting.
     *
     * @param string $id identification string for the process
     * @return boolean TRUE if the process record was inserted, FALSE if the limit is reached or no system pid is available
     */
    public function CLI_checkAndAcquireNewProcess($id)
    {
        $ret = true;

        // getmypid() returns FALSE on failure
        $systemProcessId = getmypid();
        if ($systemProcessId === false || $systemProcessId < 1) {
            return false;
        }

        $processCount = 0;
        $orphanProcesses = [];

        // The builder is requested for the table that is actually queried
        $queryBuilder = GeneralUtility::makeInstance(ConnectionPool::class)->getQueryBuilderForTable('tx_crawler_process');
        $statement = $queryBuilder
            ->select('process_id', 'ttl')
            ->from('tx_crawler_process')
            ->where(
                $queryBuilder->expr()->eq('active', 1),
                $queryBuilder->expr()->eq('deleted', 0)
            )
            ->execute();

        $currentTime = $this->getCurrentTime();

        while ($row = $statement->fetch()) {
            if ($row['ttl'] < $currentTime) {
                // expired: do not count it, release it below
                $orphanProcesses[] = $row['process_id'];
            } else {
                $processCount++;
            }
        }

        $processLimit = (int)$this->extensionSettings['processLimit'];

        // if there are less than allowed active processes then add a new one
        if ($processCount < $processLimit) {
            $this->CLI_debug("add process " . $this->CLI_buildProcessId() . " (" . ($processCount + 1) . "/" . $processLimit . ")");

            GeneralUtility::makeInstance(ConnectionPool::class)->getConnectionForTable('tx_crawler_process')->insert(
                'tx_crawler_process',
                [
                    'process_id' => $id,
                    'active' => 1,
                    'ttl' => $currentTime + (int)$this->extensionSettings['processMaxRunTime'],
                    'system_process_id' => $systemProcessId
                ]
            );
        } else {
            $this->CLI_debug("Processlimit reached (" . ($processCount) . "/" . $processLimit . ")");
            $ret = false;
        }

        $this->processRepository->deleteProcessesMarkedAsDeleted();
        $this->CLI_releaseProcesses($orphanProcesses, true); // maybe this should be somehow included into the current lock

        return $ret;
    }
1879
1880
    /**
     * Release a process and the required resources
     *
     * Runs four UPDATE statements through one shared query builder:
     *  1. detach queue entries that are assigned to inactive processes
     *  2. reset system_process_id of inactive processes that still have unprocessed queue entries
     *  3. mark the requested processes as inactive if they have no unprocessed queue entries left
     *  4. detach the unprocessed queue entries of the requested processes
     *
     * @param  mixed    $releaseIds   string with a single process-id or array with multiple process-ids
     * @param  boolean  $withinLock   show whether the DB-actions are included within an existing lock
     *                                (currently without effect, the transaction handling is disabled)
     * @return boolean  FALSE if there was nothing to release, TRUE otherwise
     */
    public function CLI_releaseProcesses($releaseIds, $withinLock = false)
    {
        $queryBuilder = GeneralUtility::makeInstance(ConnectionPool::class)->getQueryBuilderForTable($this->tableName);

        if (!is_array($releaseIds)) {
            $releaseIds = [$releaseIds];
        }

        if (empty($releaseIds)) {
            //nothing to release
            return false;
        }

        // NOTE: if (!$withinLock) a transaction used to be opened here ('BEGIN') and committed at the end.

        // some kind of 2nd chance algo - this way you need at least 2 processes to have a real cleanup
        // this ensures that a single process can't mess up the entire process table

        // 1. queue entries of inactive processes become unassigned again
        $queryBuilder
            ->update($this->tableName, 'q')
            ->where(
                'q.process_id IN(SELECT p.process_id FROM tx_crawler_process as p WHERE p.active = 0)'
            )
            ->set('q.process_scheduled', 0)
            ->set('q.process_id', '')
            ->execute();

        // the builder is reused, so the SET part of statement 1 has to be dropped
        $queryBuilder->resetQueryPart('set');

        // 2. FIXME: Not entirely sure that this is equivalent to the previous version
        $queryBuilder
            ->update('tx_crawler_process')
            ->where(
                $queryBuilder->expr()->eq('active', 0),
                'process_id IN(SELECT q.process_id FROM tx_crawler_queue as q WHERE q.exec_time = 0)'
            )
            ->set('system_process_id', 0)
            ->execute();

        // previous version for reference
        /*
        $GLOBALS['TYPO3_DB']->exec_UPDATEquery(
            'tx_crawler_process',
            'active=0 AND deleted=0
            AND NOT EXISTS (
                SELECT * FROM tx_crawler_queue
                WHERE tx_crawler_queue.process_id = tx_crawler_process.process_id
                AND tx_crawler_queue.exec_time = 0
            )',
            [
                'deleted' => '1',
                'system_process_id' => 0
            ]
        );*/

        // 3. mark all requested processes as non-active
        // NOTE(review): the SET part is not reset after statement 2, so this statement
        // presumably also carries "system_process_id = 0" - confirm that this is intended.
        $queryBuilder
            ->update('tx_crawler_process')
            ->where(
                'NOT EXISTS (
                SELECT * FROM tx_crawler_queue
                    WHERE tx_crawler_queue.process_id = tx_crawler_process.process_id
                    AND tx_crawler_queue.exec_time = 0
                )',
                $queryBuilder->expr()->in('process_id', $queryBuilder->createNamedParameter($releaseIds, Connection::PARAM_STR_ARRAY)),
                $queryBuilder->expr()->eq('deleted', 0)
            )
            ->set('active', 0)
            ->execute();

        $queryBuilder->resetQueryPart('set');

        // 4. unprocessed queue entries of the requested processes become unassigned again
        $queryBuilder
            ->update($this->tableName)
            ->where(
                $queryBuilder->expr()->eq('exec_time', 0),
                $queryBuilder->expr()->in('process_id', $queryBuilder->createNamedParameter($releaseIds, Connection::PARAM_STR_ARRAY))
            )
            ->set('process_scheduled', 0)
            ->set('process_id', '')
            ->execute();

        return true;
    }
1974
1975
    /**
     * Create a unique Id for the current process
     *
     * The id is derived from the current microtime on the first call and stored in
     * $this->processID; later calls return the stored value.
     *
     * @return string  the ID
     */
    public function CLI_buildProcessId()
    {
        if ($this->processID) {
            return $this->processID;
        }

        $this->processID = GeneralUtility::shortMD5(microtime(true));

        return $this->processID;
    }
1987
1988
    /**
     * Prints a message to the stdout (only if debug-mode is enabled)
     *
     * Debug-mode is the "processDebug" extension setting; the message is followed by a
     * line break and the output is flushed right away.
     *
     * @param  string $msg  the message
     */
    public function CLI_debug($msg)
    {
        $debugEnabled = (int)$this->extensionSettings['processDebug'];
        if ($debugEnabled === 0) {
            return;
        }

        echo $msg . "\n";
        flush();
    }
2000
2001
    /**
     * Cleans up entries that stayed for too long in the queue. These are:
     * - processed entries that are over 1.5 days in age
     * - scheduled entries that are over 7 days old
     *
     * The actual ages are read (in days, fractions allowed) from the extension settings
     * "cleanUpProcessedAge" and "cleanUpScheduledAge".
     *
     * @return void
     */
    public function cleanUpOldQueueEntries()
    {
        $secondsPerDay = 86400; // 24*60*60 Seconds in 24 hours

        // Settings may arrive as (possibly empty) strings: cast them explicitly and
        // make sure only integer literals end up in the SQL condition.
        $processedAgeInSeconds = (int)round((float)($this->extensionSettings['cleanUpProcessedAge'] ?? 0) * $secondsPerDay);
        $scheduledAgeInSeconds = (int)round((float)($this->extensionSettings['cleanUpScheduledAge'] ?? 0) * $secondsPerDay);

        $now = time();
        $condition = '(exec_time<>0 AND exec_time<' . ($now - $processedAgeInSeconds) . ') OR scheduled<=' . ($now - $scheduledAgeInSeconds);
        $this->flushQueue($condition);
    }
2017
2018
    /**
     * Returns a md5 hash generated from a serialized configuration array.
     *
     * The entries "paramExpanded" and "URLs" are left out of the hash.
     *
     * @param array $configuration
     *
     * @return string
     */
    protected function getConfigurationHash(array $configuration)
    {
        $ignoredKeys = ['paramExpanded' => true, 'URLs' => true];
        $hashRelevantConfiguration = array_diff_key($configuration, $ignoredKeys);

        return md5(serialize($hashRelevantConfiguration));
    }
2031
2032
    /**
     * Build a URL from a Page and the Query String. If the page has a Site configuration, it can be built by using
     * the Site instance.
     *
     * With a Site, the "id" parameter is dropped, "L" is handed over as "_language" and the URL is
     * generated by the site router; host, scheme and port are then replaced by those of the
     * alternative base URL (if given). Without a Site, an index.php URL with a cHash is assembled.
     *
     * @param int $pageId
     * @param string $queryString
     * @param string|null $alternativeBaseUrl
     * @param int $httpsOrHttp see tx_crawler_configuration.force_ssl (-1 forces http, 1 forces https, other values keep the scheme)
     * @return UriInterface
     * @throws \TYPO3\CMS\Core\Exception\SiteNotFoundException
     * @throws \TYPO3\CMS\Core\Routing\InvalidRouteArgumentsException
     */
    protected function getUrlFromPageAndQueryParameters(int $pageId, string $queryString, ?string $alternativeBaseUrl, int $httpsOrHttp): UriInterface
    {
        $site = GeneralUtility::makeInstance(SiteMatcher::class)->matchByPageId($pageId);
        if ($site instanceof Site) {
            $queryString = ltrim($queryString, '?&');
            $queryParts = [];
            parse_str($queryString, $queryParts);
            unset($queryParts['id']);
            // workaround as long as we don't have native language support in crawler configurations
            if (isset($queryParts['L'])) {
                $queryParts['_language'] = $queryParts['L'];
                unset($queryParts['L']);
                // The returned language is not needed here. NOTE(review): the call is kept because it
                // presumably rejects language ids unknown to the site - confirm, otherwise drop it.
                $site->getLanguageById((int)$queryParts['_language']);
            }
            $url = $site->getRouter()->generateUri($pageId, $queryParts);
            if (!empty($alternativeBaseUrl)) {
                $alternativeBaseUri = new Uri($alternativeBaseUrl);
                $url = $url->withHost($alternativeBaseUri->getHost());
                $url = $url->withScheme($alternativeBaseUri->getScheme());
                $url = $url->withPort($alternativeBaseUri->getPort());
            }
        } else {
            // Technically this is not possible with site handling, but kept for backwards-compatibility reasons
            // Once EXT:crawler is v10-only compatible, this should be removed completely
            $baseUrl = ($alternativeBaseUrl ?: GeneralUtility::getIndpEnv('TYPO3_SITE_URL'));
            $cacheHashCalculator = GeneralUtility::makeInstance(CacheHashCalculator::class);
            $queryString .= '&cHash=' . $cacheHashCalculator->generateForParameters($queryString);
            $url = rtrim($baseUrl, '/') . '/index.php' . $queryString;
            $url = new Uri($url);
        }

        if ($httpsOrHttp === -1) {
            $url = $url->withScheme('http');
        } elseif ($httpsOrHttp === 1) {
            $url = $url->withScheme('https');
        }

        return $url;
    }
2085
}
2086