Completed
Push — typo3v9 ( efc308...b63611 )
by Tomas Norre
06:24
created

CrawlerController::getUrlsForPageId()   D

Complexity

Conditions 16
Paths 96

Size

Total Lines 92

Duplication

Lines 0
Ratio 0 %

Code Coverage

Tests 46
CRAP Score 16.0585

Importance

Changes 0
Metric Value
cc 16
nc 96
nop 1
dl 0
loc 92
ccs 46
cts 49
cp 0.9388
crap 16.0585
rs 4.6278
c 0
b 0
f 0

How to fix   Long Method    Complexity   

Long Method

Small methods make your code easier to understand, in particular if combined with a good name. Besides, if your method is small, finding a good name is usually much easier.

For example, if you find yourself adding comments to a method's body, this is usually a good sign to extract the commented part to a new method, and use the comment as a starting point when coming up with a good name for this new method.

Commonly applied refactorings include:

1
<?php
2
namespace AOE\Crawler\Controller;
3
4
/***************************************************************
5
 *  Copyright notice
6
 *
7
 *  (c) 2019 AOE GmbH <[email protected]>
8
 *
9
 *  All rights reserved
10
 *
11
 *  This script is part of the TYPO3 project. The TYPO3 project is
12
 *  free software; you can redistribute it and/or modify
13
 *  it under the terms of the GNU General Public License as published by
14
 *  the Free Software Foundation; either version 3 of the License, or
15
 *  (at your option) any later version.
16
 *
17
 *  The GNU General Public License can be found at
18
 *  http://www.gnu.org/copyleft/gpl.html.
19
 *
20
 *  This script is distributed in the hope that it will be useful,
21
 *  but WITHOUT ANY WARRANTY; without even the implied warranty of
22
 *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
23
 *  GNU General Public License for more details.
24
 *
25
 *  This copyright notice MUST APPEAR in all copies of the script!
26
 ***************************************************************/
27
28
use AOE\Crawler\QueueExecutor;
29
use AOE\Crawler\Configuration\ExtensionConfigurationProvider;
30
use AOE\Crawler\Domain\Repository\ConfigurationRepository;
31
use AOE\Crawler\Domain\Repository\ProcessRepository;
32
use AOE\Crawler\Domain\Repository\QueueRepository;
33
use AOE\Crawler\Event\EventDispatcher;
34
use AOE\Crawler\Utility\IconUtility;
35
use AOE\Crawler\Utility\SignalSlotUtility;
36
use Psr\Http\Message\UriInterface;
37
use Psr\Log\LoggerAwareInterface;
38
use Psr\Log\LoggerAwareTrait;
39
use TYPO3\CMS\Backend\Tree\View\PageTreeView;
40
use TYPO3\CMS\Backend\Utility\BackendUtility;
41
use TYPO3\CMS\Core\Authentication\BackendUserAuthentication;
42
use TYPO3\CMS\Core\Core\Environment;
43
use TYPO3\CMS\Core\Database\Connection;
44
use TYPO3\CMS\Core\Database\ConnectionPool;
45
use TYPO3\CMS\Core\Database\Query\Restriction\DeletedRestriction;
46
use TYPO3\CMS\Core\Http\Uri;
47
use TYPO3\CMS\Core\Routing\SiteMatcher;
48
use TYPO3\CMS\Core\Site\Entity\Site;
49
use TYPO3\CMS\Core\TypoScript\Parser\TypoScriptParser;
50
use TYPO3\CMS\Core\Utility\DebugUtility;
51
use TYPO3\CMS\Core\Utility\GeneralUtility;
52
use TYPO3\CMS\Core\Utility\MathUtility;
53
use TYPO3\CMS\Extbase\Object\ObjectManager;
54
use TYPO3\CMS\Frontend\Page\CacheHashCalculator;
55
use TYPO3\CMS\Frontend\Page\PageRepository;
56
57
/**
58
 * Class CrawlerController
59
 *
60
 * @package AOE\Crawler\Controller
61
 */
62
class CrawlerController implements LoggerAwareInterface
63
{
64
    use LoggerAwareTrait;
65
66
    const CLI_STATUS_NOTHING_PROCCESSED = 0;
67
    const CLI_STATUS_REMAIN = 1; //queue not empty
68
    const CLI_STATUS_PROCESSED = 2; //(some) queue items where processed
69
    const CLI_STATUS_ABORTED = 4; //instance didn't finish
70
    const CLI_STATUS_POLLABLE_PROCESSED = 8;
71
72
    /**
73
     * @var integer
74
     */
75
    public $setID = 0;
76
77
    /**
78
     * @var string
79
     */
80
    public $processID = '';
81
82
    /**
83
     * @var array
84
     */
85
    public $duplicateTrack = [];
86
87
    /**
88
     * @var array
89
     */
90
    public $downloadUrls = [];
91
92
    /**
93
     * @var array
94
     */
95
    public $incomingProcInstructions = [];
96
97
    /**
98
     * @var array
99
     */
100
    public $incomingConfigurationSelection = [];
101
102
    /**
103
     * @var bool
104
     */
105
    public $registerQueueEntriesInternallyOnly = false;
106
107
    /**
108
     * @var array
109
     */
110
    public $queueEntries = [];
111
112
    /**
113
     * @var array
114
     */
115
    public $urlList = [];
116
117
    /**
118
     * @var array
119
     */
120
    public $extensionSettings = [];
121
122
    /**
123
     * Mount Point
124
     *
125
     * @var bool|string false by default; otherwise used as the "MP" URL parameter value
126
     */
127
    public $MP = false;
128
129
    /**
130
     * @var string
131
     */
132
    protected $processFilename;
133
134
    /**
135
     * Holds the internal access mode can be 'gui','cli' or 'cli_im'
136
     *
137
     * @var string
138
     */
139
    protected $accessMode;
140
141
    /**
142
     * @var BackendUserAuthentication
143
     */
144
    private $backendUser;
145
146
    /**
147
     * @var integer
148
     */
149
    private $scheduledTime = 0;
150
151
    /**
152
     * @var integer
153
     */
154
    private $reqMinute = 0;
155
156
    /**
157
     * @var bool
158
     */
159
    private $submitCrawlUrls = false;
160
161
    /**
162
     * @var bool
163
     */
164
    private $downloadCrawlUrls = false;
165
166
    /**
167
     * @var QueueRepository
168
     */
169
    protected $queueRepository;
170
171
    /**
172
     * @var ProcessRepository
173
     */
174
    protected $processRepository;
175
176
    /**
177
     * @var ConfigurationRepository
178
     */
179
    protected $configurationRepository;
180
181
    /**
182
     * @var string
183
     */
184
    protected $tableName = 'tx_crawler_queue';
185
186
    /**
187
     * @var QueueExecutor
188
     */
189
    protected $queueExecutor;
190
191
    /**
192
     * @var int
193
     */
194
    protected $maximumUrlsToCompile = 10000;
195
196
    /**
197
     * Method to get the accessMode, which can be gui, cli or cli_im
198
     *
199
     * @return string
200
     */
201 1
    public function getAccessMode()
    {
        // Plain accessor: hands back the value stored by setAccessMode(), unvalidated.
        $currentAccessMode = $this->accessMode;

        return $currentAccessMode;
    }
205
206
    /**
207
     * @param string $accessMode
208
     */
209 1
    public function setAccessMode($accessMode)
    {
        // The value is stored verbatim; it is not checked against the documented modes here.
        $this->accessMode = $accessMode;
    }
213
214
    /**
215
     * Set disabled status to prevent processes from being processed
216
     *
217
     * @param  bool $disabled (optional, defaults to true)
218
     * @return void
219
     */
220 3
    public function setDisabled($disabled = true)
    {
        $markerFile = $this->processFilename;

        if ($disabled) {
            // An empty marker file is enough: getDisabled() only tests for its existence.
            GeneralUtility::writeFile($markerFile, '');
            return;
        }

        // Re-enable: remove the marker file, if there is one.
        if (is_file($markerFile)) {
            unlink($markerFile);
        }
    }
230
231
    /**
232
     * Get disable status
233
     *
234
     * @return bool true if disabled
235
     */
236 3
    public function getDisabled()
    {
        // "Disabled" is represented purely by the presence of the process file.
        $markerFileExists = is_file($this->processFilename);

        return $markerFileExists;
    }
240
241
    /**
242
     * @param string $filenameWithPath
243
     *
244
     * @return void
245
     */
246 4
    public function setProcessFilename($filenameWithPath)
    {
        // Overrides the default path assigned in the constructor.
        $this->processFilename = $filenameWithPath;
    }
250
251
    /**
252
     * @return string
253
     */
254 1
    public function getProcessFilename()
    {
        // Path of the marker file used by setDisabled() / getDisabled().
        $markerFile = $this->processFilename;

        return $markerFile;
    }
258
259
    /************************************
260
     *
261
     * Getting URLs based on Page TSconfig
262
     *
263
     ************************************/
264
265 26
    /**
     * Wires up the repositories, the queue executor, the process marker file
     * and the extension settings.
     *
     * Settings that are missing from the extension configuration are read with
     * a null-coalescing fallback so that no "undefined index" notice is raised;
     * the effective defaults are unchanged:
     *  - countInARun:    100 when missing or not a positive integer
     *  - processLimit:   forced into 1..99, default 1
     *  - maxCompileUrls: forced into 1..1000000000, default 10000
     */
    public function __construct()
    {
        $objectManager = GeneralUtility::makeInstance(ObjectManager::class);
        $this->queueRepository = $objectManager->get(QueueRepository::class);
        $this->processRepository = $objectManager->get(ProcessRepository::class);
        $this->configurationRepository = $objectManager->get(ConfigurationRepository::class);
        $this->queueExecutor = $objectManager->get(QueueExecutor::class);

        // The global backend user may not be initialised in every context.
        $this->backendUser = $GLOBALS['BE_USER'] ?? null;
        $this->processFilename = Environment::getVarPath() . '/locks/tx_crawler.proc';

        /** @var ExtensionConfigurationProvider $configurationProvider */
        $configurationProvider = GeneralUtility::makeInstance(ExtensionConfigurationProvider::class);
        $settings = $configurationProvider->getExtensionConfiguration();
        $this->extensionSettings = is_array($settings) ? $settings : [];

        // set defaults:
        if (MathUtility::convertToPositiveInteger($this->extensionSettings['countInARun'] ?? 0) === 0) {
            $this->extensionSettings['countInARun'] = 100;
        }

        $this->extensionSettings['processLimit'] = MathUtility::forceIntegerInRange(
            $this->extensionSettings['processLimit'] ?? 0,
            1,
            99,
            1
        );
        $this->maximumUrlsToCompile = MathUtility::forceIntegerInRange(
            $this->extensionSettings['maxCompileUrls'] ?? 0,
            1,
            1000000000,
            10000
        );
    }
289
290
    /**
291
     * Sets the extensions settings (unserialized pendant of $TYPO3_CONF_VARS['EXT']['extConf']['crawler']).
292
     *
293
     * @param array $extensionSettings
294
     * @return void
295
     */
296 7
    public function setExtensionSettings(array $extensionSettings)
    {
        // Replaces the settings wholesale; the defaults applied in the constructor are not re-applied here.
        $this->extensionSettings = $extensionSettings;
    }
300
301
    /**
302
     * Check if the given page should be crawled
303
     *
304
     * @param array $pageRow
305
     * @return false|string false if the page should be crawled (not excluded), otherwise the skip message explaining why it is skipped
306
     */
307 8
    public function checkIfPageShouldBeSkipped(array $pageRow)
    {
        // Hidden pages are skipped unless the extension is configured to crawl them.
        // empty() is used because both the setting and the "hidden" column may be absent.
        if (empty($this->extensionSettings['crawlHiddenPages']) && !empty($pageRow['hidden'])) {
            return 'Because page is hidden';
        }

        $doktype = $pageRow['doktype'];

        // Doktypes 3 and 4 and everything from 199 upwards are never crawled.
        if (GeneralUtility::inList('3,4', $doktype) || $doktype >= 199) {
            return 'Because doktype is not allowed';
        }

        // Doktypes excluded through registered exclude lists.
        foreach ($GLOBALS['TYPO3_CONF_VARS']['EXTCONF']['crawler']['excludeDoktype'] ?? [] as $key => $doktypeList) {
            if (GeneralUtility::inList($doktypeList, $doktype)) {
                return 'Doktype was excluded by "' . $key . '"';
            }
        }

        // veto hook
        foreach ($GLOBALS['TYPO3_CONF_VARS']['EXTCONF']['crawler']['pageVeto'] ?? [] as $key => $func) {
            $params = [
                'pageRow' => $pageRow
            ];
            // expects "false" if page is ok and "true" or a skipMessage if this page should _not_ be crawled
            $veto = GeneralUtility::callUserFunction($func, $params, $this);
            if ($veto !== false) {
                // no need to execute other hooks if a previous one returned a veto
                return is_string($veto) ? $veto : 'Veto from hook "' . htmlspecialchars($key) . '"';
            }
        }

        return false;
    }
360
361
    /**
362
     * Wrapper method for getUrlsForPageId()
363
     * It returns an array of configurations and no urls!
364
     *
365
     * @param array $pageRow Page record with at least dok-type and uid columns.
366
     * @param string $skipMessage
367
     * @return array
368
     * @see getUrlsForPageId()
369
     */
370 4
    public function getUrlsForPageRow(array $pageRow, &$skipMessage = '')
    {
        $skipReason = $this->checkIfPageShouldBeSkipped($pageRow);

        // Page is excluded: report the reason through the by-reference argument.
        if ($skipReason !== false) {
            $skipMessage = $skipReason;

            return [];
        }

        $configurations = $this->getUrlsForPageId($pageRow['uid']);
        $skipMessage = '';

        return $configurations;
    }
384
385
    /**
386
     * This method is used to count if there are ANY unprocessed queue entries
387
     * of a given page_id and the configuration which matches a given hash.
388
     * If there are none, we can skip an inner detail check
389
     *
390
     * @param  int $uid
391
     * @param  string $configurationHash
392
     * @return boolean
393
     */
394 5
    protected function noUnprocessedQueueEntriesForPageWithConfigurationHashExist($uid, $configurationHash)
    {
        $queryBuilder = GeneralUtility::makeInstance(ConnectionPool::class)->getQueryBuilderForTable($this->tableName);
        $expression = $queryBuilder->expr();

        // Count queue rows of this page and configuration hash whose exec_time is still 0.
        $pendingCount = $queryBuilder
            ->count('*')
            ->from($this->tableName)
            ->where(
                $expression->eq('page_id', intval($uid)),
                $expression->eq('configuration_hash', $queryBuilder->createNamedParameter($configurationHash)),
                $expression->eq('exec_time', 0)
            )
            ->execute()
            ->fetchColumn();

        // Any falsy result (zero count or no row) means nothing is pending.
        return !$pendingCount;
    }
416
417
    /**
418
     * Creates a list of URLs from input array (and submits them to queue if asked for)
419
     * See Web > Info module script + "indexed_search"'s crawler hook-client using this!
420
     *
421
     * @param    array        Information about URLs from pageRow to crawl.
422
     * @param    array        Page row
423
     * @param    integer        Unix time to schedule indexing to, typically time()
424
     * @param    integer        Number of requests per minute (creates the interleave between requests)
425
     * @param    boolean        If set, submits the URLs to queue
426
     * @param    boolean        If set (and submitcrawlUrls is false) will fill $downloadUrls with entries)
427
     * @param    array        Array which is passed by reference and contains an id per url to ensure we will not crawl duplicates
428
     * @param    array        Array which will be filled with URLS for download if flag is set.
429
     * @param    array        Array of processing instructions
430
     * @return    string        List of URLs (meant for display in backend module)
431
     *
432
     */
433 2
    public function urlListFromUrlArray(
        array $vv,
        array $pageRow,
        $scheduledTime,
        $reqMinute,
        $submitCrawlUrls,
        $downloadCrawlUrls,
        array &$duplicateTrack,
        array &$downloadUrls,
        array $incomingProcInstructions
    ) {
        if (!is_array($vv['URLs'] ?? null)) {
            return 'ERROR - no URL generated';
        }
        $urlLog = [];
        $pageId = (int)$pageRow['uid'];
        $configurationHash = $this->getConfigurationHash($vv);
        $skipInnerCheck = $this->noUnprocessedQueueEntriesForPageWithConfigurationHashExist($pageId, $configurationHash);

        // Not every configuration source sets all of these keys, so read them defensively.
        $subCfg = (array)($vv['subCfg'] ?? []);
        $procInstrFilter = $subCfg['procInstrFilter'] ?? '';
        $userGroups = $subCfg['userGroups'] ?? '';

        // The processing instruction filter does not depend on the individual URL:
        // when it rejects the configuration, no URL would be listed at all.
        if (!$this->drawURLs_PIfilter($procInstrFilter, $incomingProcInstructions)) {
            return '';
        }

        // Seconds between two scheduled requests. A non-positive rate disables the
        // interleave instead of triggering a division by zero.
        $interleave = $reqMinute > 0 ? 60 / $reqMinute : 0;

        foreach ($vv['URLs'] as $urlQuery) {
            $url = (string)$this->getUrlFromPageAndQueryParameters(
                $pageId,
                $urlQuery,
                $subCfg['baseUrl'] ?? null,
                $subCfg['force_ssl'] ?? 0
            );

            // Create key by which to determine unique-ness:
            $uKey = $url . '|' . $userGroups . '|' . $procInstrFilter;

            if (isset($duplicateTrack[$uKey])) {
                // if the url key is already registered just display it and do not resubmit it
                $urlLog[] = '<em><span class="text-muted">' . htmlspecialchars($url) . '</span></em>';
            } else {
                // Scheduled time, rounded down to the full minute:
                $schTime = $scheduledTime + round(count($duplicateTrack) * $interleave);
                $schTime = (int)(floor($schTime / 60) * 60);
                $formattedDate = BackendUtility::datetime($schTime);
                $this->urlList[] = '[' . $formattedDate . '] ' . $url;
                $urlList = '[' . $formattedDate . '] ' . htmlspecialchars($url);

                // Submit for crawling!
                if ($submitCrawlUrls) {
                    // NOTE(review): the queue entry receives $scheduledTime, not the interleaved
                    // $schTime shown in the log line - confirm this is intended.
                    $added = $this->addUrl(
                        $pageId,
                        $url,
                        $subCfg,
                        $scheduledTime,
                        $configurationHash,
                        $skipInnerCheck
                    );
                    if ($added === false) {
                        $urlList .= ' (URL already existed)';
                    }
                } elseif ($downloadCrawlUrls) {
                    $downloadUrls[$url] = $url;
                }
                $urlLog[] = $urlList;
            }
            $duplicateTrack[$uKey] = true;
        }

        return implode('<br>', $urlLog);
    }
501
502
    /**
503
     * Returns true if input processing instruction is among registered ones.
504
     *
505
     * @param string $piString PI to test
506
     * @param array $incomingProcInstructions Processing instructions
507
     * @return boolean
508
     */
509 5
    public function drawURLs_PIfilter($piString, array $incomingProcInstructions)
    {
        // Without any incoming instructions every configuration is accepted.
        $accepted = empty($incomingProcInstructions);

        foreach ($incomingProcInstructions as $instruction) {
            // One incoming instruction contained in the comma list is sufficient.
            if (GeneralUtility::inList($piString, $instruction)) {
                $accepted = true;
                break;
            }
        }

        return $accepted;
    }
522
523 2
    /**
     * Returns the page TSconfig for the given page, or for the mount point page
     * when $this->MP is set, after passing it through the registered
     * "getPageTSconfigForId" hooks.
     *
     * @param int $id Page ID
     * @return array Page TSconfig, possibly modified by hooks (they receive it by reference)
     */
    public function getPageTSconfigForId($id)
    {
        if (!$this->MP) {
            $pageTSconfig = BackendUtility::getPagesTSconfig($id);
        } else {
            // $this->MP is split on "-" and the second part is used as the page to read TSconfig from.
            // A value without "-" resolves to 0 instead of raising an "undefined offset" notice.
            $mountPointParts = explode('-', (string)$this->MP);
            $mountPointId = (int)($mountPointParts[1] ?? 0);
            $pageTSconfig = BackendUtility::getPagesTSconfig($mountPointId);
        }

        // Call a hook to alter configuration
        $hooks = $GLOBALS['TYPO3_CONF_VARS']['EXTCONF']['crawler']['getPageTSconfigForId'] ?? null;
        if (is_array($hooks)) {
            $params = [
                'pageId' => $id,
                'pageTSConfig' => &$pageTSconfig
            ];
            foreach ($hooks as $userFunc) {
                GeneralUtility::callUserFunction($userFunc, $params, $this);
            }
        }

        return $pageTSconfig;
    }
544
545
    /**
546
     * This method returns an array of configurations.
547
     * And no urls!
548
     *
549
     * @param integer $pageId Page ID
0 ignored issues
show
Bug introduced by
There is no parameter named $id. Was it maybe removed?

This check looks for PHPDoc comments describing methods or function parameters that do not exist on the corresponding method or function.

Consider the following example. The parameter $italy is not defined by the method finale(...).

/**
 * @param array $germany
 * @param array $island
 * @param array $italy
 */
function finale($germany, $island) {
    return "2:1";
}

The most likely cause is that the parameter was removed, but the annotation was not.

Loading history...
550
     * @return array
551
     */
552 2
    public function getUrlsForPageId($pageId)
    {
        // Get page TSconfig for page ID
        $pageTSconfig = $this->getPageTSconfigForId($pageId);

        $res = [];

        // Part 1: configurations defined in page TSconfig (tx_crawler.crawlerCfg.paramSets)
        $crawlerCfg = $pageTSconfig['tx_crawler.']['crawlerCfg.']['paramSets.'] ?? [];
        foreach ($crawlerCfg as $key => $values) {
            // Only array entries ("name.") are treated as a configuration; the scalar
            // "name" entry is read below as the parameter string.
            if (!is_array($values)) {
                continue;
            }
            $key = str_replace('.', '', $key);

            // Sub configuration for a single configuration string:
            $subCfg = (array)($crawlerCfg[$key . '.'] ?? []);
            $subCfg['key'] = $key;

            // Normalise the filter list only when one is configured; the key is left
            // untouched (not created) otherwise.
            $procInstrFilter = (string)($subCfg['procInstrFilter'] ?? '');
            if ($procInstrFilter !== '') {
                $subCfg['procInstrFilter'] = implode(',', GeneralUtility::trimExplode(',', $procInstrFilter));
            }

            // Skip page-specific configurations that do not list the current page:
            $pidsOnly = (string)($subCfg['pidsOnly'] ?? '');
            $pidOnlyList = implode(',', GeneralUtility::trimExplode(',', $pidsOnly, true));
            if ($pidsOnly !== '' && !GeneralUtility::inList($pidOnlyList, $pageId)) {
                continue;
            }

            // Explode, process etc.:
            $res[$key] = [];
            $res[$key]['subCfg'] = $subCfg;
            $res[$key]['paramParsed'] = GeneralUtility::explodeUrl2Array((string)($crawlerCfg[$key] ?? ''));
            $res[$key]['paramExpanded'] = $this->expandParameters($res[$key]['paramParsed'], $pageId);
            $res[$key]['origin'] = 'pagets';

            // recognize MP value
            $startUrl = '?id=' . $pageId;
            if ($this->MP) {
                $startUrl .= '&MP=' . $this->MP;
            }
            $res[$key]['URLs'] = $this->compileUrls($res[$key]['paramExpanded'], [$startUrl]);
        }

        // Part 2: configuration from tx_crawler_configuration records up the rootline
        $crawlerConfigurations = $this->configurationRepository->getCrawlerConfigurationRecordsFromRootLine($pageId);
        foreach ($crawlerConfigurations as $configurationRecord) {
            // check access to the configuration record
            $hasAccess = empty($configurationRecord['begroups'])
                || $GLOBALS['BE_USER']->isAdmin()
                || $this->hasGroupAccess($GLOBALS['BE_USER']->user['usergroup_cached_list'], $configurationRecord['begroups']);
            if (!$hasAccess) {
                continue;
            }

            // Skip page-specific records that do not list the current page:
            $pidsOnly = (string)($configurationRecord['pidsonly'] ?? '');
            $pidOnlyList = implode(',', GeneralUtility::trimExplode(',', $pidsOnly, true));
            if ($pidsOnly !== '' && !GeneralUtility::inList($pidOnlyList, $pageId)) {
                continue;
            }

            $key = $configurationRecord['name'];

            // don't overwrite previously defined paramSets
            if (isset($res[$key])) {
                continue;
            }

            /** @var TypoScriptParser $TSparserObject */
            $TSparserObject = GeneralUtility::makeInstance(TypoScriptParser::class);
            $TSparserObject->parse($configurationRecord['processing_instruction_parameters_ts']);

            $subCfg = [
                'procInstrFilter' => $configurationRecord['processing_instruction_filter'],
                'procInstrParams.' => $TSparserObject->setup,
                'baseUrl' => $configurationRecord['base_url'],
                'force_ssl' => (int)$configurationRecord['force_ssl'],
                'userGroups' => $configurationRecord['fegroups'],
                'exclude' => $configurationRecord['exclude'],
                'key' => $key
            ];

            // Loose comparison kept on purpose: the element types returned by
            // expandExcludeString() are not visible here - TODO confirm before switching to strict.
            if (in_array($pageId, $this->expandExcludeString($subCfg['exclude']))) {
                continue;
            }

            $res[$key] = [];
            $res[$key]['subCfg'] = $subCfg;
            $res[$key]['paramParsed'] = GeneralUtility::explodeUrl2Array($configurationRecord['configuration']);
            $res[$key]['paramExpanded'] = $this->expandParameters($res[$key]['paramParsed'], $pageId);
            $res[$key]['URLs'] = $this->compileUrls($res[$key]['paramExpanded'], ['?id=' . $pageId]);
            $res[$key]['origin'] = 'tx_crawler_configuration_' . $configurationRecord['uid'];
        }

        // Hooks may alter the collected configurations; they receive $res by reference.
        foreach ($GLOBALS['TYPO3_CONF_VARS']['EXTCONF']['crawler']['processUrls'] ?? [] as $func) {
            $params = [
                'res' => &$res,
            ];
            GeneralUtility::callUserFunction($func, $params, $this);
        }

        return $res;
    }
644
645
    /**
646
     * Find all configurations of subpages of a page
647
     *
648
     * @param int $rootid
649
     * @param $depth
650
     * @return array
651
     *
652
     * TODO: Write Functional Tests
653
     */
654
    public function getConfigurationsForBranch(int $rootid, $depth)
    {
        // Step 1: names of the parameter sets defined in the page TSconfig of the root page.
        $configurationNames = [];
        $rootPageTsConfig = $this->getPageTSconfigForId($rootid);
        $paramSets = $rootPageTsConfig['tx_crawler.']['crawlerCfg.']['paramSets.'] ?? [];
        foreach ($paramSets as $setName => $setConfiguration) {
            if (is_array($setConfiguration)) {
                // Strip the single trailing dot of the TypoScript array key.
                $configurationNames[] = substr($setName, -1) == '.' ? substr($setName, 0, -1) : $setName;
            }
        }

        // Step 2: collect page uids from the rootline of the root page ...
        $pageUids = [];
        foreach (BackendUtility::BEgetRootLine($rootid) as $rootLinePage) {
            $pageUids[] = $rootLinePage['uid'];
        }

        // ... and from the page tree below it, limited by $depth and the backend user's page permissions.
        /* @var PageTreeView $pageTree */
        $pageTree = GeneralUtility::makeInstance(PageTreeView::class);
        $pageTree->init('AND ' . $GLOBALS['BE_USER']->getPagePermsClause(1));
        $pageTree->getTree($rootid, $depth, '');
        foreach ($pageTree->tree as $treeNode) {
            $pageUids[] = $treeNode['row']['uid'];
        }

        // Step 3: names of the non-deleted configuration records stored on any of those pages.
        $queryBuilder = $this->getQueryBuilder('tx_crawler_configuration');
        $queryBuilder->getRestrictions()
            ->removeAll()
            ->add(GeneralUtility::makeInstance(DeletedRestriction::class));

        $pidConstraint = $queryBuilder->expr()->in(
            'pid',
            $queryBuilder->createNamedParameter($pageUids, Connection::PARAM_INT_ARRAY)
        );
        $statement = $queryBuilder
            ->select('name')
            ->from('tx_crawler_configuration')
            ->where($pidConstraint)
            ->execute();

        while ($record = $statement->fetch()) {
            $configurationNames[] = $record['name'];
        }

        return $configurationNames;
    }
698
699
    /**
700
     * Get querybuilder for given table
701
     *
702
     * @param string $table
703
     * @return \TYPO3\CMS\Core\Database\Query\QueryBuilder
704
     */
705 9
    private function getQueryBuilder(string $table)
    {
        // Resolve the connection pool first, then ask it for a builder bound to $table.
        $connectionPool = GeneralUtility::makeInstance(ConnectionPool::class);

        return $connectionPool->getQueryBuilderForTable($table);
    }
709
710
    /**
711
     * Check if a user has access to an item
712
     * (e.g. get the group list of the current logged in user from $GLOBALS['TSFE']->gr_list)
713
     *
714
     * @see \TYPO3\CMS\Frontend\Page\PageRepository::getMultipleGroupsWhereClause()
715
     * @param  string $groupList    Comma-separated list of (fe_)group UIDs from a user
716
     * @param  string $accessList   Comma-separated list of (fe_)group UIDs of the item to access
717
     * @return bool                 TRUE if at least one of the users group UIDs is in the access list or the access list is empty
718
     */
719 3
    public function hasGroupAccess($groupList, $accessList)
    {
        // An empty access list grants access to everybody.
        $granted = empty($accessList);

        if (!$granted) {
            // Otherwise a single group of the user found in the access list is sufficient.
            foreach (GeneralUtility::intExplode(',', $groupList) as $userGroupUid) {
                if (GeneralUtility::inList($accessList, $userGroupUid)) {
                    $granted = true;
                    break;
                }
            }
        }

        return $granted;
    }
731
732
    /**
733
     * Will expand the parameters configuration to individual values. This follows a certain syntax of the value of each parameter.
734
     * Syntax of values:
735
     * - Basically: If the value is wrapped in [...] it will be expanded according to the following syntax, otherwise the value is taken literally
736
     * - Configuration is split by "|" and the parts are processed individually and finally added together
737
     * - For each configuration part:
738
     *         - "[int]-[int]" = Integer range, will be expanded to all values in between, values included, starting from low to high (max. 1000). Example "1-34" or "-40--30"
739
     *         - "_TABLE:[TCA table name];[_PID:[optional page id, default is current page]];[_ENABLELANG:1]" = Look up of table records from PID, filtering out deleted records. Example "_TABLE:tt_content; _PID:123"
740
     *        _ENABLELANG:1 picks only original records without their language overlays
741
     *         - Default: Literal value
742
     *
743
     * @param array $paramArray Array with key (GET var name) and values (value of GET var which is configuration for expansion)
744
     * @param integer $pid Current page ID
745
     * @return array
746
     *
747
     * TODO: Write Functional Tests
748
     */
749 2
    public function expandParameters($paramArray, $pid)
750
    {
751
        // Traverse parameter names:
752 2
        foreach ($paramArray as $p => $v) {
753 2
            $v = trim($v);
754
755
            // If value is encapsulated in square brackets it means there are some ranges of values to find, otherwise the value is literal
756 2
            if (substr($v, 0, 1) === '[' && substr($v, -1) === ']') {
757
                // So, find the value inside brackets and reset the paramArray value as an array.
758 2
                $v = substr($v, 1, -1);
759 2
                $paramArray[$p] = [];
760
761
                // Explode parts and traverse them:
762 2
                $parts = explode('|', $v);
763 2
                foreach ($parts as $pV) {
764
765
                        // Look for integer range: (fx. 1-34 or -40--30 // reads minus 40 to minus 30)
766 2
                    if (preg_match('/^(-?[0-9]+)\s*-\s*(-?[0-9]+)$/', trim($pV), $reg)) {
767
768
                        // Swap if first is larger than last:
769
                        if ($reg[1] > $reg[2]) {
770
                            $temp = $reg[2];
771
                            $reg[2] = $reg[1];
772
                            $reg[1] = $temp;
773
                        }
774
775
                        // Traverse range, add values:
776
                        $runAwayBrake = 1000; // Limit to size of range!
777
                        for ($a = $reg[1]; $a <= $reg[2];$a++) {
778
                            $paramArray[$p][] = $a;
779
                            $runAwayBrake--;
780
                            if ($runAwayBrake <= 0) {
781
                                break;
782
                            }
783
                        }
784 2
                    } elseif (substr(trim($pV), 0, 7) == '_TABLE:') {
785
786
                        // Parse parameters:
787
                        $subparts = GeneralUtility::trimExplode(';', $pV);
788
                        $subpartParams = [];
789
                        foreach ($subparts as $spV) {
790
                            list($pKey, $pVal) = GeneralUtility::trimExplode(':', $spV);
791
                            $subpartParams[$pKey] = $pVal;
792
                        }
793
794
                        // Table exists:
795
                        if (isset($GLOBALS['TCA'][$subpartParams['_TABLE']])) {
796
                            $lookUpPid = isset($subpartParams['_PID']) ? intval($subpartParams['_PID']) : $pid;
797
                            $pidField = isset($subpartParams['_PIDFIELD']) ? trim($subpartParams['_PIDFIELD']) : 'pid';
798
                            $where = isset($subpartParams['_WHERE']) ? $subpartParams['_WHERE'] : '';
799
                            $addTable = isset($subpartParams['_ADDTABLE']) ? $subpartParams['_ADDTABLE'] : '';
800
801
                            $fieldName = $subpartParams['_FIELD'] ? $subpartParams['_FIELD'] : 'uid';
802
                            if ($fieldName === 'uid' || $GLOBALS['TCA'][$subpartParams['_TABLE']]['columns'][$fieldName]) {
803
                                $queryBuilder = $this->getQueryBuilder($subpartParams['_TABLE']);
804
805
                                $queryBuilder->getRestrictions()
806
                                    ->removeAll()
807
                                    ->add(GeneralUtility::makeInstance(DeletedRestriction::class));
808
809
                                $queryBuilder
810
                                    ->select($fieldName)
811
                                    ->from($subpartParams['_TABLE'])
812
                                    // TODO: Check if this works as intended!
813
                                    ->add('from', $addTable)
814
                                    ->where(
815
                                        $queryBuilder->expr()->eq($queryBuilder->quoteIdentifier($pidField), $queryBuilder->createNamedParameter($lookUpPid, \PDO::PARAM_INT)),
816
                                        $where
817
                                    );
818
                                $transOrigPointerField = $GLOBALS['TCA'][$subpartParams['_TABLE']]['ctrl']['transOrigPointerField'];
819
820
                                if ($subpartParams['_ENABLELANG'] && $transOrigPointerField) {
821
                                    $queryBuilder->andWhere(
822
                                        $queryBuilder->expr()->lte(
823
                                            $queryBuilder->quoteIdentifier($transOrigPointerField),
824
                                            0
825
                                        )
826
                                    );
827
                                }
828
829
                                $statement = $queryBuilder->execute();
830
831
                                $rows = [];
832
                                while ($row = $statement->fetch()) {
833
                                    $rows[$fieldName] = $row;
834
                                }
835
836
                                if (is_array($rows)) {
837
                                    $paramArray[$p] = array_merge($paramArray[$p], array_keys($rows));
838
                                }
839
                            }
840
                        }
841
                    } else { // Just add value:
842 2
                        $paramArray[$p][] = $pV;
843
                    }
844
                    // Hook for processing own expandParameters place holder
845 2
                    if (is_array($GLOBALS['TYPO3_CONF_VARS']['SC_OPTIONS']['crawler/class.tx_crawler_lib.php']['expandParameters'])) {
846
                        $_params = [
847
                            'pObj' => &$this,
848
                            'paramArray' => &$paramArray,
849
                            'currentKey' => $p,
850
                            'currentValue' => $pV,
851
                            'pid' => $pid
852
                        ];
853
                        foreach ($GLOBALS['TYPO3_CONF_VARS']['SC_OPTIONS']['crawler/class.tx_crawler_lib.php']['expandParameters'] as $key => $_funcRef) {
854
                            GeneralUtility::callUserFunction($_funcRef, $_params, $this);
855
                        }
856
                    }
857
                }
858
859
                // Make unique set of values and sort array by key:
860 2
                $paramArray[$p] = array_unique($paramArray[$p]);
861 2
                ksort($paramArray);
862
            } else {
863
                // Set the literal value as only value in array:
864 2
                $paramArray[$p] = [$v];
865
            }
866
        }
867
868 2
        return $paramArray;
869
    }
870
871
    /**
     * Builds URLs from a parameter array (the output of expandParameters()).
     * Each call consumes the first parameter, appends every one of its values to every URL
     * collected so far and then recurses with the remaining parameters, so the number of URLs
     * is the product of the number of values per key.
     *
     * @param array $paramArray Output of expandParameters(): Array with keys (GET var names) and for each an array of values
     * @param array $urls URLs accumulated in this array (for recursion)
     * @return array
     */
    public function compileUrls($paramArray, array $urls)
    {
        // Nothing left to append: the recursion ends here.
        if (empty($paramArray)) {
            return $urls;
        }

        // Take the first parameter name and its values off the stack:
        reset($paramArray);
        $parameterName = key($paramArray);
        $parameterValues = array_shift($paramArray);

        $expandedUrls = [];
        foreach ($urls as $baseUrl) {
            foreach ($parameterValues as $parameterValue) {
                // An empty value adds nothing to the URL, but the URL itself is still kept.
                $querySuffix = '';
                if (strcmp($parameterValue, '')) {
                    $querySuffix = '&' . rawurlencode($parameterName) . '=' . rawurlencode($parameterValue);
                }
                $expandedUrls[] = $baseUrl . $querySuffix;

                // Leaves the value loop only; the next base URL is still processed.
                if (count($expandedUrls) > $this->maximumUrlsToCompile) {
                    break;
                }
            }
        }

        return $this->compileUrls($paramArray, $expandedUrls);
    }
902
903
    /************************************
904
     *
905
     * Crawler log
906
     *
907
     ************************************/
908
909
    /**
     * Return array of records from crawler queue for input page ID
     *
     * @param integer $id Page ID for which to look up log entries.
     * @param string $filter Filter: "pending" => all that is not yet run (exec_time = 0), "finished" => all complete ones (exec_time > 0), anything else => all entries
     * @param boolean $doFlush If TRUE, the matching entries are DELETED(!) instead of selected and an empty array is returned
     * @param boolean $doFullFlush Only used together with $doFlush: if TRUE, the whole queue is flushed regardless of page ID and filter
     * @param integer $itemsPerPage Limit the amount of entries per page, default is 10; values <= 0 disable the limit
     * @return array
     */
    public function getLogEntriesForPageId($id, $filter = '', $doFlush = false, $doFullFlush = false, $itemsPerPage = 10)
    {
        $queryBuilder = GeneralUtility::makeInstance(ConnectionPool::class)->getQueryBuilderForTable($this->tableName);
        $queryBuilder
            ->select('*')
            ->from($this->tableName)
            ->where(
                $queryBuilder->expr()->eq('page_id', $queryBuilder->createNamedParameter($id, \PDO::PARAM_INT))
            )
            ->orderBy('scheduled', 'DESC');

        $expressionBuilder = GeneralUtility::makeInstance(ConnectionPool::class)
            ->getConnectionForTable($this->tableName)
            ->getExpressionBuilder();

        // AND-composite that collects the same conditions for the flush (delete) statement.
        $query = $expressionBuilder->andX();

        switch ($filter) {
            case 'pending':
                $queryBuilder->andWhere($queryBuilder->expr()->eq('exec_time', 0));
                $query->add($expressionBuilder->eq('exec_time', 0));
                break;
            case 'finished':
                $queryBuilder->andWhere($queryBuilder->expr()->gt('exec_time', 0));
                $query->add($expressionBuilder->gt('exec_time', 0));
                break;
        }

        // FIXME: Write unit test that ensures that the right records are deleted.
        if ($doFlush) {
            $query->add($expressionBuilder->eq('page_id', intval($id)));
            $this->flushQueue($doFullFlush ? '1=1' : $query);

            return [];
        }

        if ($itemsPerPage > 0) {
            $queryBuilder->setMaxResults((int)$itemsPerPage);
        }

        return $queryBuilder->execute()->fetchAll();
    }
963
964
    /**
     * Return array of records from crawler queue for input set ID
     *
     * @param integer $set_id Set ID for which to look up log entries.
     * @param string $filter Filter: "pending" => all that is not yet run (exec_time = 0), "finished" => all complete ones (exec_time > 0), anything else => all entries
     * @param boolean $doFlush If TRUE, the matching entries are DELETED(!) instead of selected and an empty array is returned
     * @param boolean $doFullFlush Only used together with $doFlush: if TRUE, the whole queue is flushed regardless of set ID and filter
     * @param integer $itemsPerPage Limit the amount of entries per page, default is 10; values <= 0 disable the limit
     * @return array
     */
    public function getLogEntriesForSetId($set_id, $filter = '', $doFlush = false, $doFullFlush = false, $itemsPerPage = 10)
    {
        $queryBuilder = GeneralUtility::makeInstance(ConnectionPool::class)->getQueryBuilderForTable($this->tableName);
        $queryBuilder
            ->select('*')
            ->from($this->tableName)
            ->where(
                $queryBuilder->expr()->eq('set_id', $queryBuilder->createNamedParameter($set_id, \PDO::PARAM_INT))
            )
            ->orderBy('scheduled', 'DESC');

        $expressionBuilder = GeneralUtility::makeInstance(ConnectionPool::class)
            ->getConnectionForTable($this->tableName)
            ->getExpressionBuilder();

        // AND-composite that collects the same conditions for the flush (delete) statement.
        $query = $expressionBuilder->andX();

        // FIXME: Write Unit tests for Filters
        switch ($filter) {
            case 'pending':
                $queryBuilder->andWhere($queryBuilder->expr()->eq('exec_time', 0));
                $query->add($expressionBuilder->eq('exec_time', 0));
                break;
            case 'finished':
                $queryBuilder->andWhere($queryBuilder->expr()->gt('exec_time', 0));
                $query->add($expressionBuilder->gt('exec_time', 0));
                break;
        }

        // FIXME: Write unit test that ensures that the right records are deleted.
        if ($doFlush) {
            $query->add($expressionBuilder->eq('set_id', intval($set_id)));
            // An empty filter makes flushQueue() remove every entry.
            $this->flushQueue($doFullFlush ? '' : $query);

            return [];
        }

        if ($itemsPerPage > 0) {
            $queryBuilder->setMaxResults((int)$itemsPerPage);
        }

        return $queryBuilder->execute()->fetchAll();
    }
1017
1018
    /**
     * Removes queue entries
     *
     * If an observer is registered for the "queueEntryFlush" event, the event is posted once per
     * affected set_id (together with the matching entries of that set) before the entries are deleted.
     *
     * @param string $where SQL related filter for the entries which should be removed; an empty filter removes all entries
     * @return void
     */
    protected function flushQueue($where = '')
    {
        $realWhere = strlen($where) > 0 ? $where : '1=1';

        if (EventDispatcher::getInstance()->hasObserver('queueEntryFlush')) {
            // Each statement gets its own query builder: from() appends to the builder's FROM part,
            // so reusing one instance would list the table more than once.
            // groupBy() provides the distinct set_ids; select() only accepts plain field names.
            $groups = $this->getQueryBuilder($this->tableName)
                ->select('set_id')
                ->from($this->tableName)
                ->where($realWhere)
                ->groupBy('set_id')
                ->execute()
                ->fetchAll();
            if (is_array($groups)) {
                foreach ($groups as $group) {
                    $subSetQueryBuilder = $this->getQueryBuilder($this->tableName);
                    // NOTE(review): other queries in this class select "qid" from the queue table - confirm that "uid" exists.
                    $subSet = $subSetQueryBuilder
                        ->select('uid', 'set_id')
                        ->from($this->tableName)
                        ->where(
                            $realWhere,
                            $subSetQueryBuilder->expr()->eq(
                                'set_id',
                                $subSetQueryBuilder->createNamedParameter($group['set_id'], \PDO::PARAM_INT)
                            )
                        )
                        ->execute()
                        ->fetchAll();
                    EventDispatcher::getInstance()->post('queueEntryFlush', $group['set_id'], $subSet);
                }
            }
        }

        $this->getQueryBuilder($this->tableName)
            ->delete($this->tableName)
            ->where($realWhere)
            ->execute();
    }
1058
1059
    /**
     * Adds a call back entry to the queue (typically called from hooks, see indexed search class "class.crawler.php")
     *
     * The call back reference is stored in the serialized parameters under the key "_CALLBACKOBJ".
     *
     * @param integer $setId Set ID
     * @param array $params Parameters to pass to call back function; anything that is not an array is replaced by an empty array
     * @param string $callBack Call back object reference, eg. 'EXT:indexed_search/class.crawler.php:&tx_indexedsearch_crawler'
     * @param integer $page_id Page ID to attach it to
     * @param integer $schedule Time at which to activate; 0 means the current time
     * @return void
     */
    public function addQueueEntry_callBack($setId, $params, $callBack, $page_id = 0, $schedule = 0)
    {
        $callBackParameters = is_array($params) ? $params : [];
        $callBackParameters['_CALLBACKOBJ'] = $callBack;

        // Fall back to "now" when no schedule time was given:
        $scheduledTime = intval($schedule);
        if (!$scheduledTime) {
            $scheduledTime = $this->getCurrentTime();
        }

        $queueEntry = [
            'page_id' => intval($page_id),
            'parameters' => serialize($callBackParameters),
            'scheduled' => $scheduledTime,
            'exec_time' => 0,
            'set_id' => intval($setId),
            'result_data' => '',
        ];

        $connection = GeneralUtility::makeInstance(ConnectionPool::class)->getConnectionForTable('tx_crawler_queue');
        $connection->insert('tx_crawler_queue', $queueEntry);
    }
1089
1090
    /************************************
1091
     *
1092
     * URL setting
1093
     *
1094
     ************************************/
1095
1096
    /**
     * Puts a URL into the crawler queue.
     *
     * Depending on $this->registerQueueEntriesInternallyOnly the entry is either only collected in
     * $this->queueEntries or inserted into tx_crawler_queue, unless an equal entry is already found.
     * The events "urlAddedToQueue" and "duplicateUrlInQueue" are posted accordingly.
     *
     * @param integer $id Page ID
     * @param string $url Complete URL
     * @param array $subCfg Sub configuration array (from TS config)
     * @param integer $tstamp Scheduled-time
     * @param string $configurationHash (optional) configuration hash
     * @param bool $skipInnerDuplicationCheck (optional) skip inner duplication check
     * @return bool TRUE only if a new row was inserted into the queue table
     */
    public function addUrl(
        $id,
        $url,
        array $subCfg,
        $tstamp,
        $configurationHash = '',
        $skipInnerDuplicationCheck = false
    ) {
        // Creating parameters:
        $parameters = ['url' => $url];

        // fe user group simulation:
        $userGroupIds = array_unique(GeneralUtility::intExplode(',', $subCfg['userGroups'], true));
        $userGroupList = implode(',', $userGroupIds);
        if ($userGroupList) {
            $parameters['feUserGroupList'] = $userGroupList;
        }

        // Setting processing instructions
        $parameters['procInstructions'] = GeneralUtility::trimExplode(',', $subCfg['procInstrFilter']);
        if (is_array($subCfg['procInstrParams.'])) {
            $parameters['procInstrParams'] = $subCfg['procInstrParams.'];
        }

        // Compile value array:
        $serializedParameters = serialize($parameters);
        $fieldArray = [
            'page_id' => (int)$id,
            'parameters' => $serializedParameters,
            'parameters_hash' => GeneralUtility::shortMD5($serializedParameters),
            'configuration_hash' => $configurationHash,
            'scheduled' => $tstamp,
            'exec_time' => 0,
            'set_id' => intval($this->setID),
            'result_data' => '',
            'configuration' => $subCfg['key'],
        ];

        if ($this->registerQueueEntriesInternallyOnly) {
            // The entry is only registered and not stored to the database
            $this->queueEntries[] = $fieldArray;

            return false;
        }

        // Check if there is already an equal entry (unless the check is skipped)
        $duplicates = [];
        if (!$skipInnerDuplicationCheck) {
            $duplicates = $this->getDuplicateRowsIfExist($tstamp, $fieldArray);
        }

        if (!empty($duplicates)) {
            EventDispatcher::getInstance()->post('duplicateUrlInQueue', $this->setID, ['rows' => $duplicates, 'fieldArray' => $fieldArray]);

            return false;
        }

        $connectionForCrawlerQueue = GeneralUtility::makeInstance(ConnectionPool::class)->getConnectionForTable('tx_crawler_queue');
        $connectionForCrawlerQueue->insert(
            'tx_crawler_queue',
            $fieldArray
        );
        $uid = $connectionForCrawlerQueue->lastInsertId('tx_crawler_queue', 'qid');
        EventDispatcher::getInstance()->post('urlAddedToQueue', $this->setID, ['uid' => $uid, 'fieldArray' => $fieldArray]);

        return true;
    }
1175
1176
    /**
     * This method determines duplicates for a queue entry with the same parameters and this timestamp.
     * If the timestamp is in the past, it will check if there is any unprocessed queue entry in the past.
     * If the timestamp is in the future it will check, if the queued entry has exactly the same timestamp
     *
     * Only entries that are still unprocessed (exec_time = 0 and process_id = 0) and that match the
     * page_id and parameters_hash of $fieldArray are considered duplicates.
     *
     * @param int $tstamp
     * @param array $fieldArray Reads the keys "page_id" and "parameters_hash"
     *
     * @return array List of qid values of the duplicate entries (empty if there are none)
     *
     * TODO: Write Functional Tests
     */
    protected function getDuplicateRowsIfExist($tstamp, $fieldArray)
    {
        $rows = [];

        $currentTime = $this->getCurrentTime();

        $queryBuilder = GeneralUtility::makeInstance(ConnectionPool::class)->getQueryBuilderForTable($this->tableName);
        $queryBuilder
            ->select('qid')
            ->from('tx_crawler_queue');

        if ($tstamp <= $currentTime) {
            // The entry is scheduled with "now" (or earlier)
            if ($this->extensionSettings['enableTimeslot']) {
                $timeBegin = $currentTime - 100;
                $timeEnd = $currentTime + 100;
                $queryBuilder
                    ->where(
                        'scheduled BETWEEN ' . $timeBegin . ' AND ' . $timeEnd . ''
                    )
                    ->orWhere(
                        $queryBuilder->expr()->lte('scheduled', $currentTime)
                    );
            } else {
                $queryBuilder
                    ->where(
                        $queryBuilder->expr()->lte('scheduled', $currentTime)
                    );
            }
        } else {
            // Entries with a timestamp in the future need to have the same schedule time
            $queryBuilder
                ->where(
                    $queryBuilder->expr()->eq('scheduled', $queryBuilder->createNamedParameter($tstamp, \PDO::PARAM_INT))
                );
        }

        // Only unprocessed entries count as duplicates; already executed ones must not block re-queueing.
        $statement = $queryBuilder
            ->andWhere('exec_time = 0')
            ->andWhere('process_id = 0')
            ->andWhere($queryBuilder->expr()->eq('page_id', $queryBuilder->createNamedParameter($fieldArray['page_id'], \PDO::PARAM_INT)))
            ->andWhere($queryBuilder->expr()->eq('parameters_hash', $queryBuilder->createNamedParameter($fieldArray['parameters_hash'], \PDO::PARAM_STR)))
            ->execute();

        while ($row = $statement->fetch()) {
            $rows[] = $row['qid'];
        }

        return $rows;
    }
1237
1238
    /**
     * Returns the current system time as reported by time()
     *
     * @return int Unix timestamp
     */
    public function getCurrentTime()
    {
        $now = time();

        return $now;
    }
1247
1248
    /************************************
1249
     *
1250
     * URL reading
1251
     *
1252
     ************************************/
1253
1254
    /**
     * Read URL for single queue entry
     *
     * Loads the queue entry, marks it as being executed (exec_time), runs it through the queue
     * executor and stores the serialized result in result_data. Signals are emitted before the
     * execution and before the result is written.
     *
     * @param integer $queueId
     * @param boolean $force If set, will process even if exec_time has been set!
     * @return integer|null Bit mask (CLI_STATUS_POLLABLE_PROCESSED is set if a pollable instruction reported success); NULL if no matching queue entry was found
     */
    public function readUrl($queueId, $force = false)
    {
        $queryBuilder = GeneralUtility::makeInstance(ConnectionPool::class)->getQueryBuilderForTable($this->tableName);
        $ret = 0;
        $this->logger->debug('crawler-readurl start ' . microtime(true));

        // Get entry:
        $queryBuilder
            ->select('*')
            ->from('tx_crawler_queue')
            ->where(
                $queryBuilder->expr()->eq('qid', $queryBuilder->createNamedParameter($queueId, \PDO::PARAM_INT))
            );
        if (!$force) {
            // Without force only entries that are not executed yet and have a process scheduled are read
            $queryBuilder->andWhere('exec_time = 0');
            $queryBuilder->andWhere('process_scheduled > 0');
        }
        $queueRec = $queryBuilder->execute()->fetch();

        if (!is_array($queueRec)) {
            return null;
        }

        SignalSlotUtility::emitSignal(
            __CLASS__,
            SignalSlotUtility::SIGNNAL_QUEUEITEM_PREPROCESS,
            [$queueId, &$queueRec]
        );

        // Set exec_time to lock record:
        $field_array = ['exec_time' => $this->getCurrentTime()];
        if (isset($this->processID)) {
            // If multiprocessing is used we need to store the id of the process which has handled this entry
            $field_array['process_id_completed'] = $this->processID;
        }

        $lockConnection = GeneralUtility::makeInstance(ConnectionPool::class)->getConnectionForTable('tx_crawler_queue');
        $lockConnection->update('tx_crawler_queue', $field_array, ['qid' => (int)$queueId]);

        $result = $this->queueExecutor->executeQueueItem($queueRec, $this);
        $resultData = unserialize($result['content']);

        // atm there's no need to point to specific pollable extensions
        $pollables = $GLOBALS['TYPO3_CONF_VARS']['EXTCONF']['crawler']['pollSuccess'];
        if (is_array($pollables)) {
            foreach ($pollables as $pollable) {
                // Only check the success value if the instruction is running;
                // it is important to name the pollSuccess key same as the procInstructions key
                $procInstructions = $resultData['parameters']['procInstructions'];
                if (!is_array($procInstructions) || !in_array($pollable, $procInstructions)) {
                    continue;
                }
                if (!empty($resultData['success'][$pollable])) {
                    $ret |= self::CLI_STATUS_POLLABLE_PROCESSED;
                }
            }
        }

        // Set result in log which also denotes the end of the processing of this entry.
        $field_array = ['result_data' => serialize($result)];

        SignalSlotUtility::emitSignal(
            __CLASS__,
            SignalSlotUtility::SIGNNAL_QUEUEITEM_POSTPROCESS,
            [$queueId, &$field_array]
        );

        $resultConnection = GeneralUtility::makeInstance(ConnectionPool::class)->getConnectionForTable('tx_crawler_queue');
        $resultConnection->update('tx_crawler_queue', $field_array, ['qid' => (int)$queueId]);

        $this->logger->debug('crawler-readurl stop ' . microtime(true));

        return $ret;
    }
1344
1345
    /**
     * Read URL for not-yet-inserted log-entry
     *
     * Inserts the entry into tx_crawler_queue with exec_time set, executes it and stores the
     * serialized result in result_data of the inserted row.
     *
     * @param array $field_array Queue field array,
     *
     * @return mixed Whatever the queue executor returns for the item (NOTE(review): previously documented as string - confirm against QueueExecutor::executeQueueItem())
     */
    public function readUrlFromArray($field_array)
    {
        // Set exec_time to lock record:
        $field_array['exec_time'] = $this->getCurrentTime();

        $connectionForCrawlerQueue = GeneralUtility::makeInstance(ConnectionPool::class)->getConnectionForTable('tx_crawler_queue');
        $connectionForCrawlerQueue->insert('tx_crawler_queue', $field_array);

        // The executor receives the entry including its new queue id
        $queueId = $connectionForCrawlerQueue->lastInsertId('tx_crawler_queue', 'qid');
        $field_array['qid'] = $queueId;

        $result = $this->queueExecutor->executeQueueItem($field_array, $this);

        // Set result in log which also denotes the end of the processing of this entry.
        $resultFields = ['result_data' => serialize($result)];

        // Passed by reference, so slots may adjust what gets written
        SignalSlotUtility::emitSignal(
            __CLASS__,
            SignalSlotUtility::SIGNNAL_QUEUEITEM_POSTPROCESS,
            [$queueId, &$resultFields]
        );

        $connectionForCrawlerQueue->update('tx_crawler_queue', $resultFields, ['qid' => $queueId]);

        return $result;
    }
1382
1383
    /*****************************
1384
     *
1385
     * Compiling URLs to crawl - tools
1386
     *
1387
     *****************************/
1388
1389
    /**
     * Walks the page tree starting at $id and renders the crawler URL rows for every page found.
     *
     * The arguments are stored on the instance first because drawURLs_addRowsForPage() reads them
     * from there. For mount point pages the tree of the mounted page is rendered as well, with
     * $this->MP set to "<mount_pid>-<uid of the mount point page>".
     *
     * @param integer $id Root page id to start from.
     * @param integer $depth Depth of tree, 0=only id-page, 1= on sublevel, 99 = infinite
     * @param integer $scheduledTime Unix Time when the URL is timed to be visited when put in queue
     * @param integer $reqMinute Number of requests per minute (creates the interleave between requests)
     * @param boolean $submitCrawlUrls If set, submits the URLs to queue in database (real crawling)
     * @param boolean $downloadCrawlUrls If set (and submitcrawlUrls is false) will fill $downloadUrls with entries)
     * @param array $incomingProcInstructions Array of processing instructions
     * @param array $configurationSelection Array of configuration keys
     * @return string Concatenated HTML rows as returned by drawURLs_addRowsForPage()
     */
    public function getPageTreeAndUrls(
        $id,
        $depth,
        $scheduledTime,
        $reqMinute,
        $submitCrawlUrls,
        $downloadCrawlUrls,
        array $incomingProcInstructions,
        array $configurationSelection
    ) {
        $this->scheduledTime = $scheduledTime;
        $this->reqMinute = $reqMinute;
        $this->submitCrawlUrls = $submitCrawlUrls;
        $this->downloadCrawlUrls = $downloadCrawlUrls;
        $this->incomingProcInstructions = $incomingProcInstructions;
        $this->incomingConfigurationSelection = $configurationSelection;

        $this->duplicateTrack = [];
        $this->downloadUrls = [];

        // Drawing tree:
        /* @var PageTreeView $tree */
        $tree = GeneralUtility::makeInstance(PageTreeView::class);
        $perms_clause = $GLOBALS['BE_USER']->getPagePermsClause(1);
        $tree->init('AND ' . $perms_clause);

        $pageInfo = BackendUtility::readPageAccess($id, $perms_clause);
        if (is_array($pageInfo)) {
            // Set root row:
            $tree->tree[] = [
                'row' => $pageInfo,
                'HTML' => IconUtility::getIconForRecord('pages', $pageInfo)
            ];
        }

        // Get branch beneath:
        if ($depth) {
            $tree->getTree($id, $depth, '');
        }

        // Traverse page tree:
        $code = '';

        foreach ($tree->tree as $data) {
            $this->MP = false;

            // recognize mount points
            if ($data['row']['doktype'] == PageRepository::DOKTYPE_MOUNTPOINT) {
                // The row is read from "pages", so the query builder has to be the one for that table.
                $queryBuilder = GeneralUtility::makeInstance(ConnectionPool::class)->getQueryBuilderForTable('pages');
                $queryBuilder->getRestrictions()->removeAll()->add(GeneralUtility::makeInstance(DeletedRestriction::class));
                $mountpage = $queryBuilder
                    ->select('*')
                    ->from('pages')
                    ->where(
                        $queryBuilder->expr()->eq('uid', $queryBuilder->createNamedParameter($data['row']['uid'], \PDO::PARAM_INT))
                    )
                    ->execute()
                    ->fetch();

                // Without a row there is nothing to mount; the page is then rendered like a regular page.
                if (is_array($mountpage)) {
                    // fetch mounted pages
                    // NOTE: $MP is used as "false or string" here ("<mount_pid>-<uid>" while a mount is active).
                    $this->MP = $mountpage['mount_pid'] . '-' . $data['row']['uid'];

                    $mountTree = GeneralUtility::makeInstance(PageTreeView::class);
                    $mountTree->init('AND ' . $perms_clause);
                    $mountTree->getTree($mountpage['mount_pid'], $depth);

                    foreach ($mountTree->tree as $mountData) {
                        $code .= $this->drawURLs_addRowsForPage(
                            $mountData['row'],
                            $mountData['HTML'] . BackendUtility::getRecordTitle('pages', $mountData['row'], true)
                        );
                    }

                    // replace page when mount_pid_ol is enabled
                    if ($mountpage['mount_pid_ol']) {
                        $data['row']['uid'] = $mountpage['mount_pid'];
                    } else {
                        // if the mount_pid_ol is not set the MP must not be used for the mountpoint page
                        $this->MP = false;
                    }
                }
            }

            $code .= $this->drawURLs_addRowsForPage(
                $data['row'],
                $data['HTML'] . BackendUtility::getRecordTitle('pages', $data['row'], true)
            );
        }

        return $code;
    }
1491
1492
    /**
     * Expands an exclude string into the list of page ids it covers.
     *
     * The string is a comma separated list of parts. A part "pid" covers only that page,
     * "pid+depth" additionally covers the subtree down to the given depth and "pid+"
     * (plus sign without depth) uses a depth of 99. Results and the trees fetched for them
     * are memoized in function-static caches.
     *
     * @param string $excludeString Exclude string
     * @return array Unique page ids; an empty array for an empty exclude string
     */
    public function expandExcludeString($excludeString)
    {
        // internal static caches;
        static $expandedExcludeStringCache = [];
        static $treeCache = [];

        // isset() instead of empty(): an empty result is a valid cache entry, too.
        $cacheKey = (string)$excludeString;
        if (isset($expandedExcludeStringCache[$cacheKey])) {
            return $expandedExcludeStringCache[$cacheKey];
        }

        $pidList = [];

        if (!empty($excludeString)) {
            /** @var PageTreeView $tree */
            $tree = GeneralUtility::makeInstance(PageTreeView::class);
            $tree->init('AND ' . $this->backendUser->getPagePermsClause(1));

            foreach (GeneralUtility::trimExplode(',', $excludeString) as $excludePart) {
                // A part without "+" has no depth element; pad so the destructuring never reads a missing offset.
                [$pid, $depth] = array_pad(GeneralUtility::trimExplode('+', $excludePart), 2, '');

                // default is "page only" = "depth=0"
                if (empty($depth)) {
                    $depth = (stristr($excludePart, '+')) ? 99 : 0;
                }

                $pidList[] = $pid;

                if ($depth > 0) {
                    // A page without subpages yields an empty tree, which still counts as cached.
                    if (!isset($treeCache[$pid][$depth])) {
                        $tree->reset();
                        $tree->getTree($pid, $depth);
                        $treeCache[$pid][$depth] = $tree->tree;
                    }

                    foreach ($treeCache[$pid][$depth] as $data) {
                        $pidList[] = $data['row']['uid'];
                    }
                }
            }
        }

        $expandedExcludeStringCache[$cacheKey] = array_unique($pidList);

        return $expandedExcludeStringCache[$cacheKey];
    }
1543
1544
    /**
     * Create the rows for display of the page tree
     * For each page a number of rows are shown displaying GET variable configuration
     *
     * Configurations that are not part of $this->incomingConfigurationSelection (when that selection is
     * not empty) are dropped before rendering. Optional configuration keys are read defensively, so a
     * configuration without them does not raise "undefined index" notices.
     *
     * @param    array     $pageRow             Page row
     * @param    string    $pageTitleAndIcon    Page icon and title for row
     * @return   string    HTML <tr> content (one or more)
     */
    public function drawURLs_addRowsForPage(array $pageRow, $pageTitleAndIcon)
    {
        $skipMessage = '';

        // Get list of configurations
        $configurations = $this->getUrlsForPageRow($pageRow, $skipMessage);

        if (!empty($this->incomingConfigurationSelection)) {
            // remove configuration that does not match the current selection
            foreach ($configurations as $confKey => $confArray) {
                if (!in_array($confKey, $this->incomingConfigurationSelection)) {
                    unset($configurations[$confKey]);
                }
            }
        }

        // Traverse parameter combinations:
        $c = 0;
        $content = '';
        if (!empty($configurations)) {
            foreach ($configurations as $confKey => $confArray) {
                $subCfg = $confArray['subCfg'] ?? [];

                // Title column: only the first row carries it, spanning all configuration rows.
                if (!$c) {
                    $titleClm = '<td rowspan="' . count($configurations) . '">' . $pageTitleAndIcon . '</td>';
                } else {
                    $titleClm = '';
                }

                if (!in_array($pageRow['uid'], $this->expandExcludeString($subCfg['exclude'] ?? ''))) {

                    // URL list:
                    $urlList = $this->urlListFromUrlArray(
                        $confArray,
                        $pageRow,
                        $this->scheduledTime,
                        $this->reqMinute,
                        $this->submitCrawlUrls,
                        $this->downloadCrawlUrls,
                        $this->duplicateTrack,
                        $this->downloadUrls,
                        $this->incomingProcInstructions // if empty the urls won't be filtered by processing instructions
                    );

                    // Expanded parameters:
                    $paramExpanded = '';
                    $calcAccu = [];
                    $calcRes = 1;
                    foreach ($confArray['paramExpanded'] ?? [] as $gVar => $gVal) {
                        $paramExpanded .= '
                            <tr>
                                <td class="bgColor4-20">' . htmlspecialchars('&' . $gVar . '=') . '<br/>' .
                                                '(' . count($gVal) . ')' .
                                                '</td>
                                <td class="bgColor4" nowrap="nowrap">' . nl2br(htmlspecialchars(implode(chr(10), $gVal))) . '</td>
                            </tr>
                        ';
                        $calcRes *= count($gVal);
                        $calcAccu[] = count($gVal);
                    }
                    $paramExpanded = '<table class="lrPadding c-list param-expanded">' . $paramExpanded . '</table>';
                    $paramExpanded .= 'Comb: ' . implode('*', $calcAccu) . '=' . $calcRes;

                    // Options: configuration values are escaped like every other cell that prints configuration data.
                    $optionValues = '';
                    if (!empty($subCfg['userGroups'])) {
                        $optionValues .= 'User Groups: ' . htmlspecialchars((string)$subCfg['userGroups']) . '<br/>';
                    }
                    if (!empty($subCfg['procInstrFilter'])) {
                        $optionValues .= 'ProcInstr: ' . htmlspecialchars((string)$subCfg['procInstrFilter']) . '<br/>';
                    }

                    // Compile row:
                    $content .= '
                        <tr class="bgColor' . ($c % 2 ? '-20' : '-10') . '">
                            ' . $titleClm . '
                            <td>' . htmlspecialchars($confKey) . '</td>
                            <td>' . nl2br(htmlspecialchars(rawurldecode(trim(str_replace('&', chr(10) . '&', GeneralUtility::implodeArrayForUrl('', $confArray['paramParsed'] ?? [])))))) . '</td>
                            <td>' . $paramExpanded . '</td>
                            <td nowrap="nowrap">' . $urlList . '</td>
                            <td nowrap="nowrap">' . $optionValues . '</td>
                            <td nowrap="nowrap">' . DebugUtility::viewArray($subCfg['procInstrParams.'] ?? null) . '</td>
                        </tr>';
                } else {
                    $content .= '<tr class="bgColor' . ($c % 2 ? '-20' : '-10') . '">
                            ' . $titleClm . '
                            <td>' . htmlspecialchars($confKey) . '</td>
                            <td colspan="5"><em>No entries</em> (Page is excluded in this configuration)</td>
                        </tr>';
                }

                $c++;
            }
        } else {
            $message = !empty($skipMessage) ? ' (' . $skipMessage . ')' : '';

            // Compile row:
            $content .= '
                <tr class="bgColor-20" style="border-bottom: 1px solid black;">
                    <td>' . $pageTitleAndIcon . '</td>
                    <td colspan="6"><em>No entries</em>' . $message . '</td>
                </tr>';
        }

        return $content;
    }
1658
1659
    /*****************************
1660
     *
1661
     * CLI functions
1662
     *
1663
     *****************************/
1664
1665
    /**
     * Running the functionality of the CLI (crawling URLs from queue)
     *
     * Runs the registered CLI hooks, purges executed queue entries older than the configured
     * "purgeQueueDays", reserves up to $countInARun due entries for this process and then
     * executes them one by one via readUrl().
     *
     * @param int $countInARun Maximum number of queue entries to pick up in this run
     * @param int $sleepTime Pause after each processed entry in microseconds (passed to usleep())
     * @param int $sleepAfterFinish Pause after the batch in seconds (passed to sleep())
     * @return int Bitmask combining the readUrl() results with the self::CLI_STATUS_* flags
     */
    public function CLI_run($countInARun, $sleepTime, $sleepAfterFinish)
    {
        $result = 0;
        $counter = 0;

        // First, run hooks:
        $this->CLI_runHooks();

        // Clean up the queue
        $purgeQueueDays = intval($this->extensionSettings['purgeQueueDays'] ?? 0);
        if ($purgeQueueDays > 0) {
            $purgeDate = $this->getCurrentTime() - 24 * 60 * 60 * $purgeQueueDays;

            $queryBuilderDelete = GeneralUtility::makeInstance(ConnectionPool::class)->getQueryBuilderForTable($this->tableName);
            $del = $queryBuilderDelete
                ->delete($this->tableName)
                ->where(
                    'exec_time != 0 AND exec_time < ' . $purgeDate
                )->execute();

            if (false === $del) {
                $this->logger->info(
                    'Records could not be deleted.'
                );
            }
        }

        // Select entries:
        //TODO Shouldn't this reside within the transaction?
        $queryBuilderSelect = GeneralUtility::makeInstance(ConnectionPool::class)->getQueryBuilderForTable($this->tableName);
        $rows = $queryBuilderSelect
            ->select('qid', 'scheduled')
            ->from('tx_crawler_queue')
            ->where(
                $queryBuilderSelect->expr()->eq('exec_time', 0),
                $queryBuilderSelect->expr()->eq('process_scheduled', 0),
                $queryBuilderSelect->expr()->lte('scheduled', $this->getCurrentTime())
            )
            ->orderBy('scheduled')
            ->addOrderBy('qid')
            ->setMaxResults($countInARun)
            ->execute()
            ->fetchAll();

        if (!empty($rows)) {
            $quidList = array_column($rows, 'qid');

            $processId = $this->CLI_buildProcessId();

            //reserve queue entries for process
            //TODO make sure we're not taking assigned queue-entires (no transaction is used here)

            //save the number of assigned queue entrys to determine who many have been processed later
            $queryBuilderUpdate = GeneralUtility::makeInstance(ConnectionPool::class)->getQueryBuilderForTable($this->tableName);
            $numberOfAffectedRows = $queryBuilderUpdate
                ->update('tx_crawler_queue')
                ->where(
                    $queryBuilderUpdate->expr()->in('qid', $quidList)
                )
                ->set('process_scheduled', $this->getCurrentTime())
                ->set('process_id', $processId)
                ->execute();

            GeneralUtility::makeInstance(ConnectionPool::class)->getConnectionForTable('tx_crawler_process')
                ->update(
                    'tx_crawler_process',
                    [ 'assigned_items_count' => (int)$numberOfAffectedRows ],
                    [ 'process_id' => $processId ]
                );

            // Fewer reserved rows than selected rows: the reservation did not cover all selected entries.
            if ($numberOfAffectedRows != count($quidList)) {
                $this->CLI_debug("Nothing processed due to multi-process collision (" . $this->CLI_buildProcessId() . ")");
                return ($result | self::CLI_STATUS_ABORTED);
            }

            foreach ($rows as $r) {
                $result |= $this->readUrl($r['qid']);

                $counter++;
                usleep(intval($sleepTime)); // Just to relax the system

                // if during the start and the current read url the cli has been disable we need to return from the function
                // mark the process NOT as ended.
                if ($this->getDisabled()) {
                    return ($result | self::CLI_STATUS_ABORTED);
                }

                if (!$this->CLI_checkIfProcessIsActive($this->CLI_buildProcessId())) {
                    $this->CLI_debug("conflict / timeout (" . $this->CLI_buildProcessId() . ")");

                    //TODO might need an additional returncode
                    $result |= self::CLI_STATUS_ABORTED;
                    break; //possible timeout
                }
            }

            sleep(intval($sleepAfterFinish));

            $msg = 'Rows: ' . $counter;
            $this->CLI_debug($msg . " (" . $this->CLI_buildProcessId() . ")");
        } else {
            $this->CLI_debug("Nothing within queue which needs to be processed (" . $this->CLI_buildProcessId() . ")");
        }

        if ($counter > 0) {
            $result |= self::CLI_STATUS_PROCESSED;
        }

        return $result;
    }
1791
1792
    /**
     * Instantiates every class registered in
     * $GLOBALS['TYPO3_CONF_VARS']['EXTCONF']['crawler']['cli_hooks'] and calls its
     * crawler_init() method with this controller as argument.
     *
     * @return void
     */
    public function CLI_runHooks()
    {
        $registeredHooks = $GLOBALS['TYPO3_CONF_VARS']['EXTCONF']['crawler']['cli_hooks'] ?? [];

        foreach ($registeredHooks as $hookClassName) {
            $hook = GeneralUtility::makeInstance($hookClassName);
            if (!is_object($hook)) {
                continue;
            }

            $hook->crawler_init($this);
        }
    }
1806
1807
    /**
     * Tries to register a new process with the given id.
     *
     * Active processes whose ttl lies in the past are treated as orphans: they do not count
     * against the process limit and are handed to CLI_releaseProcesses() at the end. Processes
     * already marked as deleted are removed through the process repository.
     *
     * @todo preemption might not be the most elegant way to clean up
     *
     * @param string $id identification string for the process
     * @return boolean true when a process record was inserted, false when the process limit is
     *                 reached or no system process id could be determined
     */
    public function CLI_checkAndAcquireNewProcess($id)
    {
        $queryBuilder = GeneralUtility::makeInstance(ConnectionPool::class)->getQueryBuilderForTable($this->tableName);

        $systemProcessId = getmypid();
        if ($systemProcessId < 1) {
            return false;
        }

        $activeProcesses = $queryBuilder
            ->select('process_id', 'ttl')
            ->from('tx_crawler_process')
            ->where('active = 1 AND deleted = 0')
            ->execute();

        $currentTime = $this->getCurrentTime();

        // Split the active processes into expired ones (orphans) and ones that still count.
        $runningProcesses = 0;
        $orphanProcesses = [];
        while ($process = $activeProcesses->fetch()) {
            if ($process['ttl'] < $currentTime) {
                $orphanProcesses[] = $process['process_id'];
                continue;
            }
            $runningProcesses++;
        }

        // if there are less than allowed active processes then add a new one
        $processLimit = intval($this->extensionSettings['processLimit']);
        $acquired = $runningProcesses < $processLimit;

        if ($acquired) {
            $this->CLI_debug("add process " . $this->CLI_buildProcessId() . " (" . ($runningProcesses + 1) . "/" . $processLimit . ")");

            $processRecord = [
                'process_id' => $id,
                'active' => 1,
                'ttl' => $currentTime + (int)$this->extensionSettings['processMaxRunTime'],
                'system_process_id' => $systemProcessId
            ];
            GeneralUtility::makeInstance(ConnectionPool::class)
                ->getConnectionForTable('tx_crawler_process')
                ->insert('tx_crawler_process', $processRecord);
        } else {
            $this->CLI_debug("Processlimit reached (" . ($runningProcesses) . "/" . $processLimit . ")");
        }

        $this->processRepository->deleteProcessesMarkedAsDeleted();
        // maybe this should be somehow included into the current lock
        $this->CLI_releaseProcesses($orphanProcesses, true);

        return $acquired;
    }
1871
1872
    /**
     * Release a process and the required resources
     *
     * Issues four UPDATE statements: queue entries of inactive processes are detached, inactive
     * processes that still have unprocessed queue entries get their system_process_id reset, the
     * requested processes are marked as non-active and finally their unprocessed queue entries
     * are detached.
     *
     * @param  mixed    $releaseIds   string with a single process-id or array with multiple process-ids
     * @param  boolean  $withinLock   show whether the DB-actions are included within an existing lock
     *                                (currently without effect, no transaction is opened here)
     * @return boolean  false when there was nothing to release, true otherwise
     */
    public function CLI_releaseProcesses($releaseIds, $withinLock = false)
    {
        $connectionPool = GeneralUtility::makeInstance(ConnectionPool::class);

        $releaseIds = is_array($releaseIds) ? $releaseIds : [$releaseIds];
        if (empty($releaseIds)) {
            return false;   //nothing to release
        }

        // some kind of 2nd chance algo - this way you need at least 2 processes to have a real cleanup
        // this ensures that a single process can't mess up the entire process table

        // detach all queue entries that belong to processes which are not active
        $connectionPool->getQueryBuilderForTable($this->tableName)
            ->update('tx_crawler_queue', 'q')
            ->where('q.process_id IN(SELECT p.process_id FROM tx_crawler_process as p WHERE p.active = 0)')
            ->set('q.process_scheduled', 0)
            ->set('q.process_id', '')
            ->execute();

        // FIXME: Not entirely sure that this is equivalent to the previous version, which updated
        // tx_crawler_process rows with active=0 AND deleted=0 that had NO queue entry with exec_time = 0
        // and set deleted = 1 together with system_process_id = 0.
        $inactiveProcessQuery = $connectionPool->getQueryBuilderForTable($this->tableName);
        $inactiveProcessQuery
            ->update('tx_crawler_process')
            ->where(
                $inactiveProcessQuery->expr()->eq('active', 0),
                'process_id IN(SELECT q.process_id FROM tx_crawler_queue as q WHERE q.exec_time = 0)'
            )
            ->set('system_process_id', 0)
            ->execute();

        // mark all requested processes as non-active
        // NOTE(review): system_process_id is reset here as well. That mirrors the statement the shared
        // query builder used to send (its SET part was not reset before this update) - confirm whether
        // the reset is intended.
        $deactivateQuery = $connectionPool->getQueryBuilderForTable($this->tableName);
        $deactivateQuery
            ->update('tx_crawler_process')
            ->where(
                'NOT EXISTS (
                SELECT * FROM tx_crawler_queue
                    WHERE tx_crawler_queue.process_id = tx_crawler_process.process_id
                    AND tx_crawler_queue.exec_time = 0
                )',
                $deactivateQuery->expr()->in('process_id', $deactivateQuery->createNamedParameter($releaseIds, Connection::PARAM_STR_ARRAY)),
                $deactivateQuery->expr()->eq('deleted', 0)
            )
            ->set('system_process_id', 0)
            ->set('active', 0)
            ->execute();

        // detach the not yet executed queue entries of the requested processes
        $detachQuery = $connectionPool->getQueryBuilderForTable($this->tableName);
        $detachQuery
            ->update('tx_crawler_queue')
            ->where(
                $detachQuery->expr()->eq('exec_time', 0),
                $detachQuery->expr()->in('process_id', $detachQuery->createNamedParameter($releaseIds, Connection::PARAM_STR_ARRAY))
            )
            ->set('process_scheduled', 0)
            ->set('process_id', '')
            ->execute();

        return true;
    }
1966
1967
    /**
     * Check if there are still resources left for the process with the given id
     * Used to determine timeouts and to ensure a proper cleanup if there's a timeout
     *
     * The process id is compared as a string. Process ids are built with shortMD5(), so casting
     * them to an integer would make the comparison match unrelated rows.
     *
     * @param  string  $pid identification string for the process
     * @return boolean determines if the process is still active / has resources
     *
     * TODO: Please consider moving this to Domain Model for Process or in ProcessRepository
     */
    public function CLI_checkIfProcessIsActive($pid)
    {
        $queryBuilder = GeneralUtility::makeInstance(ConnectionPool::class)->getQueryBuilderForTable($this->tableName);

        $row = $queryBuilder
            ->select('active')
            ->from('tx_crawler_process')
            ->where(
                $queryBuilder->expr()->eq('process_id', $queryBuilder->createNamedParameter((string)$pid))
            )
            ->orderBy('ttl')
            ->execute()
            ->fetch();

        // No matching row means the process is unknown and therefore not active.
        return is_array($row) && (int)$row['active'] === 1;
    }
1996
1997
    /**
     * Returns the id of the current process; it is generated on first use from the current
     * microtime and kept in $this->processID afterwards.
     *
     * @return string  the ID
     */
    public function CLI_buildProcessId()
    {
        if ($this->processID) {
            return $this->processID;
        }

        $this->processID = GeneralUtility::shortMD5($this->microtime(true));

        return $this->processID;
    }
2009
2010
    /**
     * Thin wrapper around PHP's native microtime().
     *
     * @param bool $get_as_float Passed through to microtime()
     *
     * @return mixed The value returned by microtime()
     */
    protected function microtime($get_as_float = false)
    {
        $currentMicrotime = microtime($get_as_float);

        return $currentMicrotime;
    }
2019
2020
    /**
     * Echoes a message followed by a newline and flushes the output, but only when the
     * extension setting "processDebug" evaluates to a non-zero integer.
     *
     * @param  string $msg  the message
     */
    public function CLI_debug($msg)
    {
        $debugEnabled = intval($this->extensionSettings['processDebug']);
        if (!$debugEnabled) {
            return;
        }

        echo $msg . "\n";
        flush();
    }
2032
2033
    /**
     * Cleans up entries that stayed for too long in the queue. These are:
     * - processed entries older than the "cleanUpProcessedAge" extension setting (in days)
     * - scheduled entries older than the "cleanUpScheduledAge" extension setting (in days)
     *
     * The resulting SQL condition is handed to flushQueue().
     *
     * @return void
     */
    public function cleanUpOldQueueEntries()
    {
        $secondsPerDay = 86400; // 24*60*60 Seconds in 24 hours
        $now = time();

        $processedBefore = $now - $this->extensionSettings['cleanUpProcessedAge'] * $secondsPerDay;
        $scheduledBefore = $now - $this->extensionSettings['cleanUpScheduledAge'] * $secondsPerDay;

        $this->flushQueue(
            '(exec_time<>0 AND exec_time<' . $processedBefore . ') OR scheduled<=' . $scheduledBefore
        );
    }
2049
2050
    /**
     * Builds an md5 hash over the serialized configuration, ignoring the
     * "paramExpanded" and "URLs" entries.
     *
     * @param array $configuration
     *
     * @return string
     */
    protected function getConfigurationHash(array $configuration)
    {
        $ignoredKeys = ['paramExpanded' => true, 'URLs' => true];
        $hashedPart = array_diff_key($configuration, $ignoredKeys);

        return md5(serialize($hashedPart));
    }
2063
2064
    /**
2065
     * Build a URL from a Page and the Query String. If the page has a Site configuration, it can be built by using
2066
     * the Site instance.
2067
     *
2068
     * @param int $pageId
2069
     * @param string $queryString
2070
     * @param string|null $alternativeBaseUrl
2071
     * @param int $httpsOrHttp see tx_crawler_configuration.force_ssl
2072
     * @return UriInterface
2073
     * @throws \TYPO3\CMS\Core\Exception\SiteNotFoundException
2074
     * @throws \TYPO3\CMS\Core\Routing\InvalidRouteArgumentsException
2075
     */
2076 2
    protected function getUrlFromPageAndQueryParameters(int $pageId, string $queryString, ?string $alternativeBaseUrl, int $httpsOrHttp): UriInterface
2077
    {
2078 2
        $site = GeneralUtility::makeInstance(SiteMatcher::class)->matchByPageId((int)$pageId);
2079 2
        if ($site instanceof Site) {
0 ignored issues
show
Bug introduced by
The class TYPO3\CMS\Core\Site\Entity\Site does not exist. Did you forget a USE statement, or did you not list all dependencies?

This error could be the result of:

1. Missing dependencies

PHP Analyzer uses your composer.json file (if available) to determine the dependencies of your project and to determine all the available classes and functions. It expects the composer.json to be in the root folder of your repository.

Are you sure this class is defined by one of your dependencies, or did you maybe not list a dependency in either the require or require-dev section?

2. Missing use statement

PHP does not complain about undefined classes in instanceof checks. For example, the following PHP code will work perfectly fine:

if ($x instanceof DoesNotExist) {
    // Do something.
}

If you have not tested against this specific condition, such errors might go unnoticed.

Loading history...
2080
            $queryString = ltrim($queryString, '?&');
2081
            $queryParts = [];
2082
            parse_str($queryString, $queryParts);
2083
            unset($queryParts['id']);
2084
            // workaround as long as we don't have native language support in crawler configurations
2085
            if (isset($queryParts['L'])) {
2086
                $queryParts['_language'] = $queryParts['L'];
2087
                unset($queryParts['L']);
2088
                $siteLanguage = $site->getLanguageById((int)$queryParts['_language']);
0 ignored issues
show
Unused Code introduced by
$siteLanguage is not used, you could remove the assignment.

This check looks for variable assignments that are either overwritten by other assignments or where the variable is not used subsequently.

$myVar = 'Value';
$higher = false;

if (rand(1, 6) > 3) {
    $higher = true;
} else {
    $higher = false;
}

Both the $myVar assignment in line 1 and the $higher assignment in line 2 are dead. The first because $myVar is never used and the second because $higher is always overwritten for every possible time line.

Loading history...
2089
            } else {
2090
                $siteLanguage = $site->getDefaultLanguage();
0 ignored issues
show
Unused Code introduced by
$siteLanguage is not used, you could remove the assignment.

This check looks for variable assignements that are either overwritten by other assignments or where the variable is not used subsequently.

$myVar = 'Value';
$higher = false;

if (rand(1, 6) > 3) {
    $higher = true;
} else {
    $higher = false;
}

Both the $myVar assignment in line 1 and the $higher assignment in line 2 are dead. The first because $myVar is never used and the second because $higher is always overwritten for every possible time line.

Loading history...
2091
            }
2092
            $url = $site->getRouter()->generateUri($pageId, $queryParts);
2093
            if (!empty($alternativeBaseUrl)) {
2094
                $alternativeBaseUrl = new Uri($alternativeBaseUrl);
2095
                $url = $url->withHost($alternativeBaseUrl->getHost());
2096
                $url = $url->withScheme($alternativeBaseUrl->getScheme());
2097
                $url = $url->withPort($alternativeBaseUrl->getPort());
2098
            }
2099
        } else {
2100
            // Technically this is not possible with site handling, but kept for backwards-compatibility reasons
2101
            // Once EXT:crawler is v10-only compatible, this should be removed completely
2102 2
            $baseUrl = ($alternativeBaseUrl ?: GeneralUtility::getIndpEnv('TYPO3_SITE_URL'));
2103 2
            $cacheHashCalculator = GeneralUtility::makeInstance(CacheHashCalculator::class);
2104 2
            $queryString .= '&cHash=' . $cacheHashCalculator->generateForParameters($queryString);
2105 2
            $url = rtrim($baseUrl, '/') . '/index.php' . $queryString;
2106 2
            $url = new Uri($url);
2107
        }
2108
2109 2
        if ($httpsOrHttp === -1) {
2110
            $url = $url->withScheme('http');
2111 2
        } elseif ($httpsOrHttp === 1) {
2112
            $url = $url->withScheme('https');
2113
        }
2114
2115 2
        return $url;
2116
    }
2117
}
2118