Completed
Push — typo3v9 ( 5153b7...aa6d82 )
by Tomas Norre
20:31
created

CrawlerController   F

Complexity

Total Complexity 330

Size/Duplication

Total Lines 2740
Duplicated Lines 0 %

Coupling/Cohesion

Components 3
Dependencies 18

Test Coverage

Coverage 41.22%

Importance

Changes 0
Metric Value
dl 0
loc 2740
ccs 474
cts 1150
cp 0.4122
rs 0.8
c 0
b 0
f 0
wmc 330
lcom 3
cbo 18

62 Methods

Rating   Name   Duplication   Size   Complexity  
B getLogEntriesForPageId() 0 44 6
A cli_argValue() 0 4 2
A getAccessMode() 0 4 1
A setAccessMode() 0 4 1
A setDisabled() 0 10 3
A getDisabled() 0 4 1
A setProcessFilename() 0 4 1
A getProcessFilename() 0 4 1
A getLogger() 0 7 2
A __construct() 0 22 3
A setExtensionSettings() 0 4 1
F checkIfPageShouldBeSkipped() 0 57 16
A getUrlsForPageRow() 0 15 3
A noUnprocessedQueueEntriesForPageWithConfigurationHashExist() 0 22 2
F urlListFromUrlArray() 0 113 20
A drawURLs_PIfilter() 0 12 4
A getPageTSconfigForId() 0 22 4
F getUrlsForPageId() 0 142 26
A getBaseUrlForConfigurationRecord() 0 22 4
C getConfigurationsForBranch() 0 52 11
A getQueryBuilder() 0 6 1
A hasGroupAccess() 0 12 4
A parseParams() 0 16 3
F expandParameters() 0 123 25
B compileUrls() 0 25 7
B getLogEntriesForSetId() 0 44 6
A flushQueue() 0 34 5
A addQueueEntry_callBack() 0 20 3
B addUrl() 0 71 6
B getDuplicateRowsIfExist() 0 49 5
A getCurrentTime() 0 4 1
C readUrl() 0 104 13
A readUrlFromArray() 0 31 1
A readUrl_exec() 0 29 4
D requestUrl() 0 101 15
B getFrontendBasePath() 0 23 8
A executeShellCommand() 0 4 1
A getHttpResponseFromStream() 0 23 5
A log() 0 12 3
A buildRequestHeaderArray() 0 16 4
B getRequestUrlFrom302Header() 0 34 11
A fe_init() 0 26 4
C getPageTreeAndUrls() 0 97 8
B expandExcludeString() 0 45 9
D drawURLs_addRowsForPage() 0 109 15
A cli_echo() 0 4 1
B setCliArgs() 0 23 6
A getConfigurationKeys() 0 5 2
C CLI_run() 0 116 10
A CLI_runHooks() 0 12 4
B CLI_checkAndAcquireNewProcess() 0 58 5
B CLI_releaseProcesses() 0 87 5
A CLI_deleteProcessesMarkedDeleted() 0 8 1
A CLI_checkIfProcessIsActive() 0 20 2
A CLI_buildProcessId() 0 7 2
A microtime() 0 4 1
A CLI_debug() 0 7 2
A sendDirectRequest() 0 31 2
A cleanUpOldQueueEntries() 0 9 1
A initTSFE() 0 17 2
A getConfigurationHash() 0 6 1
A isCrawlingProtocolHttps() 0 13 4

How to fix   Complexity   

Complex Class

Complex classes like CrawlerController often do a lot of different things. To break such a class down, we need to identify a cohesive component within that class. A common approach to finding such a component is to look for fields/methods that share the same prefixes or suffixes. You can also have a look at the cohesion graph to spot any unconnected or weakly connected components.

Once you have determined the fields that belong together, you can apply the Extract Class refactoring. If the component makes sense as a sub-class, Extract Subclass is also a candidate, and is often faster.

While breaking up the class, it is a good idea to analyze how other classes use CrawlerController, and based on these observations, apply Extract Interface, too.

1
<?php
2
namespace AOE\Crawler\Controller;
3
4
/***************************************************************
5
 *  Copyright notice
6
 *
7
 *  (c) 2019 AOE GmbH <[email protected]>
8
 *
9
 *  All rights reserved
10
 *
11
 *  This script is part of the TYPO3 project. The TYPO3 project is
12
 *  free software; you can redistribute it and/or modify
13
 *  it under the terms of the GNU General Public License as published by
14
 *  the Free Software Foundation; either version 3 of the License, or
15
 *  (at your option) any later version.
16
 *
17
 *  The GNU General Public License can be found at
18
 *  http://www.gnu.org/copyleft/gpl.html.
19
 *
20
 *  This script is distributed in the hope that it will be useful,
21
 *  but WITHOUT ANY WARRANTY; without even the implied warranty of
22
 *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
23
 *  GNU General Public License for more details.
24
 *
25
 *  This copyright notice MUST APPEAR in all copies of the script!
26
 ***************************************************************/
27
28
use AOE\Crawler\Domain\Repository\ProcessRepository;
29
use AOE\Crawler\Domain\Repository\QueueRepository;
30
use AOE\Crawler\Event\EventDispatcher;
31
use AOE\Crawler\Utility\IconUtility;
32
use AOE\Crawler\Utility\SignalSlotUtility;
33
use TYPO3\CMS\Backend\Tree\View\PageTreeView;
34
use TYPO3\CMS\Backend\Utility\BackendUtility;
35
use TYPO3\CMS\Core\Authentication\BackendUserAuthentication;
36
use TYPO3\CMS\Core\Database\Connection;
37
use TYPO3\CMS\Core\Database\ConnectionPool;
38
use TYPO3\CMS\Core\Database\Query\Restriction\DeletedRestriction;
39
use TYPO3\CMS\Core\Database\Query\Restriction\EndTimeRestriction;
40
use TYPO3\CMS\Core\Database\Query\Restriction\StartTimeRestriction;
41
use TYPO3\CMS\Core\Log\Logger;
42
use TYPO3\CMS\Core\Log\LogLevel;
43
use TYPO3\CMS\Core\TimeTracker\TimeTracker;
44
use TYPO3\CMS\Core\TypoScript\Parser\TypoScriptParser;
45
use TYPO3\CMS\Core\Utility\DebugUtility;
46
use TYPO3\CMS\Core\Utility\ExtensionManagementUtility;
47
use TYPO3\CMS\Core\Utility\GeneralUtility;
48
use TYPO3\CMS\Core\Utility\MathUtility;
49
use TYPO3\CMS\Extbase\Object\ObjectManager;
50
use TYPO3\CMS\Frontend\Controller\TypoScriptFrontendController;
51
use TYPO3\CMS\Frontend\Page\PageRepository;
52
use TYPO3\CMS\Frontend\Utility\EidUtility;
53
use TYPO3\CMS\Lang\LanguageService;
54
55
/**
56
 * Class CrawlerController
57
 *
58
 * @package AOE\Crawler\Controller
59
 */
60
class CrawlerController
61
{
62
    const CLI_STATUS_NOTHING_PROCCESSED = 0;
63
    const CLI_STATUS_REMAIN = 1; //queue not empty
64
    const CLI_STATUS_PROCESSED = 2; //(some) queue items where processed
65
    const CLI_STATUS_ABORTED = 4; //instance didn't finish
66
    const CLI_STATUS_POLLABLE_PROCESSED = 8;
67
68
    /**
69
     * @var integer
70
     */
71
    public $setID = 0;
72
73
    /**
74
     * @var string
75
     */
76
    public $processID = '';
77
78
    /**
79
     * One hour is max stalled time for the CLI
80
     * If the process had the status "start" for 3600 seconds, it will be regarded stalled and a new process is started
81
     *
82
     * @var integer
83
     */
84
    public $max_CLI_exec_time = 3600;
85
86
    /**
87
     * @var array
88
     */
89
    public $duplicateTrack = [];
90
91
    /**
92
     * @var array
93
     */
94
    public $downloadUrls = [];
95
96
    /**
97
     * @var array
98
     */
99
    public $incomingProcInstructions = [];
100
101
    /**
102
     * @var array
103
     */
104
    public $incomingConfigurationSelection = [];
105
106
    /**
107
     * @var bool
108
     */
109
    public $registerQueueEntriesInternallyOnly = false;
110
111
    /**
112
     * @var array
113
     */
114
    public $queueEntries = [];
115
116
    /**
117
     * @var array
118
     */
119
    public $urlList = [];
120
121
    /**
122
     * @var boolean
123
     */
124
    public $debugMode = false;
125
126
    /**
127
     * @var array
128
     */
129
    public $extensionSettings = [];
130
131
    /**
132
     * Mount Point
133
     *
134
     * @var boolean
135
     */
136
    public $MP = false;
137
138
    /**
139
     * @var string
140
     */
141
    protected $processFilename;
142
143
    /**
144
     * Holds the internal access mode can be 'gui','cli' or 'cli_im'
145
     *
146
     * @var string
147
     */
148
    protected $accessMode;
149
150
    /**
151
     * @var BackendUserAuthentication
152
     */
153
    private $backendUser;
154
155
    /**
156
     * @var integer
157
     */
158
    private $scheduledTime = 0;
159
160
    /**
161
     * @var integer
162
     */
163
    private $reqMinute = 0;
164
165
    /**
166
     * @var bool
167
     */
168
    private $submitCrawlUrls = false;
169
170
    /**
171
     * @var bool
172
     */
173
    private $downloadCrawlUrls = false;
174
175
    /**
176
     * @var QueueRepository
177
     */
178
    protected $queueRepository;
179
180
    /**
181
     * @var ProcessRepository
182
     */
183
    protected $processRepository;
184
185
    /**
186
     * @var string
187
     */
188
    protected $tableName = 'tx_crawler_queue';
189
190
    /**
191
     * @var array
192
     */
193
    private $cliArgs;
194
195
    /**
196
     * @var Logger
197
     */
198
    private $logger;
199
200
    /**
     * Returns the access mode currently stored on this controller.
     *
     * @return string the internal access mode ('gui', 'cli' or 'cli_im')
     */
    public function getAccessMode()
    {
        return $this->accessMode;
    }
209
210
    /**
     * Stores the access mode on this controller.
     *
     * @param string $accessMode the internal access mode ('gui', 'cli' or 'cli_im')
     */
    public function setAccessMode($accessMode)
    {
        $this->accessMode = $accessMode;
    }
217
218
    /**
     * Switches the "disabled" marker on or off.
     *
     * The marker is the file at $this->processFilename: disabling writes an
     * empty file there, enabling removes the file if it exists.
     *
     * @param  bool $disabled (optional, defaults to true)
     * @return void
     */
    public function setDisabled($disabled = true)
    {
        if ($disabled) {
            GeneralUtility::writeFile($this->processFilename, '');

            return;
        }

        // Enabling: only remove the marker when it is actually there.
        if (is_file($this->processFilename)) {
            unlink($this->processFilename);
        }
    }
234
235
    /**
     * Reports the disabled status, i.e. whether the marker file at
     * $this->processFilename exists.
     *
     * @return bool true if disabled
     */
    public function getDisabled()
    {
        return is_file($this->processFilename);
    }
244
245
    /**
     * Sets the path of the file used as the "disabled" marker.
     *
     * @param string $filenameWithPath
     *
     * @return void
     */
    public function setProcessFilename($filenameWithPath)
    {
        $this->processFilename = $filenameWithPath;
    }
254
255
    /**
     * Returns the path of the file used as the "disabled" marker.
     *
     * @return string
     */
    public function getProcessFilename()
    {
        return $this->processFilename;
    }
262
263
    /**
     * Returns the logger for this class, creating it on first use.
     *
     * @return Logger
     */
    private function getLogger(): Logger
    {
        if ($this->logger === null) {
            $logManager = GeneralUtility::makeInstance(\TYPO3\CMS\Core\Log\LogManager::class);
            $this->logger = $logManager->getLogger(__CLASS__);
        }

        return $this->logger;
    }
273
274
    /************************************
275
     *
276
     * Getting URLs based on Page TSconfig
277
     *
278
     ************************************/
279
280 34
    /**
     * Sets up repositories, the backend user, the process marker file and the
     * extension settings.
     *
     * Missing globals or settings keys are tolerated: an absent or non-string
     * extension configuration results in empty settings, 'countInARun' falls
     * back to 100 and 'processLimit' is forced into the range 1..99 (default 1).
     */
    public function __construct()
    {
        $objectManager = GeneralUtility::makeInstance(ObjectManager::class);
        $this->queueRepository = $objectManager->get(QueueRepository::class);
        $this->processRepository = $objectManager->get(ProcessRepository::class);

        $this->backendUser = $GLOBALS['BE_USER'] ?? null;
        $this->processFilename = PATH_site . 'typo3temp/tx_crawler.proc';

        // The settings are only ever consumed as a plain array, so unserialize()
        // is not allowed to instantiate any classes.
        $serializedSettings = $GLOBALS['TYPO3_CONF_VARS']['EXT']['extConf']['crawler'] ?? null;
        $settings = is_string($serializedSettings)
            ? unserialize($serializedSettings, ['allowed_classes' => false])
            : [];
        $settings = is_array($settings) ? $settings : [];

        // read ext_em_conf_template settings and set
        $this->setExtensionSettings($settings);

        // set defaults:
        if (MathUtility::convertToPositiveInteger($this->extensionSettings['countInARun'] ?? 0) == 0) {
            $this->extensionSettings['countInARun'] = 100;
        }

        $this->extensionSettings['processLimit'] = MathUtility::forceIntegerInRange(
            $this->extensionSettings['processLimit'] ?? 1,
            1,
            99,
            1
        );
    }
302
303
    /**
     * Replaces the extension settings held by this controller
     * (the unserialized counterpart of $TYPO3_CONF_VARS['EXT']['extConf']['crawler']).
     *
     * @param array $extensionSettings
     * @return void
     */
    public function setExtensionSettings(array $extensionSettings)
    {
        $this->extensionSettings = $extensionSettings;
    }
313
314
    /**
     * Decides whether the given page must be left out of crawling.
     *
     * The checks run in this order and the first match wins:
     * hidden page (unless 'crawlHiddenPages' is enabled), built-in doktype
     * restrictions, configured 'excludeDoktype' lists, 'pageVeto' hooks.
     *
     * @param array $pageRow page record; reads 'hidden' and 'doktype'
     * @return false|string false if the page should be crawled, otherwise the message explaining why it is skipped
     */
    public function checkIfPageShouldBeSkipped(array $pageRow)
    {
        // hidden pages are only crawled when explicitly enabled
        if (!$this->extensionSettings['crawlHiddenPages'] && $pageRow['hidden']) {
            return 'Because page is hidden';
        }

        // doktypes 3 and 4 as well as everything from 199 upwards are never crawled
        if (GeneralUtility::inList('3,4', $pageRow['doktype']) || $pageRow['doktype'] >= 199) {
            return 'Because doktype is not allowed';
        }

        // doktype lists registered in the extension configuration
        if (is_array($GLOBALS['TYPO3_CONF_VARS']['EXTCONF']['crawler']['excludeDoktype'])) {
            foreach ($GLOBALS['TYPO3_CONF_VARS']['EXTCONF']['crawler']['excludeDoktype'] as $listName => $excludedDoktypes) {
                if (GeneralUtility::inList($excludedDoktypes, $pageRow['doktype'])) {
                    return 'Doktype was excluded by "' . $listName . '"';
                }
            }
        }

        // veto hooks: the first hook that does not return false stops the crawl of this page
        if (is_array($GLOBALS['TYPO3_CONF_VARS']['EXTCONF']['crawler']['pageVeto'])) {
            foreach ($GLOBALS['TYPO3_CONF_VARS']['EXTCONF']['crawler']['pageVeto'] as $hookName => $hookFunction) {
                $hookParameters = [
                    'pageRow' => $pageRow
                ];
                // expects "false" if page is ok and "true" or a skipMessage if this page should _not_ be crawled
                $veto = GeneralUtility::callUserFunction($hookFunction, $hookParameters, $this);
                if ($veto === false) {
                    continue;
                }

                return is_string($veto)
                    ? $veto
                    : 'Veto from hook "' . htmlspecialchars($hookName) . '"';
            }
        }

        return false;
    }
377
378
    /**
     * Wrapper method for getUrlsForPageId()
     * It returns an array of configurations and no urls!
     *
     * The page is first run through checkIfPageShouldBeSkipped(); a skipped
     * page yields an empty array and its skip reason in $skipMessage.
     *
     * @param array $pageRow Page record with at least dok-type and uid columns; 'url_scheme' is optional.
     * @param string $skipMessage receives the skip reason, or '' when the page is crawled
     * @return array
     * @see getUrlsForPageId()
     */
    public function getUrlsForPageRow(array $pageRow, &$skipMessage = '')
    {
        $message = $this->checkIfPageShouldBeSkipped($pageRow);

        if ($message !== false) {
            $skipMessage = $message;

            return [];
        }

        // The scheme may arrive as the string '2' (e.g. from a database row) or be
        // missing altogether, so compare the integer value instead of using a strict
        // check against the raw column.
        $forceSsl = (int)($pageRow['url_scheme'] ?? 0) === 2;
        $res = $this->getUrlsForPageId($pageRow['uid'], $forceSsl);
        $skipMessage = '';

        return $res;
    }
402
403
    /**
     * Tells whether the queue holds NO unprocessed entry (exec_time = 0) for the
     * given page id and configuration hash.
     * If there are none, the caller can skip an inner detail check.
     *
     * @param  int $uid page id
     * @param  string $configurationHash
     * @return boolean true when no matching unprocessed entry exists
     */
    protected function noUnprocessedQueueEntriesForPageWithConfigurationHashExist($uid, $configurationHash)
    {
        $queryBuilder = GeneralUtility::makeInstance(ConnectionPool::class)->getQueryBuilderForTable($this->tableName);

        $samePage = $queryBuilder->expr()->eq('page_id', intval($uid));
        $sameConfiguration = $queryBuilder->expr()->eq(
            'configuration_hash',
            $queryBuilder->createNamedParameter($configurationHash)
        );
        $notExecutedYet = $queryBuilder->expr()->eq('exec_time', 0);

        $unprocessedEntries = $queryBuilder
            ->count('*')
            ->from($this->tableName)
            ->where($samePage, $sameConfiguration, $notExecutedYet)
            ->execute()
            ->fetchColumn();

        return !$unprocessedEntries;
    }
434
435
    /**
     * Creates a list of URLs from input array (and submits them to queue if asked for)
     * See Web > Info module script + "indexed_search"'s crawler hook-client using this!
     *
     * @param array $vv Information about URLs from pageRow to crawl; the keys 'URLs' and 'subCfg' are read.
     * @param array $pageRow Page row; only 'uid' is read here.
     * @param integer $scheduledTime Unix time to schedule indexing to, typically time()
     * @param integer $reqMinute Number of requests per minute (creates the interleave between requests); a value of 0 or less disables the interleave
     * @param boolean $submitCrawlUrls If set, submits the URLs to queue
     * @param boolean $downloadCrawlUrls If set (and $submitCrawlUrls is false) will fill $downloadUrls with entries
     * @param array $duplicateTrack Array which is passed by reference and contains a key per url to ensure we will not crawl duplicates
     * @param array $downloadUrls Array which will be filled with URLs for download if flag is set.
     * @param array $incomingProcInstructions Array of processing instructions
     * @return string List of URLs (meant for display in backend module)
     */
    public function urlListFromUrlArray(
        array $vv,
        array $pageRow,
        $scheduledTime,
        $reqMinute,
        $submitCrawlUrls,
        $downloadCrawlUrls,
        array &$duplicateTrack,
        array &$downloadUrls,
        array $incomingProcInstructions
    ) {
        $urlList = '';

        // realurl support (thanks to Ingo Renner)
        // Evaluated once so that the encoder below is guaranteed to exist whenever it is used.
        $useRealUrl = ExtensionManagementUtility::isLoaded('realurl') && !empty($vv['subCfg']['realurl']);
        $urlObj = null;
        if ($useRealUrl) {
            /** @var tx_realurl $urlObj */
            $urlObj = GeneralUtility::makeInstance('tx_realurl');

            if (!empty($vv['subCfg']['baseUrl'])) {
                $urlParts = parse_url($vv['subCfg']['baseUrl']);
                $host = strtolower($urlParts['host']);
                $urlObj->host = $host;

                // First pass, finding configuration OR pointer string:
                $urlObj->extConf = isset($GLOBALS['TYPO3_CONF_VARS']['EXTCONF']['realurl'][$urlObj->host]) ? $GLOBALS['TYPO3_CONF_VARS']['EXTCONF']['realurl'][$urlObj->host] : $GLOBALS['TYPO3_CONF_VARS']['EXTCONF']['realurl']['_DEFAULT'];

                // If it turned out to be a string pointer, then look up the real config:
                if (is_string($urlObj->extConf)) {
                    $urlObj->extConf = is_array($GLOBALS['TYPO3_CONF_VARS']['EXTCONF']['realurl'][$urlObj->extConf]) ? $GLOBALS['TYPO3_CONF_VARS']['EXTCONF']['realurl'][$urlObj->extConf] : $GLOBALS['TYPO3_CONF_VARS']['EXTCONF']['realurl']['_DEFAULT'];
                }
            }

            if (!$GLOBALS['TSFE']->sys_page) {
                $GLOBALS['TSFE']->sys_page = GeneralUtility::makeInstance('TYPO3\CMS\Frontend\Page\PageRepository');
            }

            if (!$GLOBALS['TSFE']->tmpl->rootLine[0]['uid']) {
                $GLOBALS['TSFE']->tmpl->rootLine[0]['uid'] = $urlObj->extConf['pagePath']['rootpage_id'];
            }
        }

        if (!is_array($vv['URLs'])) {
            return 'ERROR - no URL generated';
        }

        $configurationHash = $this->getConfigurationHash($vv);
        $skipInnerCheck = $this->noUnprocessedQueueEntriesForPageWithConfigurationHashExist($pageRow['uid'], $configurationHash);

        // Seconds between two scheduled requests. A rate of 0 (or less) means
        // "no interleave" and must not end up as a division by zero.
        $secondsPerRequest = $reqMinute > 0 ? 60 / $reqMinute : 0;

        foreach ($vv['URLs'] as $urlQuery) {
            if (!$this->drawURLs_PIfilter($vv['subCfg']['procInstrFilter'], $incomingProcInstructions)) {
                continue;
            }

            // Calculate cHash:
            if ($vv['subCfg']['cHash']) {
                /* @var $cacheHash \TYPO3\CMS\Frontend\Page\CacheHashCalculator */
                $cacheHash = GeneralUtility::makeInstance('TYPO3\CMS\Frontend\Page\CacheHashCalculator');
                $urlQuery .= '&cHash=' . $cacheHash->generateForParameters($urlQuery);
            }

            // Create key by which to determine unique-ness:
            $uKey = $urlQuery . '|' . $vv['subCfg']['userGroups'] . '|' . $vv['subCfg']['baseUrl'] . '|' . $vv['subCfg']['procInstrFilter'];

            // realurl support (thanks to Ingo Renner)
            $urlQuery = 'index.php' . $urlQuery;
            if ($useRealUrl) {
                $params = [
                    'LD' => [
                        'totalURL' => $urlQuery
                    ],
                    'TCEmainHook' => true
                ];
                $urlObj->encodeSpURL($params);
                $urlQuery = $params['LD']['totalURL'];
            }

            // Scheduled time, rounded down to the full minute; cast to int because date() expects an integer timestamp
            $schTime = $scheduledTime + round(count($duplicateTrack) * $secondsPerRequest);
            $schTime = (int)(floor($schTime / 60) * 60);

            // NOTE(review): $urlList is assigned (not appended) in both branches below, so only
            // the entry of the last processed URL is returned — confirm this is intended.
            if (isset($duplicateTrack[$uKey])) {
                // if the url key is registered just display it and do not resubmit it
                $urlList = '<em><span class="typo3-dimmed">' . htmlspecialchars($urlQuery) . '</span></em><br/>';
            } else {
                $urlList = '[' . date('d.m.y H:i', $schTime) . '] ' . htmlspecialchars($urlQuery);
                $this->urlList[] = '[' . date('d.m.y H:i', $schTime) . '] ' . $urlQuery;

                $theUrl = ($vv['subCfg']['baseUrl'] ? $vv['subCfg']['baseUrl'] : GeneralUtility::getIndpEnv('TYPO3_SITE_URL')) . $urlQuery;

                // Submit for crawling!
                if ($submitCrawlUrls) {
                    $added = $this->addUrl(
                        $pageRow['uid'],
                        $theUrl,
                        $vv['subCfg'],
                        $scheduledTime,
                        $configurationHash,
                        $skipInnerCheck
                    );
                    if ($added === false) {
                        $urlList .= ' (Url already existed)';
                    }
                } elseif ($downloadCrawlUrls) {
                    $downloadUrls[$theUrl] = $theUrl;
                }

                $urlList .= '<br />';
            }
            $duplicateTrack[$uKey] = true;
        }

        return $urlList;
    }
564
565
    /**
     * Returns true if input processing instruction is among registered ones.
     *
     * An empty list of incoming processing instructions accepts everything.
     * Otherwise at least one incoming instruction has to be contained in the
     * comma separated list $piString.
     *
     * @param string $piString comma separated list of processing instructions to test against
     * @param array $incomingProcInstructions Processing instructions
     * @return boolean true on a match (or empty filter), false otherwise
     */
    public function drawURLs_PIfilter($piString, array $incomingProcInstructions)
    {
        if (empty($incomingProcInstructions)) {
            return true;
        }

        foreach ($incomingProcInstructions as $pi) {
            if (GeneralUtility::inList($piString, $pi)) {
                return true;
            }
        }

        // Explicit result instead of implicitly returning null when nothing matched.
        return false;
    }
584
585 2
    /**
     * Returns the page TSconfig for a page id.
     *
     * When a mount point ($this->MP) is set, the TSconfig is read for the id
     * that follows the '-' in the mount point value instead of $id.
     * Functions registered under
     * $GLOBALS['TYPO3_CONF_VARS']['EXTCONF']['crawler']['getPageTSconfigForId']
     * receive the configuration by reference and may alter it.
     *
     * @param integer $id Page ID
     * @return array the page TSconfig as delivered by BackendUtility::getPagesTSconfig(), after the hooks ran
     */
    public function getPageTSconfigForId($id)
    {
        $configurationPageId = $id;
        if ($this->MP) {
            // second segment of the '-' separated mount point value
            $mountPointParts = explode('-', $this->MP);
            $configurationPageId = $mountPointParts[1];
        }
        $pageTSconfig = BackendUtility::getPagesTSconfig($configurationPageId);

        // Call a hook to alter configuration
        $hooks = $GLOBALS['TYPO3_CONF_VARS']['EXTCONF']['crawler']['getPageTSconfigForId'];
        if (is_array($hooks)) {
            $params = [
                'pageId' => $id,
                'pageTSConfig' => &$pageTSconfig
            ];
            foreach ($hooks as $userFunc) {
                GeneralUtility::callUserFunction($userFunc, $params, $this);
            }
        }

        return $pageTSconfig;
    }
607
608
    /**
609
     * This methods returns an array of configurations.
610
     * And no urls!
611
     *
612
     * @param integer $id Page ID
613
     * @param bool $forceSsl Use https
614
     * @return array
615
     */
616 2
    public function getUrlsForPageId($id, $forceSsl = false)
617
    {
618
619
        /**
620
         * Get configuration from tsConfig
621
         */
622
623
        // Get page TSconfig for page ID:
624 2
        $pageTSconfig = $this->getPageTSconfigForId($id);
625
626 2
        $res = [];
627
628 2
        if (is_array($pageTSconfig) && is_array($pageTSconfig['tx_crawler.']['crawlerCfg.'])) {
629 1
            $crawlerCfg = $pageTSconfig['tx_crawler.']['crawlerCfg.'];
630
631 1
            if (is_array($crawlerCfg['paramSets.'])) {
632 1
                foreach ($crawlerCfg['paramSets.'] as $key => $values) {
633 1
                    if (is_array($values)) {
634 1
                        $key = str_replace('.', '', $key);
635
                        // Sub configuration for a single configuration string:
636 1
                        $subCfg = (array)$crawlerCfg['paramSets.'][$key . '.'];
637 1
                        $subCfg['key'] = $key;
638
639 1
                        if (strcmp($subCfg['procInstrFilter'], '')) {
640 1
                            $subCfg['procInstrFilter'] = implode(',', GeneralUtility::trimExplode(',', $subCfg['procInstrFilter']));
641
                        }
642 1
                        $pidOnlyList = implode(',', GeneralUtility::trimExplode(',', $subCfg['pidsOnly'], true));
643
644
                        // process configuration if it is not page-specific or if the specific page is the current page:
645 1
                        if (!strcmp($subCfg['pidsOnly'], '') || GeneralUtility::inList($pidOnlyList, $id)) {
646
647
                                // add trailing slash if not present
648 1
                            if (!empty($subCfg['baseUrl']) && substr($subCfg['baseUrl'], -1) != '/') {
649
                                $subCfg['baseUrl'] .= '/';
650
                            }
651
652
                            // Explode, process etc.:
653 1
                            $res[$key] = [];
654 1
                            $res[$key]['subCfg'] = $subCfg;
655 1
                            $res[$key]['paramParsed'] = $this->parseParams($crawlerCfg['paramSets.'][$key]);
656 1
                            $res[$key]['paramExpanded'] = $this->expandParameters($res[$key]['paramParsed'], $id);
657 1
                            $res[$key]['origin'] = 'pagets';
658
659
                            // recognize MP value
660 1
                            if (!$this->MP) {
661 1
                                $res[$key]['URLs'] = $this->compileUrls($res[$key]['paramExpanded'], ['?id=' . $id]);
662
                            } else {
663
                                $res[$key]['URLs'] = $this->compileUrls($res[$key]['paramExpanded'], ['?id=' . $id . '&MP=' . $this->MP]);
664
                            }
665
                        }
666
                    }
667
                }
668
            }
669
        }
670
671
        /**
672
         * Get configuration from tx_crawler_configuration records
673
         */
674
675
        // get records along the rootline
676 2
        $rootLine = BackendUtility::BEgetRootLine($id);
677
678 2
        $queryBuilder = GeneralUtility::makeInstance(ConnectionPool::class)->getQueryBuilderForTable('tx_crawler_configuration');
679 2
        $queryBuilder->getRestrictions()->removeAll()->add(GeneralUtility::makeInstance(DeletedRestriction::class));
680
681 2
        foreach ($rootLine as $page) {
682
            $configurationRecordsForCurrentPage = $queryBuilder
683 2
                ->select('*')
684 2
                ->from('tx_crawler_configuration')
685 2
                ->where(
686 2
                    $queryBuilder->expr()->eq('pid', $page['uid']),
687 2
                    substr(BackendUtility::BEenableFields('tx_crawler_configuration'), 4) . BackendUtility::deleteClause('tx_crawler_configuration')
0 ignored issues
show
Deprecated Code introduced by
The method TYPO3\CMS\Backend\Utilit...Utility::deleteClause() has been deprecated with message: since TYPO3 v9, will be removed in TYPO3 v10.0, the DeletedRestriction functionality should be used instead.

This method has been deprecated. The supplier of the class has supplied an explanatory message.

The explanatory message should give you some clue as to whether and when the method will be removed from the class and what other method or class to use instead.

Loading history...
688
                )
689 2
                ->execute()
690 2
                ->fetchAll();
691
692 2
            if (is_array($configurationRecordsForCurrentPage)) {
693 2
                foreach ($configurationRecordsForCurrentPage as $configurationRecord) {
694
695
                        // check access to the configuration record
696 1
                    if (empty($configurationRecord['begroups']) || $GLOBALS['BE_USER']->isAdmin() || $this->hasGroupAccess($GLOBALS['BE_USER']->user['usergroup_cached_list'], $configurationRecord['begroups'])) {
697 1
                        $pidOnlyList = implode(',', GeneralUtility::trimExplode(',', $configurationRecord['pidsonly'], true));
698
699
                        // process configuration if it is not page-specific or if the specific page is the current page:
700 1
                        if (!strcmp($configurationRecord['pidsonly'], '') || GeneralUtility::inList($pidOnlyList, $id)) {
701 1
                            $key = $configurationRecord['name'];
702
703
                            // don't overwrite previously defined paramSets
704 1
                            if (!isset($res[$key])) {
705
706
                                    /* @var $TSparserObject \TYPO3\CMS\Core\TypoScript\Parser\TypoScriptParser */
707 1
                                $TSparserObject = GeneralUtility::makeInstance(TypoScriptParser::class);
708 1
                                $TSparserObject->parse($configurationRecord['processing_instruction_parameters_ts']);
709
710 1
                                $isCrawlingProtocolHttps = $this->isCrawlingProtocolHttps($configurationRecord['force_ssl'], $forceSsl);
711
712
                                $subCfg = [
713 1
                                    'procInstrFilter' => $configurationRecord['processing_instruction_filter'],
714 1
                                    'procInstrParams.' => $TSparserObject->setup,
715 1
                                    'baseUrl' => $this->getBaseUrlForConfigurationRecord(
716 1
                                        $configurationRecord['base_url'],
717 1
                                        $configurationRecord['sys_domain_base_url'],
718 1
                                        $isCrawlingProtocolHttps
719
                                    ),
720 1
                                    'realurl' => $configurationRecord['realurl'],
721 1
                                    'cHash' => $configurationRecord['chash'],
722 1
                                    'userGroups' => $configurationRecord['fegroups'],
723 1
                                    'exclude' => $configurationRecord['exclude'],
724 1
                                    'rootTemplatePid' => (int) $configurationRecord['root_template_pid'],
725 1
                                    'key' => $key
726
                                ];
727
728
                                // add trailing slash if not present
729 1
                                if (!empty($subCfg['baseUrl']) && substr($subCfg['baseUrl'], -1) != '/') {
730
                                    $subCfg['baseUrl'] .= '/';
731
                                }
732 1
                                if (!in_array($id, $this->expandExcludeString($subCfg['exclude']))) {
733 1
                                    $res[$key] = [];
734 1
                                    $res[$key]['subCfg'] = $subCfg;
735 1
                                    $res[$key]['paramParsed'] = $this->parseParams($configurationRecord['configuration']);
736 1
                                    $res[$key]['paramExpanded'] = $this->expandParameters($res[$key]['paramParsed'], $id);
737 1
                                    $res[$key]['URLs'] = $this->compileUrls($res[$key]['paramExpanded'], ['?id=' . $id]);
738 1
                                    $res[$key]['origin'] = 'tx_crawler_configuration_' . $configurationRecord['uid'];
739
                                }
740
                            }
741
                        }
742
                    }
743
                }
744
            }
745
        }
746
747 2
        if (is_array($GLOBALS['TYPO3_CONF_VARS']['EXTCONF']['crawler']['processUrls'])) {
748
            foreach ($GLOBALS['TYPO3_CONF_VARS']['EXTCONF']['crawler']['processUrls'] as $func) {
749
                $params = [
750
                    'res' => &$res,
751
                ];
752
                GeneralUtility::callUserFunction($func, $params, $this);
753
            }
754
        }
755
756 2
        return $res;
757
    }
758
759
    /**
     * Resolves the base URL to use for a crawler configuration record.
     *
     * If $sysDomainUid is greater than zero and a sys_domain record with that uid
     * and a non-empty "domainName" is found, "<scheme>://<domainName>" is returned,
     * where the scheme is "http" only when $ssl is strictly FALSE and "https"
     * otherwise. In every other case the given $baseUrl is returned unchanged.
     *
     * @param string $baseUrl Fallback base URL
     * @param integer $sysDomainUid Uid of the sys_domain record to look up (values <= 0 skip the lookup)
     * @param bool $ssl Whether the generated URL should use https
     * @return string
     */
    protected function getBaseUrlForConfigurationRecord($baseUrl, $sysDomainUid, $ssl = false)
    {
        $sysDomainUid = (int)$sysDomainUid;
        if ($sysDomainUid <= 0) {
            return $baseUrl;
        }

        // Ask the connection pool for the table that is actually queried; the
        // queue table may be mapped to a different connection.
        $queryBuilder = GeneralUtility::makeInstance(ConnectionPool::class)->getQueryBuilderForTable('sys_domain');
        $row = $queryBuilder
            ->select('*')
            ->from('sys_domain')
            ->where(
                $queryBuilder->expr()->eq('uid', $queryBuilder->createNamedParameter($sysDomainUid, \PDO::PARAM_INT))
            )
            ->execute()
            ->fetch();

        // fetch() yields FALSE when no record matched, so guard before reading the column.
        if (!is_array($row) || (string)($row['domainName'] ?? '') === '') {
            return $baseUrl;
        }

        $urlScheme = ($ssl === false) ? 'http' : 'https';

        return $urlScheme . '://' . $row['domainName'];
    }
789
790
    /**
     * Collects the names of all crawler configurations available for a page branch.
     *
     * Two sources are combined, in this order:
     *  1. the keys of "tx_crawler.crawlerCfg.paramSets." in the page TSconfig of
     *     $rootid (only entries whose value is an array, one trailing dot removed),
     *  2. the "name" of tx_crawler_configuration records stored on any page of the
     *     rootline of $rootid or of the page tree below $rootid (down to $depth).
     *
     * The returned list is not de-duplicated.
     *
     * @param integer $rootid Uid of the page the branch starts at
     * @param integer $depth Number of tree levels below $rootid to include
     * @return array List of configuration names
     *
     * TODO: Write Functional Tests
     */
    public function getConfigurationsForBranch($rootid, $depth)
    {
        $configurationsForBranch = [];

        // Null-coalescing keeps this quiet when the TSconfig has no tx_crawler branch at all.
        $pageTSconfig = $this->getPageTSconfigForId($rootid);
        $sets = is_array($pageTSconfig)
            ? ($pageTSconfig['tx_crawler.']['crawlerCfg.']['paramSets.'] ?? null)
            : null;
        if (is_array($sets)) {
            foreach ($sets as $key => $value) {
                if (!is_array($value)) {
                    continue;
                }
                $configurationsForBranch[] = substr($key, -1) == '.' ? substr($key, 0, -1) : $key;
            }
        }

        // Page uids along the rootline ...
        $pids = [];
        $rootLine = BackendUtility::BEgetRootLine($rootid);
        foreach ($rootLine as $node) {
            $pids[] = (int)$node['uid'];
        }

        // ... plus the pages of the subtree the backend user may see (permission bit 1).
        /* @var PageTreeView $tree */
        $tree = GeneralUtility::makeInstance(PageTreeView::class);
        $perms_clause = $GLOBALS['BE_USER']->getPagePermsClause(1);
        $tree->init('AND ' . $perms_clause);
        $tree->getTree($rootid, $depth, '');
        foreach ($tree->tree as $node) {
            $pids[] = (int)$node['row']['uid'];
        }

        $pids = array_values(array_unique($pids));
        if ($pids === []) {
            // An empty list would render as "pid IN ()", which is invalid SQL.
            return $configurationsForBranch;
        }

        $queryBuilder = $this->getQueryBuilder('tx_crawler_configuration');

        $queryBuilder->getRestrictions()
            ->removeAll()
            ->add(GeneralUtility::makeInstance(DeletedRestriction::class))
            ->add(GeneralUtility::makeInstance(StartTimeRestriction::class))
            ->add(GeneralUtility::makeInstance(EndTimeRestriction::class));

        $statement = $queryBuilder
            ->select('name')
            ->from('tx_crawler_configuration')
            ->where(
                $queryBuilder->expr()->in('pid', $queryBuilder->createNamedParameter($pids, Connection::PARAM_INT_ARRAY))
            )
            ->execute();

        while ($row = $statement->fetch()) {
            $configurationsForBranch[] = $row['name'];
        }

        return $configurationsForBranch;
    }
849
850
    /**
     * Creates a fresh query builder on the connection responsible for $table.
     *
     * @param string $table Name of the table the query will target
     * @return \TYPO3\CMS\Core\Database\Query\QueryBuilder
     */
    private function getQueryBuilder(string $table)
    {
        $connectionPool = GeneralUtility::makeInstance(ConnectionPool::class);
        $connection = $connectionPool->getConnectionForTable($table);

        return $connection->createQueryBuilder();
    }
862
863
    /**
     * Tells whether a user, identified by its group list, may access an item.
     * (e.g. get the group list of the current logged in user from $GLOBALS['TSFE']->gr_list)
     *
     * @see \TYPO3\CMS\Frontend\Page\PageRepository::getMultipleGroupsWhereClause()
     * @param  string $groupList    Comma-separated list of group UIDs the user belongs to
     * @param  string $accessList   Comma-separated list of group UIDs that grant access to the item
     * @return bool                 TRUE if the access list is empty or contains at least one of the user's group UIDs
     */
    public function hasGroupAccess($groupList, $accessList)
    {
        // An item without access restriction is open to everybody.
        if (empty($accessList)) {
            return true;
        }

        $accessGranted = false;
        $userGroupUids = GeneralUtility::intExplode(',', $groupList);
        foreach ($userGroupUids as $userGroupUid) {
            if (GeneralUtility::inList($accessList, $userGroupUid)) {
                $accessGranted = true;
                break;
            }
        }

        return $accessGranted;
    }
884
885
    /**
     * Parses a query string ("a=1&b=2") into an array of name => value pairs.
     *
     * Names and values are raw-URL-decoded. Pairs with an empty name are dropped,
     * a name without "=" gets an empty string as value, and when a name occurs
     * more than once the last occurrence wins.
     *
     * @param string $inputQuery Input query string
     * @return array Decoded parameter values keyed by decoded parameter name
     */
    public function parseParams($inputQuery)
    {
        $paramKeyValues = [];

        foreach (explode('&', (string)$inputQuery) as $paramAndValue) {
            // array_pad() supplies the empty value for a bare "name" without "=",
            // avoiding an undefined offset and a NULL argument to rawurldecode().
            list($name, $value) = array_pad(explode('=', $paramAndValue, 2), 2, '');
            if ($name !== '') {
                $paramKeyValues[rawurldecode($name)] = rawurldecode($value);
            }
        }

        return $paramKeyValues;
    }
907
908
    /**
     * Will expand the parameters configuration to individual values. This follows a certain syntax of the value of each parameter.
     * Syntax of values:
     * - Basically: If the value is wrapped in [...] it will be expanded according to the following syntax, otherwise the value is taken literally
     * - Configuration is split by "|" and the parts are processed individually and finally added together
     * - For each configuration part:
     *         - "[int]-[int]" = Integer range, will be expanded to all values in between, values included, starting from low to high (max. 1000). Example "1-34" or "-40--30"
     *         - "_TABLE:[TCA table name];[_PID:[optional page id, default is current page]];[_ENABLELANG:1]" = Look up of table records from PID, filtering out deleted records. Example "_TABLE:tt_content; _PID:123"
     *           Further recognised sub-parts: _PIDFIELD (default "pid"), _FIELD (default "uid"), _WHERE, _ADDTABLE
     *           _ENABLELANG:1 picks only original records without their language overlays
     *         - Default: Literal value
     *
     * After every part, the functions registered in
     * $GLOBALS['TYPO3_CONF_VARS']['SC_OPTIONS']['crawler/class.tx_crawler_lib.php']['expandParameters']
     * are called and may modify the parameter array by reference.
     *
     * @param array $paramArray Array with key (GET var name) and values (value of GET var which is configuration for expansion)
     * @param integer $pid Current page ID
     * @return array Same keys as $paramArray, each mapped to the array of expanded values
     *
     * TODO: Write Functional Tests
     */
    public function expandParameters($paramArray, $pid)
    {
        $expandHooks = $GLOBALS['TYPO3_CONF_VARS']['SC_OPTIONS']['crawler/class.tx_crawler_lib.php']['expandParameters'] ?? null;

        // Traverse parameter names:
        foreach ($paramArray as $p => $v) {
            $v = trim($v);

            // Only values wrapped in square brackets are expanded; everything else is literal.
            if (substr($v, 0, 1) !== '[' || substr($v, -1) !== ']') {
                $paramArray[$p] = [$v];
                continue;
            }

            // Reset the entry to an array and fill it part by part.
            $paramArray[$p] = [];
            foreach (explode('|', substr($v, 1, -1)) as $pV) {
                // Look for integer range: (fx. 1-34 or -40--30 // reads minus 40 to minus 30)
                if (preg_match('/^(-?[0-9]+)\s*-\s*(-?[0-9]+)$/', trim($pV), $reg)) {
                    $low = $reg[1];
                    $high = $reg[2];

                    // Swap if first is larger than last:
                    if ($low > $high) {
                        $temp = $high;
                        $high = $low;
                        $low = $temp;
                    }

                    // Traverse range, add values:
                    $runAwayBrake = 1000; // Limit to size of range!
                    for ($a = $low; $a <= $high; $a++) {
                        $paramArray[$p][] = $a;
                        $runAwayBrake--;
                        if ($runAwayBrake <= 0) {
                            break;
                        }
                    }
                } elseif (substr(trim($pV), 0, 7) == '_TABLE:') {
                    // Parse "key:value" sub-parts; a sub-part without ":" gets a NULL value.
                    $subpartParams = [];
                    foreach (GeneralUtility::trimExplode(';', $pV) as $spV) {
                        list($pKey, $pVal) = array_pad(GeneralUtility::trimExplode(':', $spV), 2, null);
                        $subpartParams[$pKey] = $pVal;
                    }

                    $table = $subpartParams['_TABLE'] ?? '';

                    // Table exists:
                    if (isset($GLOBALS['TCA'][$table])) {
                        $lookUpPid = isset($subpartParams['_PID']) ? intval($subpartParams['_PID']) : $pid;
                        $pidField = isset($subpartParams['_PIDFIELD']) ? trim($subpartParams['_PIDFIELD']) : 'pid';
                        $where = isset($subpartParams['_WHERE']) ? $subpartParams['_WHERE'] : '';
                        $addTable = isset($subpartParams['_ADDTABLE']) ? $subpartParams['_ADDTABLE'] : '';

                        $fieldName = !empty($subpartParams['_FIELD']) ? $subpartParams['_FIELD'] : 'uid';
                        if ($fieldName === 'uid' || !empty($GLOBALS['TCA'][$table]['columns'][$fieldName])) {
                            $queryBuilder = $this->getQueryBuilder($table);

                            $queryBuilder->getRestrictions()
                                ->removeAll()
                                ->add(GeneralUtility::makeInstance(DeletedRestriction::class));

                            // The expression builder quotes field names itself, so the
                            // plain field name is passed (quoting it here as well would
                            // quote it twice).
                            $queryBuilder
                                ->select($fieldName)
                                ->from($table)
                                ->where(
                                    $queryBuilder->expr()->eq($pidField, $queryBuilder->createNamedParameter($lookUpPid, \PDO::PARAM_INT)),
                                    $where
                                );

                            if ($addTable !== '') {
                                // Only touch the FROM part when _ADDTABLE is actually given;
                                // calling add('from', '') would wipe the table set above.
                                // TODO: Check if this works as intended! add() without the
                                // append flag replaces the FROM part - confirm the expected
                                // format of _ADDTABLE.
                                $queryBuilder->add('from', $addTable);
                            }

                            $transOrigPointerField = $GLOBALS['TCA'][$table]['ctrl']['transOrigPointerField'] ?? '';
                            if (!empty($subpartParams['_ENABLELANG']) && $transOrigPointerField) {
                                $queryBuilder->andWhere(
                                    $queryBuilder->expr()->lte($transOrigPointerField, 0)
                                );
                            }

                            $statement = $queryBuilder->execute();

                            // Collect the looked-up field values, keyed by value so each
                            // value is added only once per lookup.
                            $rows = [];
                            while ($row = $statement->fetch()) {
                                $rows[$row[$fieldName]] = $row;
                            }

                            $paramArray[$p] = array_merge($paramArray[$p], array_keys($rows));
                        }
                    }
                } else { // Just add value:
                    $paramArray[$p][] = $pV;
                }

                // Hook for processing own expandParameters place holder
                if (is_array($expandHooks)) {
                    $_params = [
                        'pObj' => &$this,
                        'paramArray' => &$paramArray,
                        'currentKey' => $p,
                        'currentValue' => $pV,
                        'pid' => $pid
                    ];
                    foreach ($expandHooks as $_funcRef) {
                        GeneralUtility::callUserFunction($_funcRef, $_params, $this);
                    }
                }
            }

            // Make unique set of values and sort the parameter array by parameter name:
            $paramArray[$p] = array_unique($paramArray[$p]);
            ksort($paramArray);
        }

        return $paramArray;
    }
1048
1049
    /**
     * Compiling URLs from parameter array (output of expandParameters())
     *
     * Every URL in $urls is combined with every value of every parameter, so the
     * number of URLs is the product of the number of values per parameter. For
     * each parameter the list is capped at the "maxCompileUrls" extension setting
     * (forced into 1..1000000000, default 10000). Empty values add nothing to the
     * URL; names and values are raw-URL-encoded.
     *
     * @param array $paramArray Output of expandParameters(): Array with keys (GET var names) and for each an array of values
     * @param array $urls URLs to start from, e.g. ['?id=123']
     * @return array Compiled URLs; $urls itself is returned unchanged when there is nothing to combine
     */
    public function compileUrls($paramArray, $urls = [])
    {
        if (!is_array($paramArray) || !is_array($urls) || count($paramArray) === 0) {
            return $urls;
        }

        // The limit does not change while compiling, so resolve it once.
        $maxUrls = MathUtility::forceIntegerInRange($this->extensionSettings['maxCompileUrls'] ?? 0, 1, 1000000000, 10000);

        foreach ($paramArray as $varName => $valueSet) {
            $encodedName = rawurlencode((string)$varName);

            $newUrls = [];
            foreach ($urls as $url) {
                foreach ((array)$valueSet as $val) {
                    // Leave BOTH loops once the limit is reached; breaking only the
                    // inner loop would keep adding one URL per remaining base URL.
                    if (count($newUrls) >= $maxUrls) {
                        break 2;
                    }
                    $newUrls[] = $url . ((string)$val !== '' ? '&' . $encodedName . '=' . rawurlencode((string)$val) : '');
                }
            }
            $urls = $newUrls;
        }

        return $urls;
    }
1082
1083
    /************************************
1084
     *
1085
     * Crawler log
1086
     *
1087
     ************************************/
1088
1089
    /**
     * Returns the crawler queue records of a page, or flushes them.
     *
     * Without $doFlush the matching records are selected, newest "scheduled"
     * first. With $doFlush nothing is selected; the matching records are removed
     * through flushQueue() and an empty array is returned.
     *
     * @param integer $id Page ID for which to look up log entries.
     * @param string $filter Filter: "pending" => entries with exec_time = 0, "finished" => entries with exec_time > 0, anything else => all entries
     * @param boolean $doFlush If TRUE, the matching entries are DELETED(!) instead of selected
     * @param boolean $doFullFlush Only used together with $doFlush: if TRUE, the whole queue is flushed, ignoring $id and $filter
     * @param integer $itemsPerPage Maximum number of entries to return, default is 10; values <= 0 disable the limit
     * @return array Queue records, or an empty array after a flush
     */
    public function getLogEntriesForPageId($id, $filter = '', $doFlush = false, $doFullFlush = false, $itemsPerPage = 10)
    {
        $connectionPool = GeneralUtility::makeInstance(ConnectionPool::class);

        $queryBuilder = $connectionPool->getQueryBuilderForTable($this->tableName);
        $queryBuilder
            ->select('*')
            ->from($this->tableName)
            ->where(
                $queryBuilder->expr()->eq('page_id', $queryBuilder->createNamedParameter($id, \PDO::PARAM_INT))
            )
            ->orderBy('scheduled', 'DESC');

        // The flush condition is collected separately (AND-combined), because
        // flushQueue() expects a WHERE string rather than a query builder.
        $expressionBuilder = $connectionPool
            ->getConnectionForTable($this->tableName)
            ->getExpressionBuilder();
        $flushConstraints = $expressionBuilder->andX();

        switch ($filter) {
            case 'pending':
                $queryBuilder->andWhere($queryBuilder->expr()->eq('exec_time', 0));
                $flushConstraints->add($expressionBuilder->eq('exec_time', 0));
                break;
            case 'finished':
                $queryBuilder->andWhere($queryBuilder->expr()->gt('exec_time', 0));
                $flushConstraints->add($expressionBuilder->gt('exec_time', 0));
                break;
        }

        // FIXME: Write unit test that ensures that the right records are deleted.
        if ($doFlush) {
            $flushConstraints->add($expressionBuilder->eq('page_id', intval($id)));
            $this->flushQueue($doFullFlush ? '1=1' : (string)$flushConstraints);

            return [];
        }

        if ($itemsPerPage > 0) {
            $queryBuilder->setMaxResults((int)$itemsPerPage);
        }

        return $queryBuilder->execute()->fetchAll();
    }
1143
1144
    /**
     * Returns the crawler queue records of a set, or flushes them.
     *
     * Without $doFlush the matching records are selected, newest "scheduled"
     * first. With $doFlush nothing is selected; the matching records are removed
     * through flushQueue() and an empty array is returned.
     *
     * @param integer $set_id Set ID for which to look up log entries.
     * @param string $filter Filter: "pending" => entries with exec_time = 0, "finished" => entries with exec_time > 0, anything else => all entries
     * @param boolean $doFlush If TRUE, the matching entries are DELETED(!) instead of selected
     * @param boolean $doFullFlush Only used together with $doFlush: if TRUE, the whole queue is flushed, ignoring $set_id and $filter
     * @param integer $itemsPerPage Maximum number of entries to return, default is 10; values <= 0 disable the limit
     * @return array Queue records, or an empty array after a flush
     */
    public function getLogEntriesForSetId($set_id, $filter = '', $doFlush = false, $doFullFlush = false, $itemsPerPage = 10)
    {
        $connectionPool = GeneralUtility::makeInstance(ConnectionPool::class);

        $queryBuilder = $connectionPool->getQueryBuilderForTable($this->tableName);
        $queryBuilder
            ->select('*')
            ->from($this->tableName)
            ->where(
                $queryBuilder->expr()->eq('set_id', $queryBuilder->createNamedParameter($set_id, \PDO::PARAM_INT))
            )
            ->orderBy('scheduled', 'DESC');

        // The flush condition is collected separately (AND-combined), because
        // flushQueue() expects a WHERE string rather than a query builder.
        $expressionBuilder = $connectionPool
            ->getConnectionForTable($this->tableName)
            ->getExpressionBuilder();
        $flushConstraints = $expressionBuilder->andX();

        // FIXME: Write Unit tests for Filters
        switch ($filter) {
            case 'pending':
                $queryBuilder->andWhere($queryBuilder->expr()->eq('exec_time', 0));
                $flushConstraints->add($expressionBuilder->eq('exec_time', 0));
                break;
            case 'finished':
                $queryBuilder->andWhere($queryBuilder->expr()->gt('exec_time', 0));
                $flushConstraints->add($expressionBuilder->gt('exec_time', 0));
                break;
        }

        // FIXME: Write unit test that ensures that the right records are deleted.
        if ($doFlush) {
            $flushConstraints->add($expressionBuilder->eq('set_id', intval($set_id)));
            // An empty condition makes flushQueue() remove every queue entry.
            $this->flushQueue($doFullFlush ? '' : (string)$flushConstraints);

            return [];
        }

        if ($itemsPerPage > 0) {
            $queryBuilder->setMaxResults((int)$itemsPerPage);
        }

        return $queryBuilder->execute()->fetchAll();
    }
1197
1198
    /**
     * Removes queue entries
     *
     * If an observer is registered for the "queueEntryFlush" event, the event is
     * posted once per affected set_id - together with the uid/set_id rows about to
     * be removed - before the entries are deleted.
     *
     * @param string $where SQL condition selecting the entries to remove; an empty condition removes ALL entries
     * @return void
     */
    protected function flushQueue($where = '')
    {
        // Callers may hand in an expression object; normalise to its SQL string.
        $realWhere = (string)$where;
        if ($realWhere === '') {
            $realWhere = '1=1';
        }

        $eventDispatcher = EventDispatcher::getInstance();
        if ($eventDispatcher->hasObserver('queueEntryFlush')) {
            // Every statement gets its own query builder: from() appends, so a
            // reused builder would accumulate the table once per statement.
            // GROUP BY is used for the distinct set ids because select() treats
            // its arguments as identifiers to quote.
            $groupQuery = $this->getQueryBuilder($this->tableName);
            $groups = $groupQuery
                ->select('set_id')
                ->from($this->tableName)
                ->where($realWhere)
                ->groupBy('set_id')
                ->execute()
                ->fetchAll();

            if (is_array($groups)) {
                foreach ($groups as $group) {
                    $subSetQuery = $this->getQueryBuilder($this->tableName);
                    $subSet = $subSetQuery
                        ->select('uid', 'set_id')
                        ->from($this->tableName)
                        ->where(
                            $realWhere,
                            $subSetQuery->expr()->eq(
                                'set_id',
                                $subSetQuery->createNamedParameter($group['set_id'], \PDO::PARAM_INT)
                            )
                        )
                        ->execute()
                        ->fetchAll();
                    $eventDispatcher->post('queueEntryFlush', $group['set_id'], $subSet);
                }
            }
        }

        $this->getQueryBuilder($this->tableName)
            ->delete($this->tableName)
            ->where($realWhere)
            ->execute();
    }
1238
1239
    /**
     * Inserts a call back entry into tx_crawler_queue (called from hooks typically, see indexed search class "class.crawler.php")
     *
     * The call back reference is stored under the "_CALLBACKOBJ" key of the
     * serialized parameters. The entry is created as not yet executed
     * (exec_time = 0) and with empty result data.
     *
     * @param integer $setId Set ID
     * @param array $params Parameters to pass to call back function; anything that is not an array is replaced by an empty array
     * @param string $callBack Call back object reference, eg. 'EXT:indexed_search/class.crawler.php:&tx_indexedsearch_crawler'
     * @param integer $page_id Page ID to attach it to
     * @param integer $schedule Time at which to activate; 0 means the current time
     * @return void
     */
    public function addQueueEntry_callBack($setId, $params, $callBack, $page_id = 0, $schedule = 0)
    {
        $callBackParameters = is_array($params) ? $params : [];
        $callBackParameters['_CALLBACKOBJ'] = $callBack;

        $scheduledTime = intval($schedule);
        if (!$scheduledTime) {
            $scheduledTime = $this->getCurrentTime();
        }

        $queueRecord = [
            'page_id' => intval($page_id),
            'parameters' => serialize($callBackParameters),
            'scheduled' => $scheduledTime,
            'exec_time' => 0,
            'set_id' => intval($setId),
            'result_data' => '',
        ];

        $connection = GeneralUtility::makeInstance(ConnectionPool::class)->getConnectionForTable('tx_crawler_queue');
        $connection->insert('tx_crawler_queue', $queueRecord);
    }
1269
1270
    /************************************
1271
     *
1272
     * URL setting
1273
     *
1274
     ************************************/
1275
1276
    /**
     * Setting a URL for crawling:
     *
     * Builds the queue record for the URL and either registers it internally
     * (when $this->registerQueueEntriesInternallyOnly is set) or inserts it into
     * tx_crawler_queue, unless an equal entry is already queued.
     *
     * @param integer $id Page ID
     * @param string $url Complete URL
     * @param array $subCfg Sub configuration array (from TS config); the keys "userGroups", "procInstrFilter", "procInstrParams.", "rootTemplatePid" and "key" are read and may be absent
     * @param integer $tstamp Scheduled-time
     * @param string $configurationHash (optional) configuration hash
     * @param bool $skipInnerDuplicationCheck (optional) skip inner duplication check
     * @return bool TRUE if a new row was inserted into the queue table, FALSE otherwise
     */
    public function addUrl(
        $id,
        $url,
        array $subCfg,
        $tstamp,
        $configurationHash = '',
        $skipInnerDuplicationCheck = false
    ) {
        $urlAdded = false;
        $rows = [];

        // Creating parameters:
        $parameters = [
            'url' => $url
        ];

        // fe user group simulation:
        $uGs = implode(',', array_unique(GeneralUtility::intExplode(',', (string)($subCfg['userGroups'] ?? ''), true)));
        if ($uGs) {
            $parameters['feUserGroupList'] = $uGs;
        }

        // Setting processing instructions
        $parameters['procInstructions'] = GeneralUtility::trimExplode(',', (string)($subCfg['procInstrFilter'] ?? ''));
        if (isset($subCfg['procInstrParams.']) && is_array($subCfg['procInstrParams.'])) {
            $parameters['procInstrParams'] = $subCfg['procInstrParams.'];
        }

        // Possible TypoScript Template Parents
        // (a missing key is stored as NULL so the serialized parameters and their hash stay unchanged)
        $parameters['rootTemplatePid'] = $subCfg['rootTemplatePid'] ?? null;

        // Compile value array:
        $parameters_serialized = serialize($parameters);
        $fieldArray = [
            'page_id' => intval($id),
            'parameters' => $parameters_serialized,
            'parameters_hash' => GeneralUtility::shortMD5($parameters_serialized),
            'configuration_hash' => $configurationHash,
            'scheduled' => $tstamp,
            'exec_time' => 0,
            'set_id' => intval($this->setID),
            'result_data' => '',
            'configuration' => $subCfg['key'] ?? null,
        ];

        if ($this->registerQueueEntriesInternallyOnly) {
            // the entries will only be registered and not stored to the database
            $this->queueEntries[] = $fieldArray;

            return $urlAdded;
        }

        if (!$skipInnerDuplicationCheck) {
            // check if there is already an equal entry
            $rows = $this->getDuplicateRowsIfExist($tstamp, $fieldArray);
        }

        if (count($rows) === 0) {
            $connectionForCrawlerQueue = GeneralUtility::makeInstance(ConnectionPool::class)->getConnectionForTable('tx_crawler_queue');
            $connectionForCrawlerQueue->insert(
                'tx_crawler_queue',
                $fieldArray
            );
            $uid = $connectionForCrawlerQueue->lastInsertId('tx_crawler_queue', 'qid');
            $rows[] = $uid;
            $urlAdded = true;
            EventDispatcher::getInstance()->post('urlAddedToQueue', $this->setID, ['uid' => $uid, 'fieldArray' => $fieldArray]);
        } else {
            EventDispatcher::getInstance()->post('duplicateUrlInQueue', $this->setID, ['rows' => $rows, 'fieldArray' => $fieldArray]);
        }

        return $urlAdded;
    }
1358
1359
    /**
     * This method determines duplicates for a queue entry with the same parameters and this timestamp.
     * If the timestamp is in the past, it will check if there is any unprocessed queue entry in the past.
     * If the timestamp is in the future it will check, if the queued entry has exactly the same timestamp
     *
     * Only entries that have not been executed yet (exec_time is empty) and are not
     * assigned to a process (process_id is empty) count as duplicates.
     *
     * @param int $tstamp
     * @param array $fieldArray Queue field array; "page_id" and "parameters_hash" are used for the lookup
     *
     * @return array List of qid values of the duplicate queue entries (empty if there are none)
     *
     * TODO: Write Functional Tests
     */
    protected function getDuplicateRowsIfExist($tstamp, $fieldArray)
    {
        $rows = [];

        $currentTime = $this->getCurrentTime();

        $queryBuilder = GeneralUtility::makeInstance(ConnectionPool::class)->getQueryBuilderForTable($this->tableName);
        $queryBuilder
            ->select('qid')
            ->from('tx_crawler_queue');
        //if this entry is scheduled with "now"
        if ($tstamp <= $currentTime) {
            if (!empty($this->extensionSettings['enableTimeslot'])) {
                $timeBegin = $currentTime - 100;
                $timeEnd = $currentTime + 100;
                $queryBuilder
                    ->where(
                        'scheduled BETWEEN ' . $timeBegin . ' AND ' . $timeEnd . ''
                    )
                    ->orWhere(
                        $queryBuilder->expr()->lte('scheduled', $currentTime)
                    );
            } else {
                $queryBuilder
                    ->where(
                        $queryBuilder->expr()->lte('scheduled', $currentTime)
                    );
            }
        } elseif ($tstamp > $currentTime) {
            //entry with a timestamp in the future need to have the same schedule time
            $queryBuilder
                ->where(
                    $queryBuilder->expr()->eq('scheduled', $tstamp)
                );
        }

        // Restrict to *unprocessed* entries. The previous "!= 0" comparisons selected the
        // opposite set (already executed entries), so pending duplicates were never found.
        // NOTE(review): "NOT process_id" relies on the database casting an empty process id
        // to 0 - confirm against the column definition of tx_crawler_queue.process_id.
        $statement = $queryBuilder
            ->andWhere('NOT exec_time')
            ->andWhere('NOT process_id')
            ->andWhere($queryBuilder->expr()->eq('page_id', $queryBuilder->createNamedParameter($fieldArray['page_id'], \PDO::PARAM_INT)))
            ->andWhere($queryBuilder->expr()->eq('parameters_hash', $queryBuilder->createNamedParameter($fieldArray['parameters_hash'], \PDO::PARAM_STR)))
            ->execute();

        while ($row = $statement->fetch()) {
            $rows[] = $row['qid'];
        }

        return $rows;
    }
1420
1421
    /**
     * Returns the current system time as a Unix timestamp.
     *
     * Kept as a separate method so that all queue operations of this class
     * obtain "now" from a single place.
     *
     * @return int
     */
    public function getCurrentTime()
    {
        $now = time();

        return $now;
    }
1430
1431
    /************************************
1432
     *
1433
     * URL reading
1434
     *
1435
     ************************************/
1436
1437
    /**
     * Read URL for single queue entry
     *
     * Loads the queue record, locks it by setting exec_time, executes it through
     * readUrl_exec() and finally stores the serialized result in result_data.
     *
     * @param integer $queueId
     * @param boolean $force If set, will process even if exec_time has been set!
     * @return integer Bitmask: CLI_STATUS_POLLABLE_PROCESSED is set when a registered pollable
     *                 processing instruction reported success; 0 otherwise (also when no
     *                 matching queue entry was found)
     */
    public function readUrl($queueId, $force = false)
    {
        $queryBuilder = GeneralUtility::makeInstance(ConnectionPool::class)->getQueryBuilderForTable($this->tableName);
        $ret = 0;
        if ($this->debugMode) {
            $this->getLogger()->log(
                LogLevel::DEBUG,
                'crawler-readurl start ' . microtime(true)
            );
        }
        // Get entry:
        $queryBuilder
            ->select('*')
            ->from('tx_crawler_queue')
            ->where(
                $queryBuilder->expr()->eq('qid', $queryBuilder->createNamedParameter($queueId, \PDO::PARAM_INT))
            );
        if (!$force) {
            $queryBuilder
                ->andWhere('exec_time = 0')
                ->andWhere('process_scheduled > 0');
        }
        $queueRec = $queryBuilder->execute()->fetch();

        if (!is_array($queueRec)) {
            // Nothing to process; return the documented integer instead of NULL.
            return $ret;
        }

        $parameters = unserialize($queueRec['parameters']);
        if (is_array($parameters) && !empty($parameters['rootTemplatePid'])) {
            $this->initTSFE((int)$parameters['rootTemplatePid']);
        } else {
            $this->getLogger()->log(
                LogLevel::WARNING,
                'Page with (' . $queueRec['page_id'] . ') could not be crawled, please check your crawler configuration. Perhaps no Root Template Pid is set'
            );
        }

        SignalSlotUtility::emitSignal(
            __CLASS__,
            SignalSlotUtility::SIGNNAL_QUEUEITEM_PREPROCESS,
            [$queueId, &$queueRec]
        );

        // Set exec_time to lock record:
        $field_array = ['exec_time' => $this->getCurrentTime()];

        if (isset($this->processID)) {
            //if multiprocessing is used we need to store the id of the process which has handled this entry
            $field_array['process_id_completed'] = $this->processID;
        }

        GeneralUtility::makeInstance(ConnectionPool::class)->getConnectionForTable('tx_crawler_queue')
            ->update(
                'tx_crawler_queue',
                $field_array,
                [ 'qid' => (int)$queueId ]
            );

        $result = $this->readUrl_exec($queueRec);

        // readUrl_exec() returns the string 'ERROR' or FALSE on failure, so only read
        // "content" from an array result. The content originates from the requested URL;
        // object instantiation is disabled because only array data is evaluated below.
        $resultData = false;
        if (is_array($result) && isset($result['content'])) {
            $resultData = unserialize($result['content'], ['allowed_classes' => false]);
        }

        //atm there's no need to point to specific pollable extensions
        $pollSuccess = $GLOBALS['TYPO3_CONF_VARS']['EXTCONF']['crawler']['pollSuccess'] ?? null;
        if (is_array($pollSuccess)
            && is_array($resultData)
            && isset($resultData['parameters']['procInstructions'])
            && is_array($resultData['parameters']['procInstructions'])
        ) {
            foreach ($pollSuccess as $pollable) {
                // only check the success value if the instruction is running
                // it is important to name the pollSuccess key same as the procInstructions key
                if (in_array($pollable, $resultData['parameters']['procInstructions'])
                    && !empty($resultData['success'][$pollable])
                ) {
                    $ret |= self::CLI_STATUS_POLLABLE_PROCESSED;
                }
            }
        }

        // Set result in log which also denotes the end of the processing of this entry.
        $field_array = ['result_data' => serialize($result)];

        SignalSlotUtility::emitSignal(
            __CLASS__,
            SignalSlotUtility::SIGNNAL_QUEUEITEM_POSTPROCESS,
            [$queueId, &$field_array]
        );

        GeneralUtility::makeInstance(ConnectionPool::class)->getConnectionForTable('tx_crawler_queue')
            ->update(
                'tx_crawler_queue',
                $field_array,
                [ 'qid' => (int)$queueId ]
            );

        if ($this->debugMode) {
            $this->getLogger()->log(
                LogLevel::DEBUG,
                'crawler-readurl stop ' . microtime(true)
            );
        }

        return $ret;
    }
1548
1549
    /**
     * Read URL for not-yet-inserted log-entry
     *
     * Inserts the given field array as a new queue row (already locked through
     * exec_time), executes it and writes the serialized result back to that row.
     *
     * @param array $field_array Queue field array,
     *
     * @return array|string|boolean Whatever readUrl_exec() returned for the entry
     */
    public function readUrlFromArray($field_array)
    {
        $queueTable = 'tx_crawler_queue';

        // Set exec_time to lock record:
        $field_array['exec_time'] = $this->getCurrentTime();

        $queueConnection = GeneralUtility::makeInstance(ConnectionPool::class)->getConnectionForTable($queueTable);
        $queueConnection->insert($queueTable, $field_array);

        $queueId = $queueConnection->lastInsertId($queueTable, 'qid');
        $field_array['qid'] = $queueId;

        $result = $this->readUrl_exec($field_array);

        // Storing the result in the log also denotes the end of the processing of this entry.
        $field_array = ['result_data' => serialize($result)];

        SignalSlotUtility::emitSignal(
            __CLASS__,
            SignalSlotUtility::SIGNNAL_QUEUEITEM_POSTPROCESS,
            [$queueId, &$field_array]
        );

        $queueConnection->update($queueTable, $field_array, ['qid' => $queueId]);

        return $result;
    }
1587
1588
    /**
     * Read URL for a queue record
     *
     * If the stored parameters contain a "_CALLBACKOBJ" entry, that object is instantiated
     * and its crawler_execute() method is called; otherwise the stored URL is requested
     * as a regular frontend request.
     *
     * @param array $queueRec Queue record
     * @return array|string|boolean Result array (with a "content" key), the string 'ERROR' if the
     *                              parameters could not be decoded, or FALSE if requestUrl() failed
     */
    public function readUrl_exec($queueRec)
    {
        // Decode parameters:
        $parameters = unserialize($queueRec['parameters']);
        if (!is_array($parameters)) {
            return 'ERROR';
        }

        // Regular URL entries carry no "_CALLBACKOBJ" key at all, hence the empty() check.
        if (!empty($parameters['_CALLBACKOBJ'])) { // Calling object:
            $objRef = $parameters['_CALLBACKOBJ'];
            $callBackObj = GeneralUtility::makeInstance($objRef);
            if (is_object($callBackObj)) {
                unset($parameters['_CALLBACKOBJ']);

                return ['content' => serialize($callBackObj->crawler_execute($parameters, $this))];
            }

            return ['content' => 'No object: ' . $objRef];
        }

        // Regular FE request:

        // Prepare:
        $crawlerId = $queueRec['qid'] . ':' . md5($queueRec['qid'] . '|' . $queueRec['set_id'] . '|' . $GLOBALS['TYPO3_CONF_VARS']['SYS']['encryptionKey']);

        // Get result:
        $result = $this->requestUrl($parameters['url'], $crawlerId);

        EventDispatcher::getInstance()->post('urlCrawled', $queueRec['set_id'], ['url' => $parameters['url'], 'result' => $result]);

        return $result;
    }
1623
1624
    /**
     * Gets the content of a URL.
     *
     * Unless direct requests are enabled in the extension settings, the URL is fetched
     * through a plain socket connection (optionally via the configured curl proxy server).
     * If "follow30x" is enabled, redirects are followed recursively.
     *
     * @param string $originalUrl URL to read
     * @param string $crawlerId Crawler ID string (qid + hash to verify)
     * @param integer $timeout Timeout time
     * @param integer $recursion Recursion limiter for 302 redirects
     * @return array|boolean Array with the keys "request", "headers" and "content" (plus
     *                       "parentRequest" when a redirect was followed), FALSE on failure
     */
    public function requestUrl($originalUrl, $crawlerId, $timeout = 2, $recursion = 10)
    {
        if (!$recursion) {
            return false;
        }

        // Parse URL, checking for scheme:
        $url = parse_url($originalUrl);

        if ($url === false) {
            $this->getLogger()->log(
                LogLevel::DEBUG,
                sprintf('Could not parse_url() for string "%s"', $originalUrl),
                ['crawlerId' => $crawlerId]
            );
            return false;
        }

        // A URL without scheme is accepted as well.
        $scheme = $url['scheme'] ?? '';
        if (!in_array($scheme, ['', 'http', 'https'], true)) {
            $this->getLogger()->log(
                LogLevel::DEBUG,
                sprintf('Scheme does not match for url "%s"', $originalUrl),
                ['crawlerId' => $crawlerId]
            );
            return false;
        }

        // direct request
        if ($this->extensionSettings['makeDirectRequests']) {
            return $this->sendDirectRequest($originalUrl, $crawlerId);
        }

        $reqHeaders = $this->buildRequestHeaderArray($url, $crawlerId);

        // thanks to Pierrick Caillon for adding proxy support
        $rurl = $url;

        if ($GLOBALS['TYPO3_CONF_VARS']['SYS']['curlUse'] && $GLOBALS['TYPO3_CONF_VARS']['SYS']['curlProxyServer']) {
            $rurl = parse_url($GLOBALS['TYPO3_CONF_VARS']['SYS']['curlProxyServer']);
            // When talking to a proxy the request line has to carry the absolute URL.
            $url['path'] = $scheme . '://' . ($url['host'] ?? '')
                . ((int)($url['port'] ?? 0) > 0 ? ':' . $url['port'] : '')
                . ($url['path'] ?? '');
            $reqHeaders = $this->buildRequestHeaderArray($url, $crawlerId);
        }

        $host = $rurl['host'] ?? '';
        $configuredPort = (int)($rurl['port'] ?? 0);

        if ($scheme === 'https') {
            $host = 'ssl://' . $host;
            $port = $configuredPort > 0 ? $configuredPort : 443;
        } else {
            $port = $configuredPort > 0 ? $configuredPort : 80;
        }

        $startTime = microtime(true);
        $fp = fsockopen($host, $port, $errno, $errstr, $timeout);

        if (!$fp) {
            $this->getLogger()->log(
                LogLevel::DEBUG,
                sprintf('Error while opening "%s"', $originalUrl),
                ['crawlerId' => $crawlerId]
            );
            return false;
        }

        // Request message:
        $msg = implode("\r\n", $reqHeaders) . "\r\n\r\n";
        fputs($fp, $msg);

        // Read response:
        $d = $this->getHttpResponseFromStream($fp);
        fclose($fp);

        $time = microtime(true) - $startTime;
        $this->log($originalUrl . ' ' . $time);

        // Implode content and headers:
        $result = [
            'request' => $msg,
            'headers' => implode('', $d['headers']),
            'content' => implode('', (array)$d['content'])
        ];

        if (($this->extensionSettings['follow30x'])
            && ($newUrl = $this->getRequestUrlFrom302Header($d['headers'], $url['user'] ?? '', $url['pass'] ?? ''))
        ) {
            // Follow the redirect exactly once per hop, keeping the timeout and
            // decreasing the recursion limiter.
            $newRequestUrl = $this->requestUrl($newUrl, $crawlerId, $timeout, $recursion - 1);

            if (!is_array($newRequestUrl)) {
                $this->getLogger()->log(
                    LogLevel::DEBUG,
                    sprintf('Error while opening "%s"', $newUrl),
                    ['crawlerId' => $crawlerId]
                );
                return false;
            }

            $result = array_merge(['parentRequest' => $result], $newRequestUrl);
        }

        return $result;
    }
1734
1735
    /**
     * Gets the base path of the website frontend.
     * (e.g. if you call http://mydomain.com/cms/index.php in
     * the browser the base path is "/cms/")
     *
     * Sources are tried in this order: the extension setting "frontendBasePath",
     * the absRefPrefix of the current TSFE and - outside of CLI mode - the
     * TYPO3_SITE_PATH of the current request. The fallback is "/".
     *
     * @return string Base path of the website frontend
     */
    protected function getFrontendBasePath()
    {
        if (!empty($this->extensionSettings['frontendBasePath'])) {
            // Explicitly configured in the extension settings:
            $basePath = $this->extensionSettings['frontendBasePath'];
        } elseif (!empty($GLOBALS['TSFE']->absRefPrefix)) {
            // Otherwise try config.absRefPrefix:
            $basePath = $GLOBALS['TSFE']->absRefPrefix;
        } elseif (!defined('TYPO3_REQUESTTYPE_CLI') || !TYPO3_REQUESTTYPE_CLI) {
            // If not in CLI mode the base path can be determined from $_SERVER environment:
            $basePath = GeneralUtility::getIndpEnv('TYPO3_SITE_PATH');
        } else {
            $basePath = '/';
        }

        if ($basePath === '/') {
            return $basePath;
        }

        // Base path must be '/<pathSegments>/':
        return rtrim('/' . ltrim($basePath, '/'), '/') . '/';
    }
1765
1766
    /**
     * Runs the given command through shell_exec() and hands back what it printed.
     *
     * The command string is passed on unchanged; callers are responsible for escaping it.
     *
     * @param string $command Shell command to be executed
     * @return string Outputted result of the command execution
     */
    protected function executeShellCommand($command)
    {
        $commandOutput = shell_exec($command);

        return $commandOutput;
    }
1776
1777
    /**
     * Reads HTTP response from the given stream.
     *
     * Header lines are read (and trimmed) up to the first empty line; everything
     * after that is collected unchanged as content.
     *
     * @param  resource $streamPointer  Pointer to connection stream.
     * @return array                    Associative array with the following items:
     *                                  headers <array> Response headers sent by server.
     *                                  content <array> Content, with each line as an array item.
     */
    protected function getHttpResponseFromStream($streamPointer)
    {
        $response = ['headers' => [], 'content' => []];

        if (!is_resource($streamPointer)) {
            return $response;
        }

        // read headers
        while (($line = fgets($streamPointer, 2048)) !== false) {
            $line = trim($line);
            if ($line === '') {
                // The empty line separates headers from content.
                break;
            }
            $response['headers'][] = $line;
        }

        // read content
        // Compare against FALSE explicitly: a final line consisting of "0" is falsy
        // and would otherwise be dropped from the content.
        while (($line = fgets($streamPointer, 2048)) !== false) {
            $response['content'][] = $line;
        }

        return $response;
    }
1808
1809
    /**
     * Appends a timestamped line to the log file configured in the extension
     * setting "logFileName". Does nothing if no log file is configured.
     *
     * If the file cannot be written, a notice is sent to the logger instead.
     *
     * @param string $message Text to append to the log file
     * @return void
     */
    protected function log($message)
    {
        if (empty($this->extensionSettings['logFileName'])) {
            return;
        }

        $logLine = date('Ymd His') . ' ' . $message . PHP_EOL;
        $bytesWritten = @file_put_contents($this->extensionSettings['logFileName'], $logLine, FILE_APPEND);
        if ($bytesWritten) {
            return;
        }

        $this->getLogger()->log(
            LogLevel::INFO,
            sprintf('File "%s" could not be written, please check file permissions.', $this->extensionSettings['logFileName'])
        );
    }
1824
1825
    /**
     * Builds HTTP request headers.
     *
     * @param array $url URL parts as returned by parse_url(); "host" is expected, while
     *                   "path", "query", "user" and "pass" are optional
     * @param string $crawlerId Crawler ID string, sent in the X-T3crawler header
     *
     * @return array List of header lines, starting with the request line
     */
    protected function buildRequestHeaderArray(array $url, $crawlerId)
    {
        // parse_url() omits "path" for URLs like "http://example.com"; an empty
        // request target would be an invalid request line, so fall back to "/".
        $path = (isset($url['path']) && $url['path'] !== '') ? $url['path'] : '/';
        $query = (string)($url['query'] ?? '');
        $user = (string)($url['user'] ?? '');

        $reqHeaders = [];
        $reqHeaders[] = 'GET ' . $path . ($query !== '' ? '?' . $query : '') . ' HTTP/1.0';
        $reqHeaders[] = 'Host: ' . ($url['host'] ?? '');
        if (stripos($query, 'ADMCMD_previewWS') !== false) {
            $reqHeaders[] = 'Cookie: $Version="1"; be_typo_user="1"; $Path=/';
        }
        $reqHeaders[] = 'Connection: close';
        if ($user !== '') {
            $reqHeaders[] = 'Authorization: Basic ' . base64_encode($user . ':' . ($url['pass'] ?? ''));
        }
        $reqHeaders[] = 'X-T3crawler: ' . $crawlerId;
        $reqHeaders[] = 'User-Agent: TYPO3 crawler';

        return $reqHeaders;
    }
1849
1850
    /**
     * Check if the submitted HTTP-Header contains a redirect location and built new crawler-url
     *
     * Only responses whose status line contains "301 Moved", "302 Found" or "302 Moved"
     * are treated as redirects. If a user is given, the credentials are injected into
     * the redirect URL.
     *
     * @param array $headers HTTP Header lines; the first item is expected to be the status line
     * @param string $user HTTP Auth. User
     * @param string $pass HTTP Auth. Password
     * @return bool|string The URL to request next, or FALSE if there is no usable redirect
     */
    protected function getRequestUrlFrom302Header($headers, $user = '', $pass = '')
    {
        if (!is_array($headers) || !isset($headers[0])) {
            return false;
        }

        $statusLine = (string)$headers[0];
        $isRedirect = stristr($statusLine, '301 Moved')
            || stristr($statusLine, '302 Found')
            || stristr($statusLine, '302 Moved');
        if (!$isRedirect) {
            return false;
        }

        // Find the first Location header. Header names are case-insensitive and the
        // space after the colon is optional; lines without a colon (such as the
        // status line) are skipped. Splitting once keeps colons inside the value.
        $location = null;
        foreach ($headers as $headerLine) {
            $parts = explode(':', (string)$headerLine, 2);
            if (count($parts) !== 2) {
                continue;
            }
            if (strcasecmp(trim($parts[0]), 'Location') === 0) {
                $location = trim($parts[1]);
                break;
            }
        }
        if ($location === null) {
            return false;
        }

        if ($user == '') {
            return $location;
        }

        if (!($target = parse_url($location))) {
            return false;
        }

        // Rebuild the URL with the credentials in front of the host.
        // NOTE(review): a relative Location (no scheme/host) is not resolved against the
        // original URL here - confirm whether such redirects have to be supported.
        $newUrl = ($target['scheme'] ?? '') . '://' . $user . ':' . $pass . '@' . ($target['host'] ?? '');
        if (isset($target['port'])) {
            $newUrl .= ':' . $target['port'];
        }
        $newUrl .= $target['path'] ?? '';
        if (($target['query'] ?? '') != '') {
            $newUrl .= '?' . $target['query'];
        }

        return $newUrl;
    }
1892
1893
    /**************************
1894
     *
1895
     * tslib_fe hooks:
1896
     *
1897
     **************************/
1898
1899
    /**
     * Initialization hook (called after database connection)
     * Takes the "HTTP_X_T3CRAWLER" header and looks up queue record and verifies if the session comes from the system (by comparing hashes)
     *
     * The header has the form "<qid>:<hash>", where the hash is the md5 of
     * "<qid>|<set_id>|<encryptionKey>". Execution is aborted if the header is present
     * but does not match a queue record.
     *
     * @param array $params Parameters from frontend; $params['pObj']->applicationData['tx_crawler'] is filled on success
     * @param object $ref TSFE object (reference under PHP5); not used by this method
     * @return void
     *
     * FIXME: Look like this is not used, in commit 9910d3f40cce15f4e9b7bcd0488bf21f31d53ebc it's added as public,
     * FIXME: I think this can be removed. (TNM)
     */
    public function fe_init(&$params, $ref)
    {
        // Authenticate crawler request:
        if (!isset($_SERVER['HTTP_X_T3CRAWLER'])) {
            return;
        }

        $queryBuilder = GeneralUtility::makeInstance(ConnectionPool::class)->getQueryBuilderForTable($this->tableName);

        // Pad the parts so that a header without ":" yields an empty hash instead of an undefined offset.
        list($queueId, $hash) = array_pad(explode(':', (string)$_SERVER['HTTP_X_T3CRAWLER']), 2, '');

        $queueRec = $queryBuilder
            ->select('*')
            ->from('tx_crawler_queue')
            ->where(
                $queryBuilder->expr()->eq('qid', $queryBuilder->createNamedParameter($queueId, \PDO::PARAM_INT))
            )
            ->execute()
            ->fetch();

        $hashMatches = false;
        if (is_array($queueRec)) {
            $expectedHash = md5($queueRec['qid'] . '|' . $queueRec['set_id'] . '|' . $GLOBALS['TYPO3_CONF_VARS']['SYS']['encryptionKey']);
            // The hash comes from a request header, so compare it in constant time.
            $hashMatches = hash_equals($expectedHash, (string)$hash);
        }

        // If a crawler record was found and hash was matching, set it up:
        if ($hashMatches) {
            $params['pObj']->applicationData['tx_crawler']['running'] = true;
            $params['pObj']->applicationData['tx_crawler']['parameters'] = unserialize($queueRec['parameters']);
            $params['pObj']->applicationData['tx_crawler']['log'] = [];
        } else {
            die('No crawler entry found!');
        }
    }
1936
1937
    /*****************************
1938
     *
1939
     * Compiling URLs to crawl - tools
1940
     *
1941
     *****************************/
1942
1943
    /**
     * Builds the overview rows for a page and (optionally) the branch beneath it.
     *
     * The submission settings are stored on the controller, the duplicate and
     * download tracking arrays are reset, and then every page of the tree is
     * passed to drawURLs_addRowsForPage(). For mount point pages (doktype 7)
     * the mounted branch is traversed as well and $this->MP is set to
     * "<mount_pid>-<uid>" while the mount point page itself is rendered, but
     * only if "mount_pid_ol" is enabled; otherwise $this->MP stays false.
     *
     * @param int $id Root page id to start from.
     * @param int $depth Depth of tree, 0=only id-page, 1= on sublevel, 99 = infinite
     * @param int $scheduledTime Unix Time when the URL is timed to be visited when put in queue
     * @param int $reqMinute Number of requests per minute (creates the interleave between requests)
     * @param bool $submitCrawlUrls If set, submits the URLs to queue in database (real crawling)
     * @param bool $downloadCrawlUrls If set (and submitcrawlUrls is false) will fill $downloadUrls with entries)
     * @param array $incomingProcInstructions Array of processing instructions
     * @param array $configurationSelection Array of configuration keys
     * @return string Concatenated output of drawURLs_addRowsForPage() for all visited pages
     */
    public function getPageTreeAndUrls(
        $id,
        $depth,
        $scheduledTime,
        $reqMinute,
        $submitCrawlUrls,
        $downloadCrawlUrls,
        array $incomingProcInstructions,
        array $configurationSelection
    ) {
        // Make sure a language service is available before any row is rendered.
        if (!is_object($GLOBALS['LANG'] ?? null)) {
            $GLOBALS['LANG'] = GeneralUtility::makeInstance(LanguageService::class);
            $GLOBALS['LANG']->init(0);
        }

        $this->scheduledTime = $scheduledTime;
        $this->reqMinute = $reqMinute;
        $this->submitCrawlUrls = $submitCrawlUrls;
        $this->downloadCrawlUrls = $downloadCrawlUrls;
        $this->incomingProcInstructions = $incomingProcInstructions;
        $this->incomingConfigurationSelection = $configurationSelection;

        $this->duplicateTrack = [];
        $this->downloadUrls = [];

        // Drawing tree:
        /* @var PageTreeView $tree */
        $tree = GeneralUtility::makeInstance(PageTreeView::class);
        $perms_clause = $GLOBALS['BE_USER']->getPagePermsClause(1);
        $tree->init('AND ' . $perms_clause);

        $pageInfo = BackendUtility::readPageAccess($id, $perms_clause);
        if (is_array($pageInfo)) {
            // Set root row:
            $tree->tree[] = [
                'row' => $pageInfo,
                'HTML' => IconUtility::getIconForRecord('pages', $pageInfo)
            ];
        }

        // Get branch beneath:
        if ($depth) {
            $tree->getTree($id, $depth, '');
        }

        // Traverse page tree:
        $code = '';

        foreach ($tree->tree as $data) {
            $this->MP = false;

            // recognize mount points
            if ((int)$data['row']['doktype'] === 7) {
                // The query targets "pages", so the builder must be resolved for that table.
                $queryBuilder = GeneralUtility::makeInstance(ConnectionPool::class)->getQueryBuilderForTable('pages');
                $queryBuilder->getRestrictions()->removeAll()->add(GeneralUtility::makeInstance(DeletedRestriction::class));
                $mountPage = $queryBuilder
                    ->select('*')
                    ->from('pages')
                    ->where(
                        $queryBuilder->expr()->eq('uid', $queryBuilder->createNamedParameter($data['row']['uid'], \PDO::PARAM_INT))
                    )
                    ->execute()
                    ->fetch();

                // Without a page record there is nothing to mount; render the page as a regular one.
                if (is_array($mountPage)) {
                    // fetch mounted pages
                    $this->MP = $mountPage['mount_pid'] . '-' . $data['row']['uid'];

                    $mountTree = GeneralUtility::makeInstance(PageTreeView::class);
                    $mountTree->init('AND ' . $perms_clause);
                    $mountTree->getTree($mountPage['mount_pid'], $depth, '');

                    foreach ($mountTree->tree as $mountData) {
                        $code .= $this->drawURLs_addRowsForPage(
                            $mountData['row'],
                            $mountData['HTML'] . BackendUtility::getRecordTitle('pages', $mountData['row'], true)
                        );
                    }

                    if ($mountPage['mount_pid_ol']) {
                        // replace page when mount_pid_ol is enabled
                        $data['row']['uid'] = $mountPage['mount_pid'];
                    } else {
                        // if the mount_pid_ol is not set the MP must not be used for the mountpoint page
                        $this->MP = false;
                    }
                }
            }

            $code .= $this->drawURLs_addRowsForPage(
                $data['row'],
                $data['HTML'] . BackendUtility::getRecordTitle('pages', $data['row'], true)
            );
        }

        return $code;
    }
2051
2052
    /**
     * Expands an exclude string into a list of page ids.
     *
     * The exclude string is a comma separated list of parts, each being either
     * "pid" (that page only), "pid+" (the page and its branch down to depth 99)
     * or "pid+depth" (the page and its branch down to the given depth).
     * Results are memoized per exclude string, and page trees per pid/depth,
     * in function-static caches.
     *
     * @param string $excludeString Exclude string
     * @return array List of unique page ids
     */
    public function expandExcludeString($excludeString)
    {
        // internal static caches
        static $expandedExcludeStringCache = [];
        static $treeCache = [];

        // Normalise so that null/ints are safe to use as cache keys.
        $excludeString = (string)$excludeString;

        // isset() instead of empty(): empty expansions are valid results and must be cached too.
        if (isset($expandedExcludeStringCache[$excludeString])) {
            return $expandedExcludeStringCache[$excludeString];
        }

        $pidList = [];

        if (!empty($excludeString)) {
            /** @var PageTreeView $tree */
            $tree = GeneralUtility::makeInstance(PageTreeView::class);
            $tree->init('AND ' . $this->backendUser->getPagePermsClause(1));

            foreach (GeneralUtility::trimExplode(',', $excludeString) as $excludePart) {
                // Pad so that a part without "+" does not read an undefined offset.
                list($pid, $depth) = array_pad(GeneralUtility::trimExplode('+', $excludePart), 2, '');

                // default is "page only" = "depth=0"; a trailing "+" without depth means "whole branch"
                if (empty($depth)) {
                    $depth = strpos($excludePart, '+') !== false ? 99 : 0;
                }
                $depth = (int)$depth;

                $pidList[] = $pid;

                if ($depth <= 0) {
                    continue;
                }

                // isset() so that pages without sub pages (empty tree) are not fetched again.
                if (!isset($treeCache[$pid][$depth])) {
                    $tree->reset();
                    $tree->getTree($pid, $depth);
                    $treeCache[$pid][$depth] = $tree->tree;
                }

                foreach ($treeCache[$pid][$depth] as $data) {
                    $pidList[] = $data['row']['uid'];
                }
            }
        }

        $expandedExcludeStringCache[$excludeString] = array_unique($pidList);

        return $expandedExcludeStringCache[$excludeString];
    }
2103
2104
    /**
     * Create the rows for display of the page tree
     * For each page a number of rows are shown displaying GET variable configuration
     *
     * Configurations not contained in $this->incomingConfigurationSelection are
     * dropped (if a selection is set). For every remaining configuration one
     * table row is rendered; if the page is listed in the configuration's
     * "exclude" setting a "No entries" row is rendered instead. Pages without
     * any configuration get a single "No entries" row.
     *
     * @param array $pageRow Page row
     * @param string $pageTitleAndIcon Page icon and title for row (HTML, inserted as is)
     * @return string HTML <tr> content (one or more)
     */
    public function drawURLs_addRowsForPage(array $pageRow, $pageTitleAndIcon)
    {
        // NOTE(review): presumably filled by reference inside getUrlsForPageRow() - confirm against its signature.
        $skipMessage = '';

        // Get list of configurations
        $configurations = $this->getUrlsForPageRow($pageRow, $skipMessage);

        if (count($this->incomingConfigurationSelection) > 0) {
            // remove configuration that does not match the current selection
            foreach (array_keys($configurations) as $confKey) {
                if (!in_array($confKey, $this->incomingConfigurationSelection)) {
                    unset($configurations[$confKey]);
                }
            }
        }

        if (count($configurations) === 0) {
            $message = !empty($skipMessage) ? ' (' . $skipMessage . ')' : '';

            // Compile row:
            return '
                <tr class="bgColor-20" style="border-bottom: 1px solid black;">
                    <td>' . $pageTitleAndIcon . '</td>
                    <td colspan="6"><em>No entries</em>' . $message . '</td>
                </tr>';
        }

        // Labels of the optional sub configuration values shown in the "options" column.
        $optionLabels = [
            'userGroups' => 'User Groups: ',
            'baseUrl' => 'Base Url: ',
            'procInstrFilter' => 'ProcInstr: ',
        ];

        // Traverse parameter combinations:
        $c = 0;
        $content = '';
        $configurationCount = count($configurations);
        foreach ($configurations as $confKey => $confArray) {
            $subCfg = $confArray['subCfg'];
            $rowClass = 'bgColor' . ($c % 2 ? '-20' : '-10');

            // Title column: only the first row carries the (row spanning) page title cell
            $titleClm = $c === 0
                ? '<td rowspan="' . $configurationCount . '">' . $pageTitleAndIcon . '</td>'
                : '';

            if (in_array($pageRow['uid'], $this->expandExcludeString($subCfg['exclude'] ?? ''))) {
                $content .= '<tr class="' . $rowClass . '">
                            ' . $titleClm . '
                            <td>' . htmlspecialchars($confKey) . '</td>
                            <td colspan="5"><em>No entries</em> (Page is excluded in this configuration)</td>
                        </tr>';
                $c++;
                continue;
            }

            // URL list:
            $urlList = $this->urlListFromUrlArray(
                $confArray,
                $pageRow,
                $this->scheduledTime,
                $this->reqMinute,
                $this->submitCrawlUrls,
                $this->downloadCrawlUrls,
                $this->duplicateTrack,
                $this->downloadUrls,
                $this->incomingProcInstructions // if empty the urls won't be filtered by processing instructions
            );

            // Expanded parameters:
            $paramExpanded = '';
            $calcAccu = [];
            $calcRes = 1;
            foreach ($confArray['paramExpanded'] as $gVar => $gVal) {
                $valueCount = count($gVal);
                $paramExpanded .= '
                            <tr>
                                <td class="bgColor4-20">' . htmlspecialchars('&' . $gVar . '=') . '<br/>' .
                                                '(' . $valueCount . ')' .
                                                '</td>
                                <td class="bgColor4" nowrap="nowrap">' . nl2br(htmlspecialchars(implode(chr(10), $gVal))) . '</td>
                            </tr>
                        ';
                $calcRes *= $valueCount;
                $calcAccu[] = $valueCount;
            }
            $paramExpanded = '<table class="lrPadding c-list param-expanded">' . $paramExpanded . '</table>';
            $paramExpanded .= 'Comb: ' . implode('*', $calcAccu) . '=' . $calcRes;

            // Options: escaped like the configuration key, as they end up in HTML
            $optionValues = '';
            foreach ($optionLabels as $optionKey => $optionLabel) {
                if (!empty($subCfg[$optionKey])) {
                    $optionValues .= $optionLabel . htmlspecialchars((string)$subCfg[$optionKey]) . '<br/>';
                }
            }

            // Compile row:
            $content .= '
                        <tr class="' . $rowClass . '">
                            ' . $titleClm . '
                            <td>' . htmlspecialchars($confKey) . '</td>
                            <td>' . nl2br(htmlspecialchars(rawurldecode(trim(str_replace('&', chr(10) . '&', GeneralUtility::implodeArrayForUrl('', $confArray['paramParsed'])))))) . '</td>
                            <td>' . $paramExpanded . '</td>
                            <td nowrap="nowrap">' . $urlList . '</td>
                            <td nowrap="nowrap">' . $optionValues . '</td>
                            <td nowrap="nowrap">' . DebugUtility::viewArray($subCfg['procInstrParams.'] ?? null) . '</td>
                        </tr>';

            $c++;
        }

        return $content;
    }
2221
2222
    /*****************************
2223
     *
2224
     * CLI functions
2225
     *
2226
     *****************************/
2227
2228
    /**
     * Helper function: returns one value of a parsed CLI option.
     *
     * Reads from $this->cliArgs, the property that setCliArgs() fills.
     *
     * @param string $option Option string, eg. "-s"
     * @param int $idx Value index, default is 0 (zero) = the first one...
     * @return string The value, or an empty string if the option or the index is not set
     */
    private function cli_argValue($option, $idx = 0)
    {
        $values = $this->cliArgs[$option] ?? null;
        if (!is_array($values)) {
            return '';
        }

        return $values[$idx] ?? '';
    }
2239
2240
    /**
     * Helper function: prints a line to the standard output.
     *
     * The previous implementation delegated to outputLine(), which static
     * analysis reports as not existing on this class; the string is therefore
     * echoed directly, followed by a line break.
     *
     * @param string $string The string to output
     * @return void
     */
    private function cli_echo($string)
    {
        echo $string . PHP_EOL;
    }
2249
2250
    /**
     * Set cli args
     *
     * This is a copy from the CommandLineController from TYPO3 < v9
     *
     * Tokens starting with "-" (and not followed by a digit, which would be a
     * negative number) open a new option; "-opt=value" additionally stores the
     * value. All other tokens are appended as values to the most recently
     * opened option, or to "_DEFAULT" if no option was opened yet. The result
     * is stored in $this->cliArgs. Using an option twice prints an error and
     * terminates the script.
     *
     * TODO: Rework
     *
     * @param array $argv
     * @return void
     */
    private function setCliArgs(array $argv)
    {
        $cli_options = [];
        $index = '_DEFAULT';
        foreach ($argv as $token) {
            // Options starting with a number is invalid - they could be negative values!
            // The offsets are guarded so that "" and "-" do not read undefined string offsets.
            $isOption = isset($token[0])
                && $token[0] === '-'
                && !MathUtility::canBeInterpretedAsInteger($token[1] ?? '');

            if (!$isOption) {
                $cli_options[$index][] = $token;
                continue;
            }

            // Pad so that an option without "=value" yields null instead of an undefined offset.
            list($index, $opt) = array_pad(explode('=', $token, 2), 2, null);
            if (isset($cli_options[$index])) {
                echo 'ERROR: Option ' . $index . ' was used twice!' . LF;
                die;
            }
            $cli_options[$index] = [];
            if ($opt !== null) {
                $cli_options[$index][] = $opt;
            }
        }

        $this->cliArgs = $cli_options;
    }
2282
2283
    /**
     * Obtains configuration keys from the CLI arguments
     *
     * Reads the first value of the "-conf" option and splits it at commas.
     *
     * @return array Array of keys, empty if the option is missing or blank
     */
    protected function getConfigurationKeys()
    {
        // The value index is passed explicitly; cli_argValue() expects two arguments.
        $parameter = trim((string)$this->cli_argValue('-conf', 0));

        return $parameter !== '' ? GeneralUtility::trimExplode(',', $parameter) : [];
    }
2293
2294
    /**
     * Running the functionality of the CLI (crawling URLs from queue)
     *
     * Steps: run the CLI hooks, purge executed queue entries older than the
     * "purgeQueueDays" extension setting (if > 0), select up to $countInARun
     * due and unassigned queue entries, assign them to the current process and
     * pass them one by one to readUrl().
     *
     * @param int $countInARun Maximum number of queue entries to process in this run
     * @param int $sleepTime Microseconds to sleep after each processed entry
     * @param int $sleepAfterFinish Seconds to sleep after all entries were processed
     * @return int Bitmask built from the readUrl() results and the CLI_STATUS_* constants
     */
    public function CLI_run($countInARun, $sleepTime, $sleepAfterFinish)
    {
        $connectionPool = GeneralUtility::makeInstance(ConnectionPool::class);
        $result = 0;
        $counter = 0;

        // First, run hooks:
        $this->CLI_runHooks();

        // Clean up the queue
        $purgeQueueDays = (int)$this->extensionSettings['purgeQueueDays'];
        if ($purgeQueueDays > 0) {
            $purgeDate = $this->getCurrentTime() - 24 * 60 * 60 * $purgeQueueDays;

            // A dedicated builder per statement, so that no query parts leak into the next query.
            $purgeQueryBuilder = $connectionPool->getQueryBuilderForTable($this->tableName);
            $del = $purgeQueryBuilder
                ->delete($this->tableName)
                ->where(
                    $purgeQueryBuilder->expr()->neq('exec_time', 0),
                    $purgeQueryBuilder->expr()->lt(
                        'exec_time',
                        $purgeQueryBuilder->createNamedParameter($purgeDate, \PDO::PARAM_INT)
                    )
                )
                ->execute(); // without execute() the purge statement is never sent to the database

            if (false === $del) {
                $this->getLogger()->log(
                    LogLevel::INFO,
                    'Records could not be deleted.'
                );
            }
        }

        // Select entries:
        //TODO Shouldn't this reside within the transaction?
        $selectQueryBuilder = $connectionPool->getQueryBuilderForTable($this->tableName);
        $rows = $selectQueryBuilder
            ->select('qid', 'scheduled')
            ->from('tx_crawler_queue')
            ->where(
                $selectQueryBuilder->expr()->eq('exec_time', 0),
                $selectQueryBuilder->expr()->eq('process_scheduled', 0),
                $selectQueryBuilder->expr()->lte('scheduled', $this->getCurrentTime())
            )
            ->orderBy('scheduled')
            ->addOrderBy('qid')
            ->setMaxResults($countInARun)
            ->execute()
            ->fetchAll();

        if (count($rows) > 0) {
            $quidList = [];
            foreach ($rows as $r) {
                $quidList[] = (int)$r['qid'];
            }

            $processId = $this->CLI_buildProcessId();

            // reserve queue entries for process
            //TODO wrap the reservation in a transaction (BEGIN / COMMIT / ROLLBACK)
            //TODO make sure we're not taking assigned queue-entires

            // save the number of assigned queue entrys to determine who many have been processed later
            $updateQueryBuilder = $connectionPool->getQueryBuilderForTable($this->tableName);
            $numberOfAffectedRows = $updateQueryBuilder
                ->update('tx_crawler_queue')
                ->where(
                    $updateQueryBuilder->expr()->in('qid', $quidList)
                )
                // set() creates the named parameter itself, so plain values are passed
                ->set('process_scheduled', $this->getCurrentTime())
                ->set('process_id', $processId)
                ->execute();

            // The process id is a string (see CLI_buildProcessId()), so it must not be cast to int.
            $connectionPool->getConnectionForTable('tx_crawler_process')
                ->update(
                    'tx_crawler_process',
                    ['assigned_items_count' => (int)$numberOfAffectedRows],
                    ['process_id' => $processId]
                );

            if ($numberOfAffectedRows != count($quidList)) {
                $this->CLI_debug("Nothing processed due to multi-process collision (" . $this->CLI_buildProcessId() . ")");
                return ($result | self::CLI_STATUS_ABORTED);
            }

            foreach ($rows as $r) {
                $result |= $this->readUrl($r['qid']);

                $counter++;
                usleep(intval($sleepTime)); // Just to relax the system

                // if during the start and the current read url the cli has been disable we need to return from the function
                // mark the process NOT as ended.
                if ($this->getDisabled()) {
                    return ($result | self::CLI_STATUS_ABORTED);
                }

                if (!$this->CLI_checkIfProcessIsActive($this->CLI_buildProcessId())) {
                    $this->CLI_debug("conflict / timeout (" . $this->CLI_buildProcessId() . ")");

                    //TODO might need an additional returncode
                    $result |= self::CLI_STATUS_ABORTED;
                    break; //possible timeout
                }
            }

            sleep(intval($sleepAfterFinish));

            $msg = 'Rows: ' . $counter;
            $this->CLI_debug($msg . " (" . $this->CLI_buildProcessId() . ")");
        } else {
            $this->CLI_debug("Nothing within queue which needs to be processed (" . $this->CLI_buildProcessId() . ")");
        }

        if ($counter > 0) {
            $result |= self::CLI_STATUS_PROCESSED;
        }

        return $result;
    }
2418
2419
    /**
     * Activate hooks
     *
     * Each entry of $TYPO3_CONF_VARS['EXTCONF']['crawler']['cli_hooks'] is
     * instantiated via GeneralUtility::makeInstance(); if that yields an
     * object, its crawler_init() method is called with this controller.
     *
     * @return void
     */
    public function CLI_runHooks()
    {
        global $TYPO3_CONF_VARS;

        $hookReferences = $TYPO3_CONF_VARS['EXTCONF']['crawler']['cli_hooks'];
        if (!is_array($hookReferences)) {
            // no hooks registered
            return;
        }

        foreach ($hookReferences as $hookReference) {
            $hook = GeneralUtility::makeInstance($hookReference);
            if (!is_object($hook)) {
                continue;
            }
            $hook->crawler_init($this);
        }
    }
2436
2437
    /**
     * Try to acquire a new process with the given id
     * also performs some auto-cleanup for orphan processes
     *
     * Active, non-deleted processes whose ttl lies in the past are treated as
     * orphans; the remaining ones count against the "processLimit" extension
     * setting. If the limit is not reached, a process record is inserted with a
     * ttl of now + "processMaxRunTime". Afterwards orphans are released and
     * processes marked as deleted are removed, regardless of the outcome.
     *
     * @todo preemption might not be the most elegant way to clean up
     * @todo the statements below are meant to run in one transaction (BEGIN/COMMIT are currently disabled)
     *
     * @param string $id identification string for the process
     * @return boolean true if the process record was added, false otherwise
     */
    public function CLI_checkAndAcquireNewProcess($id)
    {
        $queryBuilder = GeneralUtility::makeInstance(ConnectionPool::class)->getQueryBuilderForTable($this->tableName);

        $systemProcessId = getmypid();
        if ($systemProcessId < 1) {
            // no usable system process id available
            return false;
        }

        $statement = $queryBuilder
            ->select('process_id', 'ttl')
            ->from('tx_crawler_process')
            ->where(
                'active = 1 AND deleted = 0'
            )
            ->execute();

        $currentTime = $this->getCurrentTime();

        // Split the active processes into expired (orphaned) and still running ones.
        $runningProcesses = 0;
        $expiredProcessIds = [];
        while ($row = $statement->fetch()) {
            if ($row['ttl'] < $currentTime) {
                $expiredProcessIds[] = $row['process_id'];
                continue;
            }
            $runningProcesses++;
        }

        $processLimit = intval($this->extensionSettings['processLimit']);

        // if there are less than allowed active processes then add a new one
        $acquired = $runningProcesses < $processLimit;
        if ($acquired) {
            $this->CLI_debug("add process " . $this->CLI_buildProcessId() . " (" . ($runningProcesses + 1) . "/" . $processLimit . ")");

            $processRecord = [
                'process_id' => $id,
                'active' => 1,
                'ttl' => $currentTime + (int)$this->extensionSettings['processMaxRunTime'],
                'system_process_id' => $systemProcessId
            ];
            GeneralUtility::makeInstance(ConnectionPool::class)
                ->getConnectionForTable('tx_crawler_process')
                ->insert('tx_crawler_process', $processRecord);
        } else {
            $this->CLI_debug("Processlimit reached (" . ($runningProcesses) . "/" . $processLimit . ")");
        }

        $this->CLI_releaseProcesses($expiredProcessIds, true); // maybe this should be somehow included into the current lock
        $this->CLI_deleteProcessesMarkedDeleted();

        return $acquired;
    }
2503
2504
    /**
     * Release a process and the required resources
     *
     * Runs four UPDATE statements on one shared query builder, in this order:
     *  1. detach queue entries that belong to inactive processes,
     *  2. reset system_process_id of inactive processes that still have
     *     unprocessed queue entries,
     *  3. mark the requested processes as non-active, provided they have no
     *     unprocessed queue entries and are not deleted,
     *  4. detach the unprocessed queue entries of the requested processes.
     *
     * @todo the statements are meant to run in one transaction unless $withinLock is set (BEGIN/COMMIT are currently disabled)
     *
     * @param  mixed    $releaseIds   string with a single process-id or array with multiple process-ids
     * @param  boolean  $withinLock   show whether the DB-actions are included within an existing lock (currently without effect)
     * @return boolean  false if there was nothing to release, true otherwise
     */
    public function CLI_releaseProcesses($releaseIds, $withinLock = false)
    {
        $queryBuilder = GeneralUtility::makeInstance(ConnectionPool::class)->getQueryBuilderForTable($this->tableName);

        $processIds = is_array($releaseIds) ? $releaseIds : [$releaseIds];
        if (count($processIds) === 0) {
            return false;   //nothing to release
        }

        // some kind of 2nd chance algo - this way you need at least 2 processes to have a real cleanup
        // this ensures that a single process can't mess up the entire process table

        // (1) free all queue entries that are assigned to processes which are not active
        $queryBuilder
            ->update('tx_crawler_queue', 'q')
            ->where(
                'q.process_id IN(SELECT p.process_id FROM tx_crawler_process as p WHERE p.active = 0)'
            )
            ->set('q.process_scheduled', 0)
            ->set('q.process_id', '')
            ->execute();

        // FIXME: Not entirely sure that this is equivalent to the previous version.
        // The previous version set deleted=1 and system_process_id=0 on all processes with
        // active=0 AND deleted=0 that had NO queue entry with exec_time=0.
        $queryBuilder->resetQueryPart('set');

        // (2)
        $queryBuilder
            ->update('tx_crawler_process')
            ->where(
                $queryBuilder->expr()->eq('active', 0),
                'process_id IN(SELECT q.process_id FROM tx_crawler_queue as q WHERE q.exec_time = 0)'
            )
            ->set('system_process_id', 0)
            ->execute();

        // (3) mark all requested processes as non-active
        // NOTE(review): the "set" part is not reset before this statement, so the system_process_id
        // assignment of statement (2) presumably is part of this UPDATE as well - confirm.
        $queryBuilder
            ->update('tx_crawler_process')
            ->where(
                'NOT EXISTS (
                SELECT * FROM tx_crawler_queue
                    WHERE tx_crawler_queue.process_id = tx_crawler_process.process_id
                    AND tx_crawler_queue.exec_time = 0
                )',
                $queryBuilder->expr()->in('process_id', $queryBuilder->createNamedParameter($processIds, Connection::PARAM_STR_ARRAY)),
                $queryBuilder->expr()->eq('deleted', 0)
            )
            ->set('active', 0)
            ->execute();

        $queryBuilder->resetQueryPart('set');

        // (4) free the unprocessed queue entries of the requested processes
        $queryBuilder
            ->update('tx_crawler_queue')
            ->where(
                $queryBuilder->expr()->eq('exec_time', 0),
                $queryBuilder->expr()->in('process_id', $queryBuilder->createNamedParameter($processIds, Connection::PARAM_STR_ARRAY))
            )
            ->set('process_scheduled', 0)
            ->set('process_id', '')
            ->execute();

        return true;
    }
2598
2599
    /**
     * Delete processes marked as deleted
     *
     * Removes every record with deleted = 1 from tx_crawler_process.
     *
     * @return void
     *
     * @deprecated since crawler v7.0.0, will be removed in crawler v8.0.0.
     * Please Consider using $this->processRepository->deleteProcessesMarkedAsDeleted()
     */
    public function CLI_deleteProcessesMarkedDeleted()
    {
        $deleteQuery = GeneralUtility::makeInstance(ConnectionPool::class)
            ->getQueryBuilderForTable($this->tableName)
            ->delete('tx_crawler_process')
            ->where('deleted = 1');

        $deleteQuery->execute();
    }
2615
2616
    /**
     * Check if there are still resources left for the process with the given id
     * Used to determine timeouts and to ensure a proper cleanup if there's a timeout
     *
     * Looks up the process record (ordered by ttl) and evaluates the "active"
     * flag of the first row found.
     *
     * @param  string $pid identification string for the process
     * @return boolean determines if the process is still active / has resources
     *
     * TODO: Please consider moving this to Domain Model for Process or in ProcessRepository
     */
    public function CLI_checkIfProcessIsActive($pid)
    {
        $queryBuilder = GeneralUtility::makeInstance(ConnectionPool::class)->getQueryBuilderForTable($this->tableName);

        // Process ids are strings (see CLI_buildProcessId()); compare them as such instead of casting to int.
        $row = $queryBuilder
            ->select('active')
            ->from('tx_crawler_process')
            ->where(
                $queryBuilder->expr()->eq('process_id', $queryBuilder->createNamedParameter((string)$pid))
            )
            ->orderBy('ttl')
            ->execute()
            ->fetch();

        return is_array($row) && (int)$row['active'] === 1;
    }
2645
2646
    /**
     * Create a unique Id for the current process
     *
     * The id is generated once, as short MD5 hash of the current microtime,
     * stored in $this->processID and returned unchanged on subsequent calls.
     *
     * @return string  the ID
     */
    public function CLI_buildProcessId()
    {
        if ($this->processID) {
            return $this->processID;
        }

        $this->processID = GeneralUtility::shortMD5($this->microtime(true));

        return $this->processID;
    }
2658
2659
    /**
     * Thin wrapper around PHP's microtime().
     *
     * @param bool $get_as_float Passed through to microtime()
     *
     * @return mixed Whatever microtime() returns for the given flag
     */
    protected function microtime($get_as_float = false)
    {
        $currentMicrotime = microtime($get_as_float);

        return $currentMicrotime;
    }
2668
2669
    /**
     * Prints a message to the stdout (only if debug-mode is enabled)
     *
     * Debug mode is controlled by the "processDebug" extension setting.
     *
     * @param  string $msg  the message
     * @return void
     */
    public function CLI_debug($msg)
    {
        $debugEnabled = (bool)intval($this->extensionSettings['processDebug']);
        if (!$debugEnabled) {
            return;
        }

        echo $msg . "\n";
        flush();
    }
2681
2682
    /**
     * Get URL content by making direct request to TYPO3.
     *
     * Builds a shell command from the configured PHP binary ("phpPath"), the
     * crawler's cli/bootstrap.php, the frontend base path, the URL and the
     * base64 encoded, serialized request headers, executes it and logs the URL
     * together with the elapsed time.
     *
     * @param  string $url          Page URL
     * @param  int    $crawlerId    Crawler-ID
     * @return array  Empty if the URL cannot be parsed, otherwise the keys "request", "headers" and "content"
     */
    protected function sendDirectRequest($url, $crawlerId)
    {
        $urlParts = parse_url($url);
        if (!is_array($urlParts)) {
            return [];
        }

        $requestHeaders = $this->buildRequestHeaderArray($urlParts, $crawlerId);

        // Every part is escaped on its own before the command line is assembled.
        $commandParts = [
            escapeshellcmd($this->extensionSettings['phpPath']),
            escapeshellarg(ExtensionManagementUtility::extPath('crawler') . 'cli/bootstrap.php'),
            escapeshellarg($this->getFrontendBasePath()),
            escapeshellarg($url),
            escapeshellarg(base64_encode(serialize($requestHeaders))),
        ];
        $cmd = implode(' ', $commandParts);

        $startTime = microtime(true);
        $content = $this->executeShellCommand($cmd);
        $this->log($url . ' ' . (microtime(true) - $startTime));

        return [
            'request' => implode("\r\n", $requestHeaders) . "\r\n\r\n",
            'headers' => '',
            'content' => $content
        ];
    }
2720
2721
    /**
     * Cleans up entries that stayed for too long in the queue. These are:
     * - processed entries (exec_time set) older than the "cleanUpProcessedAge"
     *   extension setting
     * - entries scheduled at or before the "cleanUpScheduledAge" extension setting
     *
     * Both settings are given in days and converted to seconds here. The
     * resulting SQL condition is handed to flushQueue().
     *
     * @return void
     */
    public function cleanUpOldQueueEntries()
    {
        $secondsPerDay = 86400; // 24*60*60 Seconds in 24 hours
        $currentTime = time();

        $processedBefore = $currentTime - $this->extensionSettings['cleanUpProcessedAge'] * $secondsPerDay;
        $scheduledBefore = $currentTime - $this->extensionSettings['cleanUpScheduledAge'] * $secondsPerDay;

        $this->flushQueue(
            '(exec_time<>0 AND exec_time<' . $processedBefore . ') OR scheduled<=' . $scheduledBefore
        );
    }
2737
2738
    /**
     * Initializes a TypoScript Frontend necessary for using TypoScript and TypoLink functions
     *
     * Creates the time tracker in $GLOBALS['TT'] if it is not an object yet and
     * stores a freshly built TypoScriptFrontendController in $GLOBALS['TSFE'].
     * The order of the initialization calls below is kept exactly as it was.
     *
     * @param int $id       page id handed to the frontend controller and used for the root line
     * @param int $typeNum  type number handed to the frontend controller
     *
     * @return void
     */
    protected function initTSFE($id = 1, $typeNum = 0)
    {
        // EidUtility::initTCA() is deprecated since TYPO3 v9.4 and will be removed
        // in TYPO3 v10.0 (TCA is available at any time there). The call is kept
        // here unchanged.
        EidUtility::initTCA();

        if (!is_object($GLOBALS['TT'])) {
            $timeTracker = new TimeTracker(false);
            $GLOBALS['TT'] = $timeTracker;
            $timeTracker->start();
        }

        // Publish the controller globally before any of its init methods run.
        $tsfe = GeneralUtility::makeInstance(
            TypoScriptFrontendController::class,
            $GLOBALS['TYPO3_CONF_VARS'],
            $id,
            $typeNum
        );
        $GLOBALS['TSFE'] = $tsfe;

        $tsfe->sys_page = GeneralUtility::makeInstance(PageRepository::class);
        $tsfe->sys_page->init(true);
        $tsfe->initFEuser();
        $tsfe->determineId();
        $tsfe->initTemplate();
        $tsfe->rootLine = $tsfe->sys_page->getRootLine($id, '');
        $tsfe->getConfigArray();
    }
2763
2764
    /**
     * Returns a md5 hash generated from a serialized configuration array.
     *
     * The keys "paramExpanded" and "URLs" are removed before hashing, so two
     * configurations that differ only in those keys get the same hash. The
     * caller's array is not modified (it is passed by value).
     *
     * @param array $configuration
     *
     * @return string
     */
    protected function getConfigurationHash(array $configuration)
    {
        unset($configuration['paramExpanded'], $configuration['URLs']);

        $serializedConfiguration = serialize($configuration);

        return md5($serializedConfiguration);
    }
2777
2778
    /**
     * Check whether the Crawling Protocol should be http or https
     *
     * The crawler configuration value decides:
     *  -1 => never https
     *   0 => use the page configuration (converted to bool)
     *   1 => always https
     * Any other value results in false.
     *
     * The value is matched with switch's loose comparison, so numeric strings
     * such as "1" are matched as well.
     *
     * @param mixed $crawlerConfiguration protocol mode, see list above
     * @param mixed $pageConfiguration    truthy if the page itself asks for https
     *
     * @return bool
     */
    protected function isCrawlingProtocolHttps($crawlerConfiguration, $pageConfiguration)
    {
        switch ($crawlerConfiguration) {
            case -1:
                return false;
            case 0:
                // Cast so the documented bool return type also holds when the
                // page value is an int or string.
                return (bool)$pageConfiguration;
            case 1:
                return true;
            default:
                return false;
        }
    }
2799
}
2800