Completed
Push — issue/252 ( b288df...34f654 )
by Tomas Norre
10:26 queued 02:05
created

CrawlerController::expandParameters()   F

Complexity

Conditions 24
Paths 501

Size

Total Lines 110

Duplication

Lines 0
Ratio 0 %

Code Coverage

Tests 0
CRAP Score 600

Importance

Changes 0
Metric Value
cc 24
nc 501
nop 2
dl 0
loc 110
ccs 0
cts 62
cp 0
crap 600
rs 0.5544
c 0
b 0
f 0

How to fix   Long Method    Complexity   

Long Method

Small methods make your code easier to understand, in particular if combined with a good name. Besides, if your method is small, finding a good name is usually much easier.

For example, if you find yourself adding comments to a method's body, this is usually a good sign to extract the commented part to a new method, and use the comment as a starting point when coming up with a good name for this new method.

Commonly applied refactorings include:

1
<?php
2
namespace AOE\Crawler\Controller;
3
4
/***************************************************************
5
 *  Copyright notice
6
 *
7
 *  (c) 2017 AOE GmbH <[email protected]>
8
 *
9
 *  All rights reserved
10
 *
11
 *  This script is part of the TYPO3 project. The TYPO3 project is
12
 *  free software; you can redistribute it and/or modify
13
 *  it under the terms of the GNU General Public License as published by
14
 *  the Free Software Foundation; either version 3 of the License, or
15
 *  (at your option) any later version.
16
 *
17
 *  The GNU General Public License can be found at
18
 *  http://www.gnu.org/copyleft/gpl.html.
19
 *
20
 *  This script is distributed in the hope that it will be useful,
21
 *  but WITHOUT ANY WARRANTY; without even the implied warranty of
22
 *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
23
 *  GNU General Public License for more details.
24
 *
25
 *  This copyright notice MUST APPEAR in all copies of the script!
26
 ***************************************************************/
27
28
use AOE\Crawler\Command\CrawlerCommandLineController;
29
use AOE\Crawler\Command\FlushCommandLineController;
30
use AOE\Crawler\Command\QueueCommandLineController;
31
use AOE\Crawler\Domain\Model\Reason;
32
use AOE\Crawler\Domain\Repository\QueueRepository;
33
use AOE\Crawler\Event\EventDispatcher;
34
use AOE\Crawler\Utility\IconUtility;
35
use AOE\Crawler\Utility\SignalSlotUtility;
36
use TYPO3\CMS\Backend\Utility\BackendUtility;
37
use TYPO3\CMS\Backend\Tree\View\PageTreeView;
38
use TYPO3\CMS\Core\Authentication\BackendUserAuthentication;
39
use TYPO3\CMS\Core\Database\DatabaseConnection;
40
use TYPO3\CMS\Core\Log\LogLevel;
41
use TYPO3\CMS\Core\TimeTracker\NullTimeTracker;
42
use TYPO3\CMS\Core\Utility\DebugUtility;
43
use TYPO3\CMS\Core\Utility\ExtensionManagementUtility;
44
use TYPO3\CMS\Core\Utility\GeneralUtility;
45
use TYPO3\CMS\Core\Utility\MathUtility;
46
use TYPO3\CMS\Extbase\Object\ObjectManager;
47
use TYPO3\CMS\Frontend\Controller\TypoScriptFrontendController;
48
use TYPO3\CMS\Frontend\Page\PageGenerator;
49
use TYPO3\CMS\Frontend\Page\PageRepository;
50
use TYPO3\CMS\Frontend\Utility\EidUtility;
51
use TYPO3\CMS\Lang\LanguageService;
52
53
/**
54
 * Class CrawlerController
55
 *
56
 * @package AOE\Crawler\Controller
57
 */
58
class CrawlerController
59
{
60
    // Status codes for CLI runs.
    // NOTE(review): the values are 0 and powers of two, which suggests they are meant to be
    // combined as bit flags - confirm against the code that consumes them.
    const CLI_STATUS_NOTHING_PROCCESSED = 0;
61
    const CLI_STATUS_REMAIN = 1; // queue not empty
62
    const CLI_STATUS_PROCESSED = 2; // (some) queue items were processed
63
    const CLI_STATUS_ABORTED = 4; // instance didn't finish
64
    const CLI_STATUS_POLLABLE_PROCESSED = 8;
65
66
    /**
67
     * @var integer
68
     */
69
    public $setID = 0;
70
71
    /**
72
     * @var string
73
     */
74
    public $processID = '';
75
76
    /**
77
     * One hour is max stalled time for the CLI
78
     * If the process had the status "start" for 3600 seconds, it will be regarded stalled and a new process is started
79
     *
80
     * @var integer
81
     */
82
    public $max_CLI_exec_time = 3600;
83
84
    /**
85
     * @var array
86
     */
87
    public $duplicateTrack = [];
88
89
    /**
90
     * @var array
91
     */
92
    public $downloadUrls = [];
93
94
    /**
95
     * @var array
96
     */
97
    public $incomingProcInstructions = [];
98
99
    /**
100
     * @var array
101
     */
102
    public $incomingConfigurationSelection = [];
103
104
    /**
105
     * @var bool
106
     */
107
    public $registerQueueEntriesInternallyOnly = false;
108
109
    /**
110
     * @var array
111
     */
112
    public $queueEntries = [];
113
114
    /**
115
     * @var array
116
     */
117
    public $urlList = [];
118
119
    /**
120
     * @var boolean
121
     */
122
    public $debugMode = false;
123
124
    /**
125
     * @var array
126
     */
127
    public $extensionSettings = [];
128
129
    /**
130
     * Mount Point
131
     *
132
     * @var bool|string false when no mount point is set, otherwise the MP parameter string ("-" separated)
133
     */
134
    public $MP = false;
135
136
    /**
137
     * @var string
138
     */
139
    protected $processFilename;
140
141
    /**
142
     * Holds the internal access mode can be 'gui','cli' or 'cli_im'
143
     *
144
     * @var string
145
     */
146
    protected $accessMode;
147
148
    /**
149
     * @var DatabaseConnection
150
     */
151
    private $db;
152
153
    /**
154
     * @var BackendUserAuthentication
155
     */
156
    private $backendUser;
157
158
    /**
159
     * @var integer
160
     */
161
    private $scheduledTime = 0;
162
163
    /**
164
     * @var integer
165
     */
166
    private $reqMinute = 0;
167
168
    /**
169
     * @var bool
170
     */
171
    private $submitCrawlUrls = false;
172
173
    /**
174
     * @var bool
175
     */
176
    private $downloadCrawlUrls = false;
177
178
    /**
179
     * @var QueueRepository
180
     */
181
    protected  $queueRepository;
182
183
    /**
184
     * Method to get the accessMode, which can be gui, cli or cli_im
185
     *
186
     * @return string
187
     */
188 1
    public function getAccessMode()
    {
        // Plain accessor: hands back whatever was stored via setAccessMode().
        $currentAccessMode = $this->accessMode;

        return $currentAccessMode;
    }
192
193
    /**
194
     * @param string $accessMode
195
     */
196 1
    public function setAccessMode($accessMode)
    {
        // The value is stored as given; no validation happens here.
        $this->accessMode = $accessMode;

        return;
    }
200
201
    /**
202
     * Set disabled status to prevent processes from being processed
203
     *
204
     * @param  bool $disabled (optional, defaults to true)
205
     * @return void
206
     */
207 3
    public function setDisabled($disabled = true)
    {
        if (!$disabled) {
            // Enabling again: remove the marker file, but only if it is actually there.
            if (is_file($this->processFilename)) {
                unlink($this->processFilename);
            }

            return;
        }

        // Disabling: write an empty marker file to $this->processFilename.
        GeneralUtility::writeFile($this->processFilename, '');
    }
217
218
    /**
219
     * Get disable status
220
     *
221
     * @return bool true if disabled
222
     */
223 3
    public function getDisabled()
    {
        // Disabled means: the marker file at $this->processFilename exists.
        return is_file($this->processFilename);
    }
231
232
    /**
233
     * @param string $filenameWithPath
234
     *
235
     * @return void
236
     */
237 4
    public function setProcessFilename($filenameWithPath)
    {
        // Overrides the marker file location that the constructor set up by default.
        $this->processFilename = $filenameWithPath;

        return;
    }
241
242
    /**
243
     * @return string
244
     */
245 1
    public function getProcessFilename()
    {
        // Path (including file name) of the marker file used by setDisabled()/getDisabled().
        $markerFile = $this->processFilename;

        return $markerFile;
    }
249
250
    /************************************
251
     *
252
     * Getting URLs based on Page TSconfig
253
     *
254
     ************************************/
255
256 24
    public function __construct()
    {
        // Collaborators: queue repository via the Extbase object manager, DB and backend user from globals.
        $objectManager = GeneralUtility::makeInstance(ObjectManager::class);
        $this->queueRepository = $objectManager->get(QueueRepository::class);

        $this->db = $GLOBALS['TYPO3_DB'];
        $this->backendUser = $GLOBALS['BE_USER'];
        $this->processFilename = PATH_site . 'typo3temp/tx_crawler.proc';

        // The extension configuration is stored serialized. A missing, non-string or
        // non-array entry falls back to an empty settings array instead of raising
        // undefined-index notices or feeding a non-string into unserialize().
        $settings = [];
        if (isset($GLOBALS['TYPO3_CONF_VARS']['EXT']['extConf']['crawler'])
            && is_string($GLOBALS['TYPO3_CONF_VARS']['EXT']['extConf']['crawler'])
        ) {
            $unserializedSettings = unserialize($GLOBALS['TYPO3_CONF_VARS']['EXT']['extConf']['crawler']);
            if (is_array($unserializedSettings)) {
                $settings = $unserializedSettings;
            }
        }

        // read ext_em_conf_template settings and set
        $this->setExtensionSettings($settings);

        // set defaults: a missing or non-positive countInARun becomes 100
        $countInARun = isset($this->extensionSettings['countInARun'])
            ? $this->extensionSettings['countInARun']
            : 0;
        if (MathUtility::convertToPositiveInteger($countInARun) == 0) {
            $this->extensionSettings['countInARun'] = 100;
        }

        // processLimit is clamped to 1..99, with 1 as the default for a missing/empty value
        $processLimit = isset($this->extensionSettings['processLimit'])
            ? $this->extensionSettings['processLimit']
            : 0;
        $this->extensionSettings['processLimit'] = MathUtility::forceIntegerInRange($processLimit, 1, 99, 1);
    }
278
279
    /**
280
     * Sets the extensions settings (unserialized pendant of $TYPO3_CONF_VARS['EXT']['extConf']['crawler']).
281
     *
282
     * @param array $extensionSettings
283
     * @return void
284
     */
285 33
    public function setExtensionSettings(array $extensionSettings)
    {
        // Replaces the complete settings array; nothing is merged with previous values.
        $this->extensionSettings = $extensionSettings;

        return;
    }
289
290
    /**
291
     * Check if the given page should be crawled
292
     *
293
     * @param array $pageRow
294
     * @return false|string false if the page should be crawled (not excluded), otherwise the skip message explaining why it is skipped
295
     */
296 6
    public function checkIfPageShouldBeSkipped(array $pageRow)
    {
        // Each check returns its skip message right away; the checks run in a fixed order
        // and the first one that matches wins.

        // 1) hidden pages, unless the extension is configured to crawl them
        if (!$this->extensionSettings['crawlHiddenPages'] && $pageRow['hidden']) {
            return 'Because page is hidden';
        }

        // 2) doktypes 3 and 4 as well as everything from 199 upwards
        if (GeneralUtility::inList('3,4', $pageRow['doktype']) || $pageRow['doktype'] >= 199) {
            return 'Because doktype is not allowed';
        }

        // 3) doktype lists registered in EXTCONF (key => comma separated doktypes)
        if (is_array($GLOBALS['TYPO3_CONF_VARS']['EXTCONF']['crawler']['excludeDoktype'])) {
            foreach ($GLOBALS['TYPO3_CONF_VARS']['EXTCONF']['crawler']['excludeDoktype'] as $registeredBy => $excludedDoktypes) {
                if (GeneralUtility::inList($excludedDoktypes, $pageRow['doktype'])) {
                    return 'Doktype was excluded by "' . $registeredBy . '"';
                }
            }
        }

        // 4) veto hooks: a hook returns false if the page is fine, anything else vetoes it
        if (is_array($GLOBALS['TYPO3_CONF_VARS']['EXTCONF']['crawler']['pageVeto'])) {
            foreach ($GLOBALS['TYPO3_CONF_VARS']['EXTCONF']['crawler']['pageVeto'] as $hookKey => $hookFunction) {
                $hookParameters = [
                    'pageRow' => $pageRow
                ];
                $veto = GeneralUtility::callUserFunction($hookFunction, $hookParameters, $this);
                if ($veto === false) {
                    continue;
                }

                // The first veto ends the search; a string veto doubles as the skip message.
                return is_string($veto) ? $veto : 'Veto from hook "' . htmlspecialchars($hookKey) . '"';
            }
        }

        // nothing objected: the page should be crawled
        return false;
    }
353
354
    /**
355
     * Wrapper method for getUrlsForPageId()
356
     * It returns an array of configurations and no urls!
357
     *
358
     * @param array $pageRow Page record with at least dok-type and uid columns.
359
     * @param string $skipMessage
360
     * @return array
361
     * @see getUrlsForPageId()
362
     */
363 2
    public function getUrlsForPageRow(array $pageRow, &$skipMessage = '')
    {
        $message = $this->checkIfPageShouldBeSkipped($pageRow);

        if ($message !== false) {
            // Page is excluded: report the reason through the by-reference argument.
            $skipMessage = $message;

            return [];
        }

        // url_scheme 2 forces https. The value may arrive as a numeric string (e.g. from a
        // database row), so it is compared by integer value; a missing key means "do not force".
        $forceSsl = isset($pageRow['url_scheme']) && (int)$pageRow['url_scheme'] === 2;

        $res = $this->getUrlsForPageId($pageRow['uid'], $forceSsl);
        $skipMessage = '';

        return $res;
    }
378
379
    /**
380
     * This method is used to count if there are ANY unprocessed queue entries
381
     * of a given page_id and the configuration which matches a given hash.
382
     * If there are none, we can skip an inner detail check
383
     *
384
     * @param  int $uid
385
     * @param  string $configurationHash
386
     * @return boolean
387
     */
388 3
    protected function noUnprocessedQueueEntriesForPageWithConfigurationHashExist($uid, $configurationHash)
    {
        // WHERE clause: same page, same configuration hash, exec_time still 0.
        $quotedHash = $this->db->fullQuoteStr($configurationHash, 'tx_crawler_queue');
        $whereClause = "page_id=" . intval($uid) . " AND configuration_hash=" . $quotedHash . " AND exec_time=0";

        $result = $this->db->exec_SELECTquery('count(*) as anz', 'tx_crawler_queue', $whereClause);
        $countRow = $this->db->sql_fetch_assoc($result);

        // true when the count is zero, i.e. nothing is pending for this page/configuration
        return $countRow['anz'] == 0;
    }
396
397
    /**
398
     * Creates a list of URLs from input array (and submits them to queue if asked for)
399
     * See Web > Info module script + "indexed_search"'s crawler hook-client using this!
400
     *
401
     * @param    array        Information about URLs from pageRow to crawl.
402
     * @param    array        Page row
403
     * @param    integer        Unix time to schedule indexing to, typically time()
404
     * @param    integer        Number of requests per minute (creates the interleave between requests)
405
     * @param    boolean        If set, submits the URLs to queue
406
     * @param    boolean        If set (and submitcrawlUrls is false) will fill $downloadUrls with entries)
407
     * @param    array        Array which is passed by reference and contains the an id per url to secure we will not crawl duplicates
408
     * @param    array        Array which will be filled with URLS for download if flag is set.
409
     * @param    array        Array of processing instructions
410
     * @return    string        List of URLs (meant for display in backend module)
411
     *
412
     */
413
    public function urlListFromUrlArray(
        array $vv,
        array $pageRow,
        $scheduledTime,
        $reqMinute,
        $submitCrawlUrls,
        $downloadCrawlUrls,
        array &$duplicateTrack,
        array &$downloadUrls,
        array $incomingProcInstructions
    ) {
        $urlList = '';

        // realurl support (thanks to Ingo Renner)
        // $urlObj stays null unless realurl is loaded AND enabled for this configuration, so the
        // loop below tests the object itself instead of relying on a conditionally defined variable.
        $urlObj = null;
        if (ExtensionManagementUtility::isLoaded('realurl') && $vv['subCfg']['realurl']) {

            /** @var tx_realurl $urlObj */
            $urlObj = GeneralUtility::makeInstance('tx_realurl');

            if (!empty($vv['subCfg']['baseUrl'])) {
                $urlParts = parse_url($vv['subCfg']['baseUrl']);
                $host = strtolower($urlParts['host']);
                $urlObj->host = $host;

                // First pass, finding configuration OR pointer string:
                $urlObj->extConf = isset($GLOBALS['TYPO3_CONF_VARS']['EXTCONF']['realurl'][$urlObj->host]) ? $GLOBALS['TYPO3_CONF_VARS']['EXTCONF']['realurl'][$urlObj->host] : $GLOBALS['TYPO3_CONF_VARS']['EXTCONF']['realurl']['_DEFAULT'];

                // If it turned out to be a string pointer, then look up the real config:
                if (is_string($urlObj->extConf)) {
                    $urlObj->extConf = is_array($GLOBALS['TYPO3_CONF_VARS']['EXTCONF']['realurl'][$urlObj->extConf]) ? $GLOBALS['TYPO3_CONF_VARS']['EXTCONF']['realurl'][$urlObj->extConf] : $GLOBALS['TYPO3_CONF_VARS']['EXTCONF']['realurl']['_DEFAULT'];
                }
            }

            // Fill in the pieces of the frontend environment that are still missing.
            if (!$GLOBALS['TSFE']->sys_page) {
                $GLOBALS['TSFE']->sys_page = GeneralUtility::makeInstance('TYPO3\CMS\Frontend\Page\PageRepository');
            }
            if (!$GLOBALS['TSFE']->csConvObj) {
                $GLOBALS['TSFE']->csConvObj = GeneralUtility::makeInstance('TYPO3\CMS\Core\Charset\CharsetConverter');
            }
            if (!$GLOBALS['TSFE']->tmpl->rootLine[0]['uid']) {
                $GLOBALS['TSFE']->tmpl->rootLine[0]['uid'] = $urlObj->extConf['pagePath']['rootpage_id'];
            }
        }

        if (!is_array($vv['URLs'])) {
            return 'ERROR - no URL generated';
        }

        // Seconds between two scheduled requests. A rate of 0 (or an empty value) means
        // "no interleave" rather than a division by zero.
        $requestsPerMinute = (float)$reqMinute;
        $secondsPerRequest = $requestsPerMinute != 0.0 ? 60 / $requestsPerMinute : 0;

        $configurationHash = $this->getConfigurationHash($vv);
        $skipInnerCheck = $this->noUnprocessedQueueEntriesForPageWithConfigurationHashExist($pageRow['uid'], $configurationHash);

        foreach ($vv['URLs'] as $urlQuery) {
            if (!$this->drawURLs_PIfilter($vv['subCfg']['procInstrFilter'], $incomingProcInstructions)) {
                continue;
            }

            // Calculate cHash:
            if ($vv['subCfg']['cHash']) {
                /* @var $cacheHash \TYPO3\CMS\Frontend\Page\CacheHashCalculator */
                $cacheHash = GeneralUtility::makeInstance('TYPO3\CMS\Frontend\Page\CacheHashCalculator');
                $urlQuery .= '&cHash=' . $cacheHash->generateForParameters($urlQuery);
            }

            // Create key by which to determine unique-ness:
            $uKey = $urlQuery . '|' . $vv['subCfg']['userGroups'] . '|' . $vv['subCfg']['baseUrl'] . '|' . $vv['subCfg']['procInstrFilter'];

            // realurl support (thanks to Ingo Renner)
            $urlQuery = 'index.php' . $urlQuery;
            if ($urlObj !== null) {
                $params = [
                    'LD' => [
                        'totalURL' => $urlQuery
                    ],
                    'TCEmainHook' => true
                ];
                $urlObj->encodeSpURL($params);
                $urlQuery = $params['LD']['totalURL'];
            }

            // Scheduled time: offset by the number of URLs tracked so far, rounded down to the full minute.
            $schTime = $scheduledTime + round(count($duplicateTrack) * $secondsPerRequest);
            $schTime = floor($schTime / 60) * 60;

            // NOTE(review): $urlList is overwritten on every iteration, so only the entry of the
            // last URL is returned although the docblock speaks of a list. Kept as-is because
            // callers may rely on it - confirm whether ".=" was intended.
            if (isset($duplicateTrack[$uKey])) {
                // the URL key is already registered: only display it, do not resubmit it
                $urlList = '<em><span class="typo3-dimmed">' . htmlspecialchars($urlQuery) . '</span></em><br/>';
            } else {
                $urlList = '[' . date('d.m.y H:i', $schTime) . '] ' . htmlspecialchars($urlQuery);
                $this->urlList[] = '[' . date('d.m.y H:i', $schTime) . '] ' . $urlQuery;

                $theUrl = ($vv['subCfg']['baseUrl'] ? $vv['subCfg']['baseUrl'] : GeneralUtility::getIndpEnv('TYPO3_SITE_URL')) . $urlQuery;

                // Submit for crawling!
                if ($submitCrawlUrls) {
                    $added = $this->addUrl(
                        $pageRow['uid'],
                        $theUrl,
                        $vv['subCfg'],
                        $scheduledTime,
                        $configurationHash,
                        $skipInnerCheck
                    );
                    if ($added === false) {
                        $urlList .= ' (Url already existed)';
                    }
                } elseif ($downloadCrawlUrls) {
                    $downloadUrls[$theUrl] = $theUrl;
                }

                $urlList .= '<br />';
            }
            $duplicateTrack[$uKey] = true;
        }

        return $urlList;
    }
528
529
    /**
530
     * Returns true if input processing instruction is among registered ones.
531
     *
532
     * @param string $piString PI to test
533
     * @param array $incomingProcInstructions Processing instructions
534
     * @return boolean
535
     */
536 5
    public function drawURLs_PIfilter($piString, array $incomingProcInstructions)
    {
        // No incoming instructions means "no filtering": everything passes.
        if (empty($incomingProcInstructions)) {
            return true;
        }

        // $piString is treated as a comma separated list; one matching incoming instruction is enough.
        foreach ($incomingProcInstructions as $pi) {
            if (GeneralUtility::inList($piString, $pi)) {
                return true;
            }
        }

        // Explicit boolean instead of falling off the end of the method (which yielded null).
        return false;
    }
548
549
    public function getPageTSconfigForId($id)
    {
        // With $this->MP set, the TSconfig is read for the id found in the second
        // "-" separated segment of $this->MP instead of for $id itself.
        if ($this->MP) {
            list(, $mountPointId) = explode('-', $this->MP);
            $pageTSconfig = BackendUtility::getPagesTSconfig($mountPointId);
        } else {
            $pageTSconfig = BackendUtility::getPagesTSconfig($id);
        }

        // Without registered hooks the configuration is returned untouched.
        if (!is_array($GLOBALS['TYPO3_CONF_VARS']['EXTCONF']['crawler']['getPageTSconfigForId'])) {
            return $pageTSconfig;
        }

        // Hooks receive the configuration by reference and may alter it in place.
        $params = [
            'pageId' => $id,
            'pageTSConfig' => &$pageTSconfig
        ];
        foreach ($GLOBALS['TYPO3_CONF_VARS']['EXTCONF']['crawler']['getPageTSconfigForId'] as $userFunc) {
            GeneralUtility::callUserFunction($userFunc, $params, $this);
        }

        return $pageTSconfig;
    }
571
572
    /**
573
     * This method returns an array of configurations.
574
     * And no urls!
575
     *
576
     * @param integer $id Page ID
577
     * @param bool $forceSsl Use https
578
     * @return array
579
     *
580
     * TODO: Should be switched back to protected - TNM 2018-11-16
581
     */
582
    public function getUrlsForPageId($id, $forceSsl = false)
    {
        $res = [];

        /**
         * Part 1: configuration from page TSconfig (tx_crawler.crawlerCfg.paramSets)
         */
        $pageTSconfig = $this->getPageTSconfigForId($id);

        if (is_array($pageTSconfig) && is_array($pageTSconfig['tx_crawler.']['crawlerCfg.'])) {
            $crawlerCfg = $pageTSconfig['tx_crawler.']['crawlerCfg.'];

            if (is_array($crawlerCfg['paramSets.'])) {
                foreach ($crawlerCfg['paramSets.'] as $key => $values) {
                    // only the array entries ("name.") carry a sub configuration
                    if (!is_array($values)) {
                        continue;
                    }

                    $key = str_replace('.', '', $key);
                    // Sub configuration for a single configuration string:
                    $subCfg = (array)$crawlerCfg['paramSets.'][$key . '.'];
                    $subCfg['key'] = $key;

                    // normalise the filter list (trim every entry)
                    if (strcmp($subCfg['procInstrFilter'], '')) {
                        $subCfg['procInstrFilter'] = implode(',', GeneralUtility::trimExplode(',', $subCfg['procInstrFilter']));
                    }
                    $pidOnlyList = implode(',', GeneralUtility::trimExplode(',', $subCfg['pidsOnly'], true));

                    // a page-specific configuration only applies if the current page is listed
                    if (strcmp($subCfg['pidsOnly'], '') && !GeneralUtility::inList($pidOnlyList, $id)) {
                        continue;
                    }

                    // add trailing slash if not present
                    if (!empty($subCfg['baseUrl']) && substr($subCfg['baseUrl'], -1) != '/') {
                        $subCfg['baseUrl'] .= '/';
                    }

                    // recognize MP value
                    $baseQuery = '?id=' . $id;
                    if ($this->MP) {
                        $baseQuery .= '&MP=' . $this->MP;
                    }

                    // Explode, process etc.:
                    $paramParsed = $this->parseParams($crawlerCfg['paramSets.'][$key]);
                    $paramExpanded = $this->expandParameters($paramParsed, $id);
                    $res[$key] = [
                        'subCfg' => $subCfg,
                        'paramParsed' => $paramParsed,
                        'paramExpanded' => $paramExpanded,
                        'origin' => 'pagets',
                        'URLs' => $this->compileUrls($paramExpanded, [$baseQuery]),
                    ];
                }
            }
        }

        /**
         * Part 2: configuration from tx_crawler_configuration records along the rootline
         */
        $rootLine = BackendUtility::BEgetRootLine($id);

        foreach ($rootLine as $page) {
            // NOTE: static analysis reports BackendUtility::getRecordsByField() as deprecated
            // since TYPO3 v8, to be removed in TYPO3 v9.
            $configurationRecordsForCurrentPage = BackendUtility::getRecordsByField(
                'tx_crawler_configuration',
                'pid',
                intval($page['uid']),
                BackendUtility::BEenableFields('tx_crawler_configuration') . BackendUtility::deleteClause('tx_crawler_configuration')
            );

            if (!is_array($configurationRecordsForCurrentPage)) {
                continue;
            }

            foreach ($configurationRecordsForCurrentPage as $configurationRecord) {
                // check access to the configuration record
                $hasAccess = empty($configurationRecord['begroups'])
                    || $GLOBALS['BE_USER']->isAdmin()
                    || $this->hasGroupAccess($GLOBALS['BE_USER']->user['usergroup_cached_list'], $configurationRecord['begroups']);
                if (!$hasAccess) {
                    continue;
                }

                $pidOnlyList = implode(',', GeneralUtility::trimExplode(',', $configurationRecord['pidsonly'], true));

                // a page-specific configuration only applies if the current page is listed
                if (strcmp($configurationRecord['pidsonly'], '') && !GeneralUtility::inList($pidOnlyList, $id)) {
                    continue;
                }

                $key = $configurationRecord['name'];

                // don't overwrite previously defined paramSets
                if (isset($res[$key])) {
                    continue;
                }

                /* @var $TSparserObject \TYPO3\CMS\Core\TypoScript\Parser\TypoScriptParser */
                $TSparserObject = GeneralUtility::makeInstance('TYPO3\CMS\Core\TypoScript\Parser\TypoScriptParser');
                $TSparserObject->parse($configurationRecord['processing_instruction_parameters_ts']);

                $isCrawlingProtocolHttps = $this->isCrawlingProtocolHttps($configurationRecord['force_ssl'], $forceSsl);

                $subCfg = [
                    'procInstrFilter' => $configurationRecord['processing_instruction_filter'],
                    'procInstrParams.' => $TSparserObject->setup,
                    'baseUrl' => $this->getBaseUrlForConfigurationRecord(
                        $configurationRecord['base_url'],
                        $configurationRecord['sys_domain_base_url'],
                        $isCrawlingProtocolHttps
                    ),
                    'realurl' => $configurationRecord['realurl'],
                    'cHash' => $configurationRecord['chash'],
                    'userGroups' => $configurationRecord['fegroups'],
                    'exclude' => $configurationRecord['exclude'],
                    'rootTemplatePid' => (int) $configurationRecord['root_template_pid'],
                    'key' => $key
                ];

                // add trailing slash if not present
                if (!empty($subCfg['baseUrl']) && substr($subCfg['baseUrl'], -1) != '/') {
                    $subCfg['baseUrl'] .= '/';
                }

                // pages named in the record's exclude string get no entry
                if (in_array($id, $this->expandExcludeString($subCfg['exclude']))) {
                    continue;
                }

                $paramParsed = $this->parseParams($configurationRecord['configuration']);
                $paramExpanded = $this->expandParameters($paramParsed, $id);
                $res[$key] = [
                    'subCfg' => $subCfg,
                    'paramParsed' => $paramParsed,
                    'paramExpanded' => $paramExpanded,
                    'URLs' => $this->compileUrls($paramExpanded, ['?id=' . $id]),
                    'origin' => 'tx_crawler_configuration_' . $configurationRecord['uid'],
                ];
            }
        }

        // Part 3: registered hooks get the result by reference and may post-process it
        if (is_array($GLOBALS['TYPO3_CONF_VARS']['EXTCONF']['crawler']['processUrls'])) {
            foreach ($GLOBALS['TYPO3_CONF_VARS']['EXTCONF']['crawler']['processUrls'] as $func) {
                $params = [
                    'res' => &$res,
                ];
                GeneralUtility::callUserFunction($func, $params, $this);
            }
        }

        return $res;
    }
718
719
    /**
     * Resolves the base URL to use for a crawler configuration record.
     *
     * If $sysDomainUid is greater than zero and the sys_domain lookup yields a row with a
     * non-empty "domainName", the URL is built from that domain name and the scheme selected
     * by $ssl. In every other case the given $baseUrl is returned unchanged.
     *
     * @param string $baseUrl Fallback base URL
     * @param integer $sysDomainUid UID of the sys_domain record to look up (values <= 0 skip the lookup)
     * @param bool $ssl Boolean FALSE selects "http", any other value selects "https"
     * @return string
     */
    protected function getBaseUrlForConfigurationRecord($baseUrl, $sysDomainUid, $ssl = false)
    {
        $sysDomainUid = intval($sysDomainUid);
        if ($sysDomainUid <= 0) {
            return $baseUrl;
        }

        $res = $this->db->exec_SELECTquery(
            '*',
            'sys_domain',
            'uid = ' . $sysDomainUid .
            BackendUtility::BEenableFields('sys_domain') .
            BackendUtility::deleteClause('sys_domain')
        );
        $row = $this->db->sql_fetch_assoc($res);
        if ($res) {
            // Release the result set; only a single row is ever needed.
            $this->db->sql_free_result($res);
        }

        // No matching record (fetch did not return a row) or an empty domain name: keep the fallback.
        if (!is_array($row) || !isset($row['domainName']) || (string)$row['domainName'] === '') {
            return $baseUrl;
        }

        $urlScheme = ($ssl === false) ? 'http' : 'https';

        return $urlScheme . '://' . $row['domainName'];
    }
747
748
    /**
     * Collects the names of the crawler configurations that apply to a page branch.
     *
     * Two sources are combined:
     * - the keys of "tx_crawler.crawlerCfg.paramSets." in the page TSconfig of $rootid
     *   (only entries whose value is an array; a trailing dot is removed from the key)
     * - the "name" field of tx_crawler_configuration records stored on any page of the
     *   root line of $rootid or of the page tree below $rootid (down to $depth levels)
     *
     * @param integer $rootid Page ID the branch starts at
     * @param integer $depth Depth handed to PageTreeView::getTree()
     * @return array List of configuration names
     */
    public function getConfigurationsForBranch($rootid, $depth)
    {
        $configurationsForBranch = [];

        $pageTSconfig = $this->getPageTSconfigForId($rootid);
        if (is_array($pageTSconfig)
            && isset($pageTSconfig['tx_crawler.']['crawlerCfg.']['paramSets.'])
            && is_array($pageTSconfig['tx_crawler.']['crawlerCfg.']['paramSets.'])
        ) {
            foreach ($pageTSconfig['tx_crawler.']['crawlerCfg.']['paramSets.'] as $key => $value) {
                if (!is_array($value)) {
                    continue;
                }
                $configurationsForBranch[] = substr($key, -1) == '.' ? substr($key, 0, -1) : $key;
            }
        }

        // Gather the page ids of the root line and of the subtree below $rootid.
        $pids = [];
        $rootLine = BackendUtility::BEgetRootLine($rootid);
        if (is_array($rootLine)) {
            foreach ($rootLine as $node) {
                $pids[] = intval($node['uid']);
            }
        }
        /* @var PageTreeView $tree */
        $tree = GeneralUtility::makeInstance(PageTreeView::class);
        $perms_clause = $GLOBALS['BE_USER']->getPagePermsClause(1);
        $tree->init('AND ' . $perms_clause);
        $tree->getTree($rootid, $depth, '');
        foreach ($tree->tree as $node) {
            $pids[] = intval($node['row']['uid']);
        }
        $pids = array_unique($pids);

        // An empty list would produce "pid IN ()", which is not valid SQL, so skip the lookup then.
        if (empty($pids)) {
            return $configurationsForBranch;
        }

        $res = $this->db->exec_SELECTquery(
            '*',
            'tx_crawler_configuration',
            'pid IN (' . implode(',', $pids) . ') ' .
            BackendUtility::BEenableFields('tx_crawler_configuration') .
            BackendUtility::deleteClause('tx_crawler_configuration') . ' ' .
            BackendUtility::versioningPlaceholderClause('tx_crawler_configuration') . ' '
        );

        while ($row = $this->db->sql_fetch_assoc($res)) {
            $configurationsForBranch[] = $row['name'];
        }
        $this->db->sql_free_result($res);

        return $configurationsForBranch;
    }
793
794
    /**
     * Tells whether a user, identified by its group list, may access an item.
     * (The group list of the current frontend user is e.g. available in $GLOBALS['TSFE']->gr_list.)
     *
     * @see \TYPO3\CMS\Frontend\Page\PageRepository::getMultipleGroupsWhereClause()
     * @param  string $groupList    Comma-separated list of (fe_)group UIDs the user belongs to
     * @param  string $accessList   Comma-separated list of (fe_)group UIDs allowed to access the item
     * @return bool                 TRUE if the access list is empty or contains at least one of the user's group UIDs
     */
    public function hasGroupAccess($groupList, $accessList)
    {
        // An item without access restriction is available to everybody.
        $isRestricted = !empty($accessList);
        if (!$isRestricted) {
            return true;
        }

        $accessGranted = false;
        $userGroupUids = GeneralUtility::intExplode(',', $groupList);
        foreach ($userGroupUids as $userGroupUid) {
            if (GeneralUtility::inList($accessList, $userGroupUid)) {
                // One matching group is sufficient.
                $accessGranted = true;
                break;
            }
        }

        return $accessGranted;
    }
815
816
    /**
     * Parses a query string ("a=1&b=2") into an array of key => value pairs.
     *
     * Pairs are split at "&", key and value at the first "=". Keys and values are
     * raw-URL-decoded. Pairs with an empty key are dropped; a pair without "=" results
     * in an empty string value. If a key occurs more than once, the last value wins.
     *
     * @param string $inputQuery Input query string
     * @return array
     */
    public function parseParams($inputQuery)
    {
        $paramKeyValues = [];

        foreach (explode('&', (string)$inputQuery) as $paramAndValue) {
            // Pad to two elements so that a parameter without "=" (e.g. "&flag") gets an empty
            // value instead of reading an undefined list() offset.
            list($name, $value) = array_pad(explode('=', $paramAndValue, 2), 2, '');
            if (strlen($name)) {
                $paramKeyValues[rawurldecode($name)] = rawurldecode($value);
            }
        }

        return $paramKeyValues;
    }
837
838
    /**
     * Will expand the parameters configuration to individual values. This follows a certain syntax of the value of each parameter.
     * Syntax of values:
     * - Basically: If the value is wrapped in [...] it will be expanded according to the following syntax, otherwise the value is taken literally
     * - Configuration is split by "|" and the parts are processed individually and finally added together
     * - For each configuration part:
     *         - "[int]-[int]" = Integer range, will be expanded to all values in between, values included, starting from low to high (max. 1000). Example "1-34" or "-40--30"
     *         - "_TABLE:[TCA table name];[_PID:[optional page id, default is current page]];[_ENABLELANG:1]" = Look up of table records from PID, filtering out deleted records. Example "_TABLE:tt_content; _PID:123"
     *        _ENABLELANG:1 picks only original records without their language overlays
     *         - Default: Literal value
     *
     * After each part the hooks registered in
     * $GLOBALS['TYPO3_CONF_VARS']['SC_OPTIONS']['crawler/class.tx_crawler_lib.php']['expandParameters']
     * are called; they receive the parameter array by reference.
     *
     * @param array $paramArray Array with key (GET var name) and values (value of GET var which is configuration for expansion)
     * @param integer $pid Current page ID
     * @return array Array with key (GET var name) and, for each key, an array of values
     */
    public function expandParameters($paramArray, $pid)
    {
        $hookConfiguration = null;
        if (isset($GLOBALS['TYPO3_CONF_VARS']['SC_OPTIONS']['crawler/class.tx_crawler_lib.php']['expandParameters'])) {
            $hookConfiguration = $GLOBALS['TYPO3_CONF_VARS']['SC_OPTIONS']['crawler/class.tx_crawler_lib.php']['expandParameters'];
        }

        // Traverse parameter names:
        foreach ($paramArray as $p => $v) {
            $v = trim($v);

            // Only values encapsulated in square brackets are expanded, everything else is literal:
            if (substr($v, 0, 1) !== '[' || substr($v, -1) !== ']') {
                $paramArray[$p] = [$v];
                continue;
            }

            // Reset the paramArray value as an array and fill it part by part:
            $paramArray[$p] = [];
            foreach (explode('|', substr($v, 1, -1)) as $pV) {
                $trimmedPart = trim($pV);

                if (preg_match('/^(-?[0-9]+)\s*-\s*(-?[0-9]+)$/', $trimmedPart, $reg)) {
                    // Integer range (fx. 1-34 or -40--30 // reads minus 40 to minus 30)
                    foreach ($this->expandIntegerRangePart($reg[1], $reg[2]) as $rangeValue) {
                        $paramArray[$p][] = $rangeValue;
                    }
                } elseif (substr($trimmedPart, 0, 7) == '_TABLE:') {
                    // Record lookup
                    $lookedUpValues = $this->expandTableLookupPart($pV, $pid);
                    if (is_array($lookedUpValues)) {
                        $paramArray[$p] = array_merge($paramArray[$p], $lookedUpValues);
                    }
                } else {
                    // Just add value:
                    $paramArray[$p][] = $pV;
                }

                // Hook for processing own expandParameters place holder
                if (is_array($hookConfiguration)) {
                    // Objects are passed by handle, so no reference to $this is needed here.
                    $_params = [
                        'pObj' => $this,
                        'paramArray' => &$paramArray,
                        'currentKey' => $p,
                        'currentValue' => $pV,
                        'pid' => $pid
                    ];
                    foreach ($hookConfiguration as $_funcRef) {
                        GeneralUtility::callUserFunction($_funcRef, $_params, $this);
                    }
                    // Drop the reference so that the next $_params array does not alias this one.
                    unset($_params);
                }
            }

            // Make unique set of values and sort the parameter array by key (GET var name):
            $paramArray[$p] = array_unique($paramArray[$p]);
            ksort($paramArray);
        }

        return $paramArray;
    }

    /**
     * Expands an integer range to the list of all integers it contains, in ascending order.
     *
     * @param string|integer $first One boundary of the range
     * @param string|integer $last Other boundary of the range (may be smaller than $first)
     * @param integer $limit Maximum number of values to return (run-away brake)
     * @return array List of integers
     */
    protected function expandIntegerRangePart($first, $last, $limit = 1000)
    {
        $low = min((int)$first, (int)$last);
        $high = max((int)$first, (int)$last);

        $values = [];
        for ($value = $low; $value <= $high && count($values) < $limit; $value++) {
            $values[] = $value;
        }

        return $values;
    }

    /**
     * Resolves a "_TABLE:..." configuration part to the values found in the database.
     *
     * Supported sub parameters (separated by ";", each written as "KEY:value"):
     * _TABLE, _PID, _PIDFIELD, _WHERE, _ADDTABLE, _FIELD and _ENABLELANG.
     *
     * NOTE: _WHERE and _ADDTABLE are concatenated into the query as given, so the
     * configuration they come from has to be trusted.
     *
     * @param string $part The configuration part, starting with "_TABLE:"
     * @param integer $pid Page ID used if no _PID sub parameter is given
     * @return array|null List of values, or NULL if no lookup result is available
     *                    (table not in TCA, field not in TCA, or the query returned no array)
     */
    protected function expandTableLookupPart($part, $pid)
    {
        // Parse parameters:
        $subpartParams = [];
        foreach (GeneralUtility::trimExplode(';', $part) as $spV) {
            // A sub part without ":" gets NULL as value instead of reading an undefined offset.
            list($pKey, $pVal) = array_pad(GeneralUtility::trimExplode(':', $spV), 2, null);
            $subpartParams[$pKey] = $pVal;
        }

        $table = isset($subpartParams['_TABLE']) ? (string)$subpartParams['_TABLE'] : '';

        // Table exists:
        if (!isset($GLOBALS['TCA'][$table])) {
            return null;
        }
        $tableTca = $GLOBALS['TCA'][$table];

        $lookUpPid = isset($subpartParams['_PID']) ? intval($subpartParams['_PID']) : $pid;
        $pidField = isset($subpartParams['_PIDFIELD']) ? trim($subpartParams['_PIDFIELD']) : 'pid';
        $where = isset($subpartParams['_WHERE']) ? $subpartParams['_WHERE'] : '';
        $addTable = isset($subpartParams['_ADDTABLE']) ? $subpartParams['_ADDTABLE'] : '';

        // Only "uid" or a field configured in TCA may be selected:
        $fieldName = !empty($subpartParams['_FIELD']) ? $subpartParams['_FIELD'] : 'uid';
        if ($fieldName !== 'uid' && empty($tableTca['columns'][$fieldName])) {
            return null;
        }

        $andWhereLanguage = '';
        $transOrigPointerField = isset($tableTca['ctrl']['transOrigPointerField'])
            ? $tableTca['ctrl']['transOrigPointerField']
            : '';
        if (!empty($subpartParams['_ENABLELANG']) && $transOrigPointerField) {
            $andWhereLanguage = ' AND ' . $this->db->quoteStr($transOrigPointerField, $table) . ' <= 0 ';
        }

        $where = $this->db->quoteStr($pidField, $table) . '=' . intval($lookUpPid) . ' ' .
            $andWhereLanguage . $where;

        // The last argument indexes the result by $fieldName, so the array keys are the wanted values.
        $rows = $this->db->exec_SELECTgetRows(
            $fieldName,
            $table . $addTable,
            $where . BackendUtility::deleteClause($table),
            '',
            '',
            '',
            $fieldName
        );

        return is_array($rows) ? array_keys($rows) : null;
    }
963
964
    /**
     * Compiling URLs from parameter array (output of expandParameters())
     * The number of URLs will be the multiplication of the number of parameter values for each key,
     * capped on every recursion level at the "maxCompileUrls" extension setting
     * (forced into the range 1..1000000000, default 10000).
     *
     * A value that equals the empty string adds nothing to the URL.
     *
     * @param array $paramArray Output of expandParameters(): Array with keys (GET var names) and for each an array of values
     * @param array $urls URLs accumulated in this array (for recursion)
     * @return array
     */
    public function compileUrls($paramArray, $urls = [])
    {
        if (!count($paramArray) || !is_array($urls)) {
            return $urls;
        }

        // The limit does not change while compiling, so determine it once.
        $configuredLimit = isset($this->extensionSettings['maxCompileUrls'])
            ? $this->extensionSettings['maxCompileUrls']
            : 0;
        $maxUrls = MathUtility::forceIntegerInRange($configuredLimit, 1, 1000000000, 10000);

        // Shift first parameter off the stack:
        reset($paramArray);
        $varName = key($paramArray);
        $valueSet = array_shift($paramArray);

        // Combine every URL collected so far with every value of the parameter:
        $newUrls = [];
        foreach ($urls as $url) {
            foreach ($valueSet as $val) {
                $val = (string)$val;
                $newUrls[] = $url . (strcmp($val, '') ? '&' . rawurlencode($varName) . '=' . rawurlencode($val) : '');

                // Leave BOTH loops once the limit is reached; leaving only the inner loop
                // would keep adding one more URL for each remaining base URL.
                if (count($newUrls) >= $maxUrls) {
                    break 2;
                }
            }
        }

        return $this->compileUrls($paramArray, $newUrls);
    }
997
998
    /************************************
999
     *
1000
     * Crawler log
1001
     *
1002
     ************************************/
1003
1004
    /**
     * Returns the crawler queue records of a page, or flushes queue records instead.
     *
     * @param integer $id Page ID for which to look up log entries.
     * @param string $filter Filter: "pending" => entries with exec_time=0, "finished" => entries with exec_time>0, anything else => all entries
     * @param boolean $doFlush If TRUE, the matching entries are DELETED(!) instead of selected and an empty array is returned
     * @param boolean $doFullFlush Only used together with $doFlush: if TRUE, entries of all pages are flushed (the filter still applies)
     * @param integer $itemsPerPage Maximum number of entries to select, default is 10; values <= 0 disable the limit
     * @return array
     */
    public function getLogEntriesForPageId($id, $filter = '', $doFlush = false, $doFullFlush = false, $itemsPerPage = 10)
    {
        // Translate the filter keyword into an additional WHERE condition.
        if ($filter == 'pending') {
            $addWhere = ' AND exec_time=0';
        } elseif ($filter == 'finished') {
            $addWhere = ' AND exec_time>0';
        } else {
            $addWhere = '';
        }

        $pageCondition = 'page_id=' . intval($id);

        // FIXME: Write unit test that ensures that the right records are deleted.
        if ($doFlush) {
            $flushScope = $doFullFlush ? '1=1' : $pageCondition;
            $this->flushQueue($flushScope . $addWhere);

            return [];
        }

        $limit = intval($itemsPerPage);

        return $this->db->exec_SELECTgetRows(
            '*',
            'tx_crawler_queue',
            $pageCondition . $addWhere,
            '',
            'scheduled DESC',
            ($limit > 0 ? $limit : '')
        );
    }
1043
1044
    /**
     * Returns the crawler queue records of a set, or flushes queue records instead.
     *
     * @param integer $set_id Set ID for which to look up log entries.
     * @param string $filter Filter: "pending" => entries with exec_time=0, "finished" => entries with exec_time>0, anything else => all entries
     * @param boolean $doFlush If TRUE, the matching entries are DELETED(!) instead of selected and an empty array is returned
     * @param boolean $doFullFlush Only used together with $doFlush: if TRUE, the flush is done without any condition (the filter is ignored, too)
     * @param integer $itemsPerPage Maximum number of entries to select, default is 10; values <= 0 disable the limit
     * @return array
     */
    public function getLogEntriesForSetId($set_id, $filter = '', $doFlush = false, $doFullFlush = false, $itemsPerPage = 10)
    {
        // FIXME: Write Unit tests for Filters
        if ($filter == 'pending') {
            $addWhere = ' AND exec_time=0';
        } elseif ($filter == 'finished') {
            $addWhere = ' AND exec_time>0';
        } else {
            $addWhere = '';
        }

        $setCondition = 'set_id=' . intval($set_id) . $addWhere;

        // FIXME: Write unit test that ensures that the right records are deleted.
        if ($doFlush) {
            $flushCondition = $doFullFlush ? '' : $setCondition;
            $this->flushQueue($flushCondition);

            return [];
        }

        $limit = intval($itemsPerPage);

        return $this->db->exec_SELECTgetRows(
            '*',
            'tx_crawler_queue',
            $setCondition,
            '',
            'scheduled DESC',
            ($limit > 0 ? $limit : '')
        );
    }
1082
1083
    /**
     * Removes queue entries
     *
     * Before the entries are deleted, the "queueEntryFlush" event and the
     * SIGNAL_QUEUE_ENTRY_FLUSH signal are sent once per affected set_id (only if an
     * observer respectively a signal slot is registered), each with the list of the
     * entries of that set which are about to be removed.
     *
     * @param string $where SQL related filter for the entries which should be removed; an empty string removes all entries
     * @return void
     */
    protected function flushQueue($where = '')
    {
        $realWhere = strlen($where) > 0 ? $where : '1=1';

        // The event dispatcher is deprecated since crawler v6.3.0, will be removed in crawler v7.0.0.
        // Please use the Signal instead.
        $hasObserver = EventDispatcher::getInstance()->hasObserver('queueEntryFlush');
        $hasSignal = SignalSlotUtility::hasSignal(__CLASS__, SignalSlotUtility::SIGNAL_QUEUE_ENTRY_FLUSH);

        if ($hasObserver || $hasSignal) {
            $groups = $this->db->exec_SELECTgetRows('DISTINCT set_id', 'tx_crawler_queue', $realWhere);
            if (is_array($groups)) {
                foreach ($groups as $group) {
                    // Fetch the affected entries once and share them between event and signal.
                    // The filter is wrapped in parentheses so that an OR inside it cannot escape the set_id condition.
                    // NOTE(review): other queries in this class address queue rows by "qid"; confirm that
                    // tx_crawler_queue really has a "uid" column, otherwise this select cannot return rows.
                    $flushedEntries = $this->db->exec_SELECTgetRows(
                        'uid, set_id',
                        'tx_crawler_queue',
                        '(' . $realWhere . ') AND set_id=' . intval($group['set_id'])
                    );
                    // The query result may be NULL, the receivers expect an array.
                    if (!is_array($flushedEntries)) {
                        $flushedEntries = [];
                    }

                    if ($hasObserver) {
                        EventDispatcher::getInstance()->post(
                            'queueEntryFlush',
                            $group['set_id'],
                            $flushedEntries
                        );
                    }

                    if ($hasSignal) {
                        SignalSlotUtility::emitSignal(
                            __CLASS__,
                            SignalSlotUtility::SIGNAL_QUEUE_ENTRY_FLUSH,
                            $flushedEntries
                        );
                    }
                }
            }
        }

        // NOTE(review): the rest of the class talks to the database through $this->db;
        // presumably both refer to the same connection - confirm before unifying.
        $GLOBALS['TYPO3_DB']->exec_DELETEquery('tx_crawler_queue', $realWhere);
    }
1122
1123
    /**
     * Adds a call back entry to the crawler queue (typically called from hooks, see indexed search class "class.crawler.php")
     *
     * @param integer $setId Set ID
     * @param array $params Parameters to pass to call back function; anything that is not an array is replaced by an empty array
     * @param string $callBack Call back object reference, eg. 'EXT:indexed_search/class.crawler.php:&tx_indexedsearch_crawler'
     * @param integer $page_id Page ID to attach it to
     * @param integer $schedule Time at which to activate; 0 means the current time
     * @return void
     */
    public function addQueueEntry_callBack($setId, $params, $callBack, $page_id = 0, $schedule = 0)
    {
        $callBackParameters = is_array($params) ? $params : [];
        $callBackParameters['_CALLBACKOBJ'] = $callBack;

        // Fall back to "now" if no schedule time was given.
        $scheduledAt = intval($schedule);
        if (!$scheduledAt) {
            $scheduledAt = $this->getCurrentTime();
        }

        $this->db->exec_INSERTquery(
            'tx_crawler_queue',
            [
                'page_id' => intval($page_id),
                'parameters' => serialize($callBackParameters),
                'scheduled' => $scheduledAt,
                'exec_time' => 0,
                'set_id' => intval($setId),
                'result_data' => '',
            ]
        );
    }
1152
1153
    /************************************
1154
     *
1155
     * URL setting
1156
     *
1157
     ************************************/
1158
1159
    /**
     * Setting a URL for crawling:
     *
     * Builds the queue entry for the URL and either registers it internally only
     * (if $this->registerQueueEntriesInternallyOnly is set) or inserts it into
     * tx_crawler_queue, unless an equal entry is found by the duplication check.
     * Afterwards the "urlAddedToQueue" or "duplicateUrlInQueue" event and signal are sent.
     *
     * @param integer $id Page ID
     * @param string $url Complete URL
     * @param array $subCfg Sub configuration array (from TS config)
     * @param integer $tstamp Scheduled-time
     * @param string $configurationHash (optional) configuration hash
     * @param bool $skipInnerDuplicationCheck (optional) skip inner duplication check
     * @return bool TRUE if a new entry was inserted into the queue table
     */
    public function addUrl(
        $id,
        $url,
        array $subCfg,
        $tstamp,
        $configurationHash = '',
        $skipInnerDuplicationCheck = false
    ) {
        $urlAdded = false;
        $rows = [];

        // Creating parameters:
        $parameters = [
            'url' => $url
        ];

        // fe user group simulation:
        $uGs = implode(',', array_unique(GeneralUtility::intExplode(',', $subCfg['userGroups'], true)));
        if ($uGs) {
            $parameters['feUserGroupList'] = $uGs;
        }

        // Setting processing instructions
        $parameters['procInstructions'] = GeneralUtility::trimExplode(',', $subCfg['procInstrFilter']);
        if (is_array($subCfg['procInstrParams.'])) {
            $parameters['procInstrParams'] = $subCfg['procInstrParams.'];
        }

        // Possible TypoScript Template Parents
        $parameters['rootTemplatePid'] = $subCfg['rootTemplatePid'];

        // Compile value array:
        $parameters_serialized = serialize($parameters);
        $fieldArray = [
            'page_id' => intval($id),
            'parameters' => $parameters_serialized,
            'parameters_hash' => GeneralUtility::shortMD5($parameters_serialized),
            'configuration_hash' => $configurationHash,
            'scheduled' => $tstamp,
            'exec_time' => 0,
            'set_id' => intval($this->setID),
            'result_data' => '',
            'configuration' => $subCfg['key'],
        ];

        if ($this->registerQueueEntriesInternallyOnly) {
            //the entries will only be registered and not stored to the database
            $this->queueEntries[] = $fieldArray;

            return $urlAdded;
        }

        if (!$skipInnerDuplicationCheck) {
            // check if there is already an equal entry
            $rows = $this->getDuplicateRowsIfExist($tstamp, $fieldArray);
        }

        if (count($rows) == 0) {
            $this->db->exec_INSERTquery('tx_crawler_queue', $fieldArray);
            $uid = $this->db->sql_insert_id();
            $rows[] = $uid;
            $urlAdded = true;

            // The signal payload is handed over as a variable, because the payload parameter of
            // emitSignal() is reported to be a reference and a literal cannot be passed there.
            $payload = ['uid' => $uid, 'fieldArray' => $fieldArray];

            // The event dispatcher is deprecated since crawler v6.3.0, will be removed in crawler v7.0.0.
            // Please use the Signal instead.
            EventDispatcher::getInstance()->post('urlAddedToQueue', $this->setID, $payload);

            SignalSlotUtility::emitSignal(
                __CLASS__,
                SignalSlotUtility::SIGNAL_URL_ADDED_TO_QUEUE,
                $payload
            );
        } else {
            // See the comment in the branch above for why a variable is used.
            $payload = ['rows' => $rows, 'fieldArray' => $fieldArray];

            // The event dispatcher is deprecated since crawler v6.3.0, will be removed in crawler v7.0.0.
            // Please use the Signal instead.
            EventDispatcher::getInstance()->post('duplicateUrlInQueue', $this->setID, $payload);

            SignalSlotUtility::emitSignal(
                __CLASS__,
                SignalSlotUtility::SIGNAL_DUPLICATE_URL_IN_QUEUE,
                $payload
            );
        }

        return $urlAdded;
    }
1255
1256
    /**
     * This method determines duplicates for a queue entry with the same parameters and this timestamp.
     * If the timestamp is now or in the past, it will check if there is any unprocessed queue entry
     * scheduled up to now (with the "enableTimeslot" extension setting, entries scheduled within
     * 100 seconds around the current time are matched as well).
     * If the timestamp is in the future it will check, if the queued entry has exactly the same timestamp
     *
     * Only entries without exec_time and without process_id are considered.
     *
     * @param int $tstamp
     * @param array $fieldArray Queue field array; "page_id" and "parameters_hash" are used
     *
     * @return array List of the "qid" values of the duplicates found
     */
    protected function getDuplicateRowsIfExist($tstamp, $fieldArray)
    {
        // The timestamp becomes part of the SQL statement, so make sure it is an integer.
        $tstamp = (int)$tstamp;
        $currentTime = $this->getCurrentTime();

        if ($tstamp > $currentTime) {
            //entry with a timestamp in the future need to have the same schedule time
            $where = 'scheduled = ' . $tstamp;
        } elseif (!empty($this->extensionSettings['enableTimeslot'])) {
            //entry is scheduled with "now" and time slots are enabled
            $timeBegin = $currentTime - 100;
            $timeEnd = $currentTime + 100;
            $where = ' ((scheduled BETWEEN ' . $timeBegin . ' AND ' . $timeEnd . ' ) OR scheduled <= ' . $currentTime . ') ';
        } else {
            //entry is scheduled with "now"
            $where = 'scheduled <= ' . $currentTime;
        }

        $result = $this->db->exec_SELECTgetRows(
            'qid',
            'tx_crawler_queue',
            $where .
            ' AND NOT exec_time' .
            ' AND NOT process_id ' .
            ' AND page_id=' . intval($fieldArray['page_id']) .
            ' AND parameters_hash = ' . $this->db->fullQuoteStr($fieldArray['parameters_hash'], 'tx_crawler_queue')
        );

        $rows = [];
        if (is_array($result)) {
            foreach ($result as $value) {
                $rows[] = $value['qid'];
            }
        }

        return $rows;
    }
1306
1307
    /**
     * Provides the current system time as Unix timestamp
     *
     * @return int
     */
    public function getCurrentTime()
    {
        $now = time();

        return $now;
    }
1316
1317
    /************************************
1318
     *
1319
     * URL reading
1320
     *
1321
     ************************************/
1322
1323
    /**
     * Read URL for single queue entry
     *
     * Loads the queue record, sets exec_time on it (which locks the record), executes it
     * via readUrl_exec() and stores the serialized result in "result_data". The
     * SIGNAL_QUEUEITEM_PREPROCESS and SIGNAL_QUEUEITEM_POSTPROCESS signals are emitted
     * before respectively after the execution.
     *
     * @param integer $queueId
     * @param boolean $force If set, will process even if exec_time has been set!
     * @return integer|null Bit mask: CLI_STATUS_POLLABLE_PROCESSED is set if a "pollSuccess" instruction reported success;
     *                      NULL if no matching queue record was found
     */
    public function readUrl($queueId, $force = false)
    {
        $ret = 0;
        if ($this->debugMode) {
            GeneralUtility::devlog('crawler-readurl start ' . microtime(true), __FUNCTION__);
        }

        // Get entry:
        $queueRows = $this->db->exec_SELECTgetRows(
            '*',
            'tx_crawler_queue',
            'qid=' . intval($queueId) . ($force ? '' : ' AND exec_time=0 AND process_scheduled > 0')
        );
        // The query may yield NULL or an empty list; reset() returns FALSE for an empty list.
        $queueRec = is_array($queueRows) ? reset($queueRows) : null;

        if (!is_array($queueRec)) {
            return;
        }

        // NOTE(review): unserialize() should only ever see trusted data - confirm that the
        // "parameters" column cannot be written with foreign content.
        $parameters = unserialize($queueRec['parameters']);
        if (is_array($parameters) && !empty($parameters['rootTemplatePid'])) {
            $this->initTSFE((int)$parameters['rootTemplatePid']);
        } else {
            GeneralUtility::sysLog(
                'Page with (' . $queueRec['page_id'] . ') could not be crawled, please check your crawler configuration. Perhaps no Root Template Pid is set',
                'crawler',
                GeneralUtility::SYSLOG_SEVERITY_WARNING
            );
        }

        // The signal payload is handed over as a variable, because the payload parameter of
        // emitSignal() is reported to be a reference and a literal cannot be passed there.
        $preProcessPayload = [$queueId, $queueRec];
        SignalSlotUtility::emitSignal(
            __CLASS__,
            SignalSlotUtility::SIGNAL_QUEUEITEM_PREPROCESS,
            $preProcessPayload
        );

        // Set exec_time to lock record:
        $field_array = ['exec_time' => $this->getCurrentTime()];

        if (isset($this->processID)) {
            //if multiprocessing is used we need to store the id of the process which has handled this entry
            $field_array['process_id_completed'] = $this->processID;
        }
        $this->db->exec_UPDATEquery('tx_crawler_queue', 'qid=' . intval($queueId), $field_array);

        $result = $this->readUrl_exec($queueRec);
        // NOTE(review): the same applies to the content delivered by readUrl_exec() - confirm it is trusted.
        $resultData = unserialize($result['content']);

        //atm there's no need to point to specific pollable extensions
        if (isset($GLOBALS['TYPO3_CONF_VARS']['EXTCONF']['crawler']['pollSuccess'])
            && is_array($GLOBALS['TYPO3_CONF_VARS']['EXTCONF']['crawler']['pollSuccess'])
            && is_array($resultData)
            && isset($resultData['parameters']['procInstructions'])
            && is_array($resultData['parameters']['procInstructions'])
        ) {
            foreach ($GLOBALS['TYPO3_CONF_VARS']['EXTCONF']['crawler']['pollSuccess'] as $pollable) {
                // only check the success value if the instruction is running
                // it is important to name the pollSuccess key same as the procInstructions key
                if (in_array($pollable, $resultData['parameters']['procInstructions'])
                    && !empty($resultData['success'][$pollable])
                ) {
                    $ret |= self::CLI_STATUS_POLLABLE_PROCESSED;
                }
            }
        }

        // Set result in log which also denotes the end of the processing of this entry.
        $field_array = ['result_data' => serialize($result)];

        $postProcessPayload = [$queueId, $field_array];
        SignalSlotUtility::emitSignal(
            __CLASS__,
            SignalSlotUtility::SIGNAL_QUEUEITEM_POSTPROCESS,
            $postProcessPayload
        );

        $this->db->exec_UPDATEquery('tx_crawler_queue', 'qid=' . intval($queueId), $field_array);

        if ($this->debugMode) {
            GeneralUtility::devlog('crawler-readurl stop ' . microtime(true), __FUNCTION__);
        }

        return $ret;
    }
1410
1411
    /**
     * Inserts a new queue entry from the given field array, processes it
     * immediately and stores the serialized result on the new record.
     *
     * @param array $field_array Queue field array (row data for tx_crawler_queue)
     *
     * @return array|string|bool Whatever readUrl_exec() returned for the new entry
     */
    public function readUrlFromArray($field_array)
    {
        // Setting exec_time marks the record as locked/being processed.
        $field_array['exec_time'] = $this->getCurrentTime();
        $this->db->exec_INSERTquery('tx_crawler_queue', $field_array);

        $queueId = $this->db->sql_insert_id();
        $field_array['qid'] = $queueId;

        $result = $this->readUrl_exec($field_array);

        // Writing result_data denotes the end of the processing of this entry.
        $field_array = ['result_data' => serialize($result)];

        $signalPayload = [$queueId, $field_array];
        SignalSlotUtility::emitSignal(
            __CLASS__,
            SignalSlotUtility::SIGNAL_QUEUEITEM_POSTPROCESS,
            $signalPayload
        );

        $this->db->exec_UPDATEquery('tx_crawler_queue', 'qid=' . (int)$queueId, $field_array);

        return $result;
    }
1441
1442
    /**
     * Executes a single queue record.
     *
     * If the unserialized parameters contain a "_CALLBACKOBJ" entry, that
     * object is instantiated and its crawler_execute() method is called.
     * Otherwise the record is treated as a regular frontend request and the
     * URL in the parameters is fetched via requestUrl().
     *
     * @param array $queueRec Queue record (needs at least "parameters", "qid" and "set_id")
     * @return array|string|bool Result array with a "content" key for callbacks,
     *                           the return value of requestUrl() for FE requests,
     *                           or the string "ERROR" if the parameters could not be decoded
     */
    public function readUrl_exec($queueRec)
    {
        // Decode parameters:
        $parameters = unserialize($queueRec['parameters']);
        $result = 'ERROR';

        if (!is_array($parameters)) {
            return $result;
        }

        // Calling object:
        if (!empty($parameters['_CALLBACKOBJ'])) {
            $objRef = $parameters['_CALLBACKOBJ'];
            // Objects are handles; assigning a function result by reference
            // ("=&") only triggers a notice and has no benefit.
            $callBackObj = GeneralUtility::getUserObj($objRef);
            if (is_object($callBackObj)) {
                unset($parameters['_CALLBACKOBJ']);
                $result = ['content' => serialize($callBackObj->crawler_execute($parameters, $this))];
            } else {
                $result = ['content' => 'No object: ' . $objRef];
            }

            return $result;
        }

        // Regular FE request:
        // The crawler id is "<qid>:<hash>", the hash lets the frontend verify the request origin.
        $crawlerId = $queueRec['qid'] . ':' . md5($queueRec['qid'] . '|' . $queueRec['set_id'] . '|' . $GLOBALS['TYPO3_CONF_VARS']['SYS']['encryptionKey']);

        // Get result:
        $result = $this->requestUrl($parameters['url'], $crawlerId);

        // The event dispatcher is deprecated since crawler v6.3.0, will be removed in crawler v7.0.0.
        // Please use the Signal instead.
        EventDispatcher::getInstance()->post('urlCrawled', $queueRec['set_id'], ['url' => $parameters['url'], 'result' => $result]);

        // Passed through a variable so the call stays valid if emitSignal()
        // declares its payload parameter by reference.
        $signalPayload = ['url' => $parameters['url'], 'result' => $result];
        SignalSlotUtility::emitSignal(
            __CLASS__,
            SignalSlotUtility::SIGNAL_URL_CRAWLED,
            $signalPayload
        );

        return $result;
    }
1485
1486
    /**
     * Gets the content of a URL.
     *
     * Only scheme-less, http and https URLs are accepted. Depending on the
     * extension settings the request is either delegated to
     * sendDirectRequest() or sent over a socket opened with fsockopen()
     * (optionally through the configured curl proxy server). When "follow30x"
     * is enabled, redirects are followed until $recursion is used up.
     *
     * @param string $originalUrl URL to read
     * @param string $crawlerId Crawler ID string (qid + hash to verify)
     * @param integer $timeout Connection timeout in seconds
     * @param integer $recursion Recursion limiter for 30x redirects
     * @return array|bool Array with "request", "headers" and "content" (plus
     *                    "parentRequest" when a redirect was followed), or
     *                    FALSE on failure
     */
    public function requestUrl($originalUrl, $crawlerId, $timeout = 2, $recursion = 10)
    {
        if (!$recursion) {
            return false;
        }

        // Parse URL, checking for scheme:
        $url = parse_url($originalUrl);

        if ($url === false) {
            if (TYPO3_DLOG) {
                // Log the original string; $url is FALSE here.
                GeneralUtility::devLog(sprintf('Could not parse_url() for string "%s"', $originalUrl), 'crawler', 4, ['crawlerId' => $crawlerId]);
            }
            return false;
        }

        // parse_url() omits keys that are not present in the URL.
        $scheme = isset($url['scheme']) ? $url['scheme'] : '';

        if (!in_array($scheme, ['', 'http', 'https'], true)) {
            if (TYPO3_DLOG) {
                GeneralUtility::devLog(sprintf('Scheme does not match for url "%s"', $originalUrl), 'crawler', 4, ['crawlerId' => $crawlerId]);
            }
            return false;
        }

        // direct request
        if (!empty($this->extensionSettings['makeDirectRequests'])) {
            return $this->sendDirectRequest($originalUrl, $crawlerId);
        }

        $reqHeaders = $this->buildRequestHeaderArray($url, $crawlerId);

        // thanks to Pierrick Caillon for adding proxy support
        $rurl = $url;

        if (!empty($GLOBALS['TYPO3_CONF_VARS']['SYS']['curlUse']) && !empty($GLOBALS['TYPO3_CONF_VARS']['SYS']['curlProxyServer'])) {
            $rurl = parse_url($GLOBALS['TYPO3_CONF_VARS']['SYS']['curlProxyServer']) ?: [];
            // When talking to a proxy the request line carries the absolute URL.
            $url['path'] = $scheme . '://'
                . (isset($url['host']) ? $url['host'] : '')
                . (isset($url['port']) && $url['port'] > 0 ? ':' . $url['port'] : '')
                . (isset($url['path']) ? $url['path'] : '');
            $reqHeaders = $this->buildRequestHeaderArray($url, $crawlerId);
        }

        $host = isset($rurl['host']) ? $rurl['host'] : '';
        $explicitPort = isset($rurl['port']) ? (int)$rurl['port'] : 0;

        if ($scheme === 'https') {
            $host = 'ssl://' . $host;
            $port = $explicitPort > 0 ? $explicitPort : 443;
        } else {
            $port = $explicitPort > 0 ? $explicitPort : 80;
        }

        $startTime = microtime(true);
        $fp = fsockopen($host, $port, $errno, $errstr, $timeout);

        if (!$fp) {
            if (TYPO3_DLOG) {
                GeneralUtility::devLog(sprintf('Error while opening "%s"', $originalUrl), 'crawler', 4, ['crawlerId' => $crawlerId]);
            }
            return false;
        }

        // Request message:
        $msg = implode("\r\n", $reqHeaders) . "\r\n\r\n";
        fputs($fp, $msg);

        // Read response:
        $d = $this->getHttpResponseFromStream($fp);
        fclose($fp);

        $time = microtime(true) - $startTime;
        $this->log($originalUrl . ' ' . $time);

        // Implode content and headers:
        $result = [
            'request' => $msg,
            'headers' => implode('', $d['headers']),
            'content' => implode('', (array)$d['content'])
        ];

        if (!empty($this->extensionSettings['follow30x'])) {
            $newUrl = $this->getRequestUrlFrom302Header(
                $d['headers'],
                isset($url['user']) ? $url['user'] : '',
                isset($url['pass']) ? $url['pass'] : ''
            );

            if ($newUrl) {
                // Follow the redirect exactly once, keeping the caller's
                // timeout and decrementing the recursion budget.
                $newRequestUrl = $this->requestUrl($newUrl, $crawlerId, $timeout, --$recursion);

                if (!is_array($newRequestUrl)) {
                    if (TYPO3_DLOG) {
                        GeneralUtility::devLog(sprintf('Error while opening "%s"', $newUrl), 'crawler', 4, ['crawlerId' => $crawlerId]);
                    }
                    return false;
                }

                $result = array_merge(['parentRequest' => $result], $newRequestUrl);
            }
        }

        return $result;
    }
1588
1589
    /**
     * Determines the base path of the website frontend, e.g. "/cms/" when
     * the site is reachable as http://mydomain.com/cms/index.php.
     *
     * Sources are tried in this order: the "frontendBasePath" extension
     * setting, $GLOBALS['TSFE']->absRefPrefix, and (outside of CLI requests)
     * the TYPO3_SITE_PATH environment value. The fallback is "/".
     *
     * @return string Base path of the website frontend, always of the form "/<pathSegments>/" or "/"
     */
    protected function getFrontendBasePath()
    {
        $basePath = '/';
        $isCliRequest = defined('TYPO3_REQUESTTYPE_CLI') && TYPO3_REQUESTTYPE_CLI;

        if (!empty($this->extensionSettings['frontendBasePath'])) {
            // Explicit extension setting wins.
            $basePath = $this->extensionSettings['frontendBasePath'];
        } elseif (!empty($GLOBALS['TSFE']->absRefPrefix)) {
            // Otherwise fall back to config.absRefPrefix.
            $basePath = $GLOBALS['TSFE']->absRefPrefix;
        } elseif (!$isCliRequest) {
            // Outside of CLI mode the path can be derived from the $_SERVER environment.
            $basePath = GeneralUtility::getIndpEnv('TYPO3_SITE_PATH');
        }

        if ($basePath == '/') {
            return $basePath;
        }

        // Normalize to exactly one leading and one trailing slash.
        return rtrim('/' . ltrim($basePath, '/'), '/') . '/';
    }
1619
1620
    /**
     * Runs the given command through shell_exec() and hands back its output.
     *
     * @param string $command Shell command to be executed
     * @return string|null Output of the command as returned by shell_exec()
     */
    protected function executeShellCommand($command)
    {
        return shell_exec($command);
    }
1631
1632
    /**
     * Reads HTTP response from the given stream.
     *
     * Header lines are read (and trimmed) up to the first empty line; every
     * remaining line is collected untouched as content. If the argument is
     * not a resource, both lists stay empty.
     *
     * @param  resource $streamPointer  Pointer to connection stream.
     * @return array                    Associative array with the following items:
     *                                  headers <array> Response headers sent by server.
     *                                  content <array> Content, with each line as an array item.
     */
    protected function getHttpResponseFromStream($streamPointer)
    {
        $response = ['headers' => [], 'content' => []];

        if (!is_resource($streamPointer)) {
            return $response;
        }

        // read headers
        // Compare against FALSE explicitly: a final line consisting only of
        // "0" is a valid line but evaluates to false in a truthiness check.
        while (($line = fgets($streamPointer, 2048)) !== false) {
            $line = trim($line);
            if ($line === '') {
                // Blank line separates headers from the body.
                break;
            }
            $response['headers'][] = $line;
        }

        // read content
        while (($line = fgets($streamPointer, 2048)) !== false) {
            $response['content'][] = $line;
        }

        return $response;
    }
1663
1664
    /**
     * Appends a timestamped line to the file configured in the
     * "logFileName" extension setting. Does nothing when no file is
     * configured; a failed write is reported through the TYPO3 devLog.
     *
     * @param string $message Text to append to the log file
     * @return void
     */
    protected function log($message)
    {
        if (empty($this->extensionSettings['logFileName'])) {
            return;
        }

        $logLine = date('Ymd His') . ' ' . $message . PHP_EOL;
        $bytesWritten = @file_put_contents($this->extensionSettings['logFileName'], $logLine, FILE_APPEND);

        if (!$bytesWritten) {
            GeneralUtility::devLog('File "' . $this->extensionSettings['logFileName'] . '" could not be written, please check file permissions.', 'crawler', LogLevel::INFO);
        }
    }
1676
1677
    /**
     * Builds HTTP request headers.
     *
     * The URL parts are expected in the shape returned by parse_url(); keys
     * missing from that array are treated as empty. An empty path is sent as
     * "/" so that the request line is always well-formed.
     *
     * @param array $url URL parts ("path", "query", "host", "user", "pass")
     * @param string $crawlerId Value for the X-T3crawler header
     *
     * @return array List of header lines, starting with the request line
     */
    protected function buildRequestHeaderArray(array $url, $crawlerId)
    {
        // parse_url() only returns the components present in the URL.
        $path = (isset($url['path']) && $url['path'] !== '') ? $url['path'] : '/';
        $query = isset($url['query']) ? (string)$url['query'] : '';
        $host = isset($url['host']) ? $url['host'] : '';
        $user = isset($url['user']) ? (string)$url['user'] : '';
        $pass = isset($url['pass']) ? $url['pass'] : '';

        $reqHeaders = [];
        $reqHeaders[] = 'GET ' . $path . ($query !== '' ? '?' . $query : '') . ' HTTP/1.0';
        $reqHeaders[] = 'Host: ' . $host;
        if (stripos($query, 'ADMCMD_previewWS') !== false) {
            // Workspace preview requests get a backend user cookie.
            $reqHeaders[] = 'Cookie: $Version="1"; be_typo_user="1"; $Path=/';
        }
        $reqHeaders[] = 'Connection: close';
        if ($user !== '') {
            $reqHeaders[] = 'Authorization: Basic ' . base64_encode($user . ':' . $pass);
        }
        $reqHeaders[] = 'X-T3crawler: ' . $crawlerId;
        $reqHeaders[] = 'User-Agent: TYPO3 crawler';

        return $reqHeaders;
    }
1701
1702
    /**
     * Check if the submitted HTTP-Header contains a redirect location and built new crawler-url
     *
     * Only responses whose status line contains "301 Moved", "302 Found" or
     * "302 Moved" are considered. If HTTP auth credentials are given, they
     * are injected into the redirect target.
     *
     * @param array $headers HTTP header lines, status line first
     * @param string $user HTTP Auth. User
     * @param string $pass HTTP Auth. Password
     * @return bool|string Redirect URL, or FALSE if there is no usable redirect
     */
    protected function getRequestUrlFrom302Header($headers, $user = '', $pass = '')
    {
        if (!is_array($headers)) {
            return false;
        }

        $statusLine = isset($headers[0]) ? (string)$headers[0] : '';
        $isRedirect = stripos($statusLine, '301 Moved') !== false
            || stripos($statusLine, '302 Found') !== false
            || stripos($statusLine, '302 Moved') !== false;
        if (!$isRedirect) {
            return false;
        }

        $location = null;
        foreach ($headers as $headerLine) {
            // Split on the first colon only, so values containing ":" stay
            // intact; lines without a colon (status line) are skipped.
            $parts = explode(':', $headerLine, 2);
            if (count($parts) < 2) {
                continue;
            }
            // Header field names are case-insensitive.
            if (strcasecmp(trim($parts[0]), 'Location') === 0) {
                $location = trim($parts[1]);
                break;
            }
        }
        if ($location === null) {
            return false;
        }

        if ($user == '') {
            return $location;
        }

        $target = parse_url($location);
        if (!$target) {
            return false;
        }

        // Rebuild the URL with credentials, keeping an explicit port.
        $newUrl = (isset($target['scheme']) ? $target['scheme'] : '') . '://'
            . $user . ':' . $pass . '@'
            . (isset($target['host']) ? $target['host'] : '')
            . (isset($target['port']) ? ':' . $target['port'] : '')
            . (isset($target['path']) ? $target['path'] : '');
        if (isset($target['query']) && $target['query'] != '') {
            $newUrl .= '?' . $target['query'];
        }

        return $newUrl;
    }
1744
1745
    /**************************
1746
     *
1747
     * tslib_fe hooks:
1748
     *
1749
     **************************/
1750
1751
    /**
     * Initialization hook (called after database connection)
     * Takes the "HTTP_X_T3CRAWLER" header and looks up queue record and verifies if the session comes from the system (by comparing hashes)
     *
     * The header has the form "<qid>:<hash>". On success the crawler state is
     * stored in $params['pObj']->applicationData['tx_crawler']; on failure the
     * request is terminated.
     *
     * @param array $params Parameters from frontend
     * @param object $ref TSFE object (unused)
     * @return void
     *
     * FIXME: Look like this is not used, in commit 9910d3f40cce15f4e9b7bcd0488bf21f31d53ebc it's added as public,
     * FIXME: I think this can be removed. (TNM)
     */
    public function fe_init(&$params, $ref)
    {
        // Nothing to do for requests that do not claim to come from the crawler.
        if (!isset($_SERVER['HTTP_X_T3CRAWLER'])) {
            return;
        }

        // Authenticate crawler request:
        // array_pad() guards against a header value without ":" separator.
        list($queueId, $hash) = array_pad(explode(':', $_SERVER['HTTP_X_T3CRAWLER']), 2, '');

        // exec_SELECTgetSingleRow() already returns the row itself, so it
        // must not be destructured with list().
        $queueRec = $this->db->exec_SELECTgetSingleRow('*', 'tx_crawler_queue', 'qid=' . intval($queueId));

        // If a crawler record was found and hash was matching, set it up:
        if (is_array($queueRec) && $hash === md5($queueRec['qid'] . '|' . $queueRec['set_id'] . '|' . $GLOBALS['TYPO3_CONF_VARS']['SYS']['encryptionKey'])) {
            $params['pObj']->applicationData['tx_crawler']['running'] = true;
            $params['pObj']->applicationData['tx_crawler']['parameters'] = unserialize($queueRec['parameters']);
            $params['pObj']->applicationData['tx_crawler']['log'] = [];
        } else {
            die('No crawler entry found!');
        }
    }
1779
1780
    /*****************************
1781
     *
1782
     * Compiling URLs to crawl - tools
1783
     *
1784
     *****************************/
1785
1786
    /**
     * Walks the page tree below the given page and collects the table rows
     * produced by drawURLs_addRowsForPage() for every page found.
     *
     * The request settings passed in are stored on the controller first, as
     * they are read again while the rows for each page are generated. Pages
     * of doktype 7 (mount points) additionally get the rows of their mounted
     * sub tree.
     *
     * @param integer $id Root page id to start from.
     * @param integer $depth Depth of tree, 0=only id-page, 1= on sublevel, 99 = infinite
     * @param integer $scheduledTime Unix Time when the URL is timed to be visited when put in queue
     * @param integer $reqMinute Number of requests per minute (creates the interleave between requests)
     * @param boolean $submitCrawlUrls If set, submits the URLs to queue in database (real crawling)
     * @param boolean $downloadCrawlUrls If set (and submitcrawlUrls is false) will fill $downloadUrls with entries)
     * @param array $incomingProcInstructions Array of processing instructions
     * @param array $configurationSelection Array of configuration keys
     * @return string HTML table rows for all visited pages
     */
    public function getPageTreeAndUrls(
        $id,
        $depth,
        $scheduledTime,
        $reqMinute,
        $submitCrawlUrls,
        $downloadCrawlUrls,
        array $incomingProcInstructions,
        array $configurationSelection
    ) {
        global $LANG;
        if (!is_object($LANG)) {
            $LANG = GeneralUtility::makeInstance(LanguageService::class);
            $LANG->init(0);
        }
        $this->scheduledTime = $scheduledTime;
        $this->reqMinute = $reqMinute;
        $this->submitCrawlUrls = $submitCrawlUrls;
        $this->downloadCrawlUrls = $downloadCrawlUrls;
        $this->incomingProcInstructions = $incomingProcInstructions;
        $this->incomingConfigurationSelection = $configurationSelection;

        $this->duplicateTrack = [];
        $this->downloadUrls = [];

        // Drawing tree:
        /* @var PageTreeView $tree */
        $tree = GeneralUtility::makeInstance(PageTreeView::class);
        $perms_clause = $GLOBALS['BE_USER']->getPagePermsClause(1);
        $tree->init('AND ' . $perms_clause);

        $pageInfo = BackendUtility::readPageAccess($id, $perms_clause);
        if (is_array($pageInfo)) {
            // Set root row:
            $tree->tree[] = [
                'row' => $pageInfo,
                'HTML' => IconUtility::getIconForRecord('pages', $pageInfo)
            ];
        }

        // Get branch beneath:
        if ($depth) {
            $tree->getTree($id, $depth, '');
        }

        // Traverse page tree:
        $code = '';

        foreach ($tree->tree as $data) {
            $this->MP = false;

            // recognize mount points
            if ($data['row']['doktype'] == 7) {
                // Cast the uid before it is concatenated into the where-clause.
                $mountpage = $this->db->exec_SELECTgetRows('*', 'pages', 'uid = ' . (int)$data['row']['uid']);

                // fetch mounted pages
                $this->MP = $mountpage[0]['mount_pid'] . '-' . $data['row']['uid'];

                $mountTree = GeneralUtility::makeInstance(PageTreeView::class);
                $mountTree->init('AND ' . $perms_clause);
                $mountTree->getTree($mountpage[0]['mount_pid'], $depth, '');

                foreach ($mountTree->tree as $mountData) {
                    $code .= $this->drawURLs_addRowsForPage(
                        $mountData['row'],
                        $mountData['HTML'] . BackendUtility::getRecordTitle('pages', $mountData['row'], true)
                    );
                }

                // replace page when mount_pid_ol is enabled
                if ($mountpage[0]['mount_pid_ol']) {
                    $data['row']['uid'] = $mountpage[0]['mount_pid'];
                } else {
                    // if the mount_pid_ol is not set the MP must not be used for the mountpoint page
                    $this->MP = false;
                }
            }

            $code .= $this->drawURLs_addRowsForPage(
                $data['row'],
                $data['HTML'] . BackendUtility::getRecordTitle('pages', $data['row'], true)
            );
        }

        return $code;
    }
1884
1885
    /**
     * Expands exclude string
     *
     * The string is a comma separated list of parts of the form "<pid>" or
     * "<pid>+<depth>". A part without "+" excludes only the page itself; a
     * part with "+" but without a depth is expanded with depth 99. Results
     * are cached per exclude string for the duration of the request.
     *
     * @param string $excludeString Exclude string
     * @return array Unique list of excluded page ids
     */
    public function expandExcludeString($excludeString)
    {
        // internal static caches;
        static $expandedExcludeStringCache;
        static $treeCache;

        // isset() instead of empty(): an empty result list is a valid cache entry.
        if (!isset($expandedExcludeStringCache[$excludeString])) {
            $pidList = [];

            if (!empty($excludeString)) {
                /** @var PageTreeView $tree */
                $tree = GeneralUtility::makeInstance(PageTreeView::class);
                $tree->init('AND ' . $this->backendUser->getPagePermsClause(1));

                $excludeParts = GeneralUtility::trimExplode(',', $excludeString);

                foreach ($excludeParts as $excludePart) {
                    // A part without "+" yields a single element only, so the
                    // depth has to be read with a guard.
                    $pidAndDepth = GeneralUtility::trimExplode('+', $excludePart);
                    $pid = $pidAndDepth[0];
                    $depth = isset($pidAndDepth[1]) ? $pidAndDepth[1] : '';

                    // default is "page only" = "depth=0"
                    if (empty($depth)) {
                        $depth = (stristr($excludePart, '+')) ? 99 : 0;
                    }

                    $pidList[] = $pid;

                    if ($depth > 0) {
                        if (!isset($treeCache[$pid][$depth])) {
                            $tree->reset();
                            $tree->getTree($pid, $depth);
                            $treeCache[$pid][$depth] = $tree->tree;
                        }

                        foreach ($treeCache[$pid][$depth] as $data) {
                            $pidList[] = $data['row']['uid'];
                        }
                    }
                }
            }

            $expandedExcludeStringCache[$excludeString] = array_unique($pidList);
        }

        return $expandedExcludeStringCache[$excludeString];
    }
1936
1937
    /**
     * Builds the table rows shown for one page of the page tree.
     * Every crawler configuration that applies to the page yields one row
     * listing its GET parameter setup, the expanded values and the URLs.
     *
     * @param    array        $pageRow Page row
     * @param    string        $pageTitleAndIcon Page icon and title for row
     * @return    string        HTML <tr> content (one or more)
     */
    public function drawURLs_addRowsForPage(array $pageRow, $pageTitleAndIcon)
    {
        $skipMessage = '';

        // Configurations applying to this page
        $configurations = $this->getUrlsForPageRow($pageRow, $skipMessage);

        // Keep only the configurations that are part of the current selection
        if (count($this->incomingConfigurationSelection) > 0) {
            foreach ($configurations as $candidateKey => $unusedConfiguration) {
                if (!in_array($candidateKey, $this->incomingConfigurationSelection)) {
                    unset($configurations[$candidateKey]);
                }
            }
        }

        // Nothing to show for this page: single "No entries" row
        if (!count($configurations)) {
            $message = !empty($skipMessage) ? ' (' . $skipMessage . ')' : '';

            return '
                <tr class="bgColor-20" style="border-bottom: 1px solid black;">
                    <td>' . $pageTitleAndIcon . '</td>
                    <td colspan="6"><em>No entries</em>' . $message . '</td>
                </tr>';
        }

        // One row per configuration
        $rowIndex = 0;
        $content = '';
        foreach ($configurations as $confKey => $confArray) {
            // The title cell is only rendered once and spans all rows of the page
            $titleClm = $rowIndex
                ? ''
                : '<td rowspan="' . count($configurations) . '">' . $pageTitleAndIcon . '</td>';
            $rowClassSuffix = $rowIndex % 2 ? '-20' : '-10';

            $pageIsExcluded = in_array($pageRow['uid'], $this->expandExcludeString($confArray['subCfg']['exclude']));
            if ($pageIsExcluded) {
                $content .= '<tr class="bgColor' . $rowClassSuffix . '">
                            ' . $titleClm . '
                            <td>' . htmlspecialchars($confKey) . '</td>
                            <td colspan="5"><em>No entries</em> (Page is excluded in this configuration)</td>
                        </tr>';
                $rowIndex++;
                continue;
            }

            // URL list (an empty instruction list means no filtering by processing instructions)
            $urlList = $this->urlListFromUrlArray(
                $confArray,
                $pageRow,
                $this->scheduledTime,
                $this->reqMinute,
                $this->submitCrawlUrls,
                $this->downloadCrawlUrls,
                $this->duplicateTrack,
                $this->downloadUrls,
                $this->incomingProcInstructions
            );

            // Expanded parameters, plus the number of resulting combinations
            $paramRows = '';
            $valueCounts = [];
            $combinationCount = 1;
            foreach ($confArray['paramExpanded'] as $gVar => $gVal) {
                $paramRows .= '
                            <tr>
                                <td class="bgColor4-20">' . htmlspecialchars('&' . $gVar . '=') . '<br/>' .
                    '(' . count($gVal) . ')' .
                    '</td>
                                <td class="bgColor4" nowrap="nowrap">' . nl2br(htmlspecialchars(implode(chr(10), $gVal))) . '</td>
                            </tr>
                        ';
                $valueCounts[] = count($gVal);
                $combinationCount *= count($gVal);
            }
            $paramExpanded = '<table class="lrPadding c-list param-expanded">' . $paramRows . '</table>'
                . 'Comb: ' . implode('*', $valueCounts) . '=' . $combinationCount;

            // Options
            $optionValues = '';
            if ($confArray['subCfg']['userGroups']) {
                $optionValues .= 'User Groups: ' . $confArray['subCfg']['userGroups'] . '<br/>';
            }
            if ($confArray['subCfg']['baseUrl']) {
                $optionValues .= 'Base Url: ' . $confArray['subCfg']['baseUrl'] . '<br/>';
            }
            if ($confArray['subCfg']['procInstrFilter']) {
                $optionValues .= 'ProcInstr: ' . $confArray['subCfg']['procInstrFilter'] . '<br/>';
            }

            // Compile row:
            $content .= '
                        <tr class="bgColor' . $rowClassSuffix . '">
                            ' . $titleClm . '
                            <td>' . htmlspecialchars($confKey) . '</td>
                            <td>' . nl2br(htmlspecialchars(rawurldecode(trim(str_replace('&', chr(10) . '&', GeneralUtility::implodeArrayForUrl('', $confArray['paramParsed'])))))) . '</td>
                            <td>' . $paramExpanded . '</td>
                            <td nowrap="nowrap">' . $urlList . '</td>
                            <td nowrap="nowrap">' . $optionValues . '</td>
                            <td nowrap="nowrap">' . DebugUtility::viewArray($confArray['subCfg']['procInstrParams.']) . '</td>
                        </tr>';

            $rowIndex++;
        }

        return $content;
    }
2054
2055
    /*****************************
2056
     *
2057
     * CLI functions
2058
     *
2059
     *****************************/
2060
2061
    /**
     * Main function for running from Command Line PHP script (cron job)
     * See ext/crawler/cli/crawler_cli.phpsh for details
     *
     * Prints the help text and exits when -h / --help is given. Otherwise, provided the
     * crawler is not disabled and a process slot could be acquired, it runs CLI_run(),
     * deletes process records without assigned items and releases the current process.
     *
     * @return int combination of the self::CLI_STATUS_* flags describing the outcome of the run
     */
    public function CLI_main()
    {
        $this->setAccessMode('cli');
        $result = self::CLI_STATUS_NOTHING_PROCCESSED;
        $cliObj = GeneralUtility::makeInstance(CrawlerCommandLineController::class);

        if (isset($cliObj->cli_args['-h']) || isset($cliObj->cli_args['--help'])) {
            $cliObj->cli_validateArgs();
            $cliObj->cli_help();
            exit;
        }

        if (!$this->getDisabled() && $this->CLI_checkAndAcquireNewProcess($this->CLI_buildProcessId())) {
            // Command line values win over the extension settings
            $countInARun = $cliObj->cli_argValue('--countInARun') ? intval($cliObj->cli_argValue('--countInARun')) : $this->extensionSettings['countInARun'];
            // Seconds (CLI_run() passes this value to sleep())
            $sleepAfterFinish = $cliObj->cli_argValue('--sleepAfterFinish') ? intval($cliObj->cli_argValue('--sleepAfterFinish')) : $this->extensionSettings['sleepAfterFinish'];
            // Microseconds (CLI_run() passes this value to usleep())
            $sleepTime = $cliObj->cli_argValue('--sleepTime') ? intval($cliObj->cli_argValue('--sleepTime')) : $this->extensionSettings['sleepTime'];

            try {
                // Run process:
                $result = $this->CLI_run($countInARun, $sleepTime, $sleepAfterFinish);
            } catch (\Exception $e) {
                $this->CLI_debug(get_class($e) . ': ' . $e->getMessage());
                $result = self::CLI_STATUS_ABORTED;
            }

            // Cleanup
            $this->db->exec_DELETEquery('tx_crawler_process', 'assigned_items_count = 0');

            //TODO can't we do that in a clean way?
            $this->CLI_releaseProcesses($this->CLI_buildProcessId());

            // Query the remaining items once so the debug output and the status flag agree
            $remainingItems = $this->queueRepository->countUnprocessedItems();
            $this->CLI_debug("Unprocessed Items remaining:" . $remainingItems . " (" . $this->CLI_buildProcessId() . ")");
            $result |= ($remainingItems > 0 ? self::CLI_STATUS_REMAIN : self::CLI_STATUS_NOTHING_PROCCESSED);
        } else {
            $result |= self::CLI_STATUS_ABORTED;
        }

        return $result;
    }
2108
2109
    /**
     * Function executed by crawler_im.php cli script.
     *
     * Collects the URLs for the requested page via getPageTreeAndUrls() and, depending on
     * the -o option, prints the download URLs ('url'), reports the entries put into the
     * queue ('queue'), executes the collected queue entries right away ('exec') or only
     * lists what was found (any other value).
     *
     * @return void
     */
    public function CLI_main_im()
    {
        $this->setAccessMode('cli_im');

        $cliObj = GeneralUtility::makeInstance(QueueCommandLineController::class);

        // Force user to admin state and set workspace to "Live":
        $this->backendUser->user['admin'] = 1;
        $this->backendUser->setWorkspace(0);

        // Print help
        if (!isset($cliObj->cli_args['_DEFAULT'][1])) {
            $cliObj->cli_validateArgs();
            $cliObj->cli_help();
            exit;
        }

        $cliObj->cli_validateArgs();

        // The output mode drives every decision below, so read it only once
        $outputMode = $cliObj->cli_argValue('-o');
        $addToQueue = ($outputMode === 'queue' || $outputMode === 'exec');

        if ($outputMode === 'exec') {
            $this->registerQueueEntriesInternallyOnly = true;
        }

        if (isset($cliObj->cli_args['_DEFAULT'][2])) {
            // Crawler is called over TYPO3 BE
            $pageId = MathUtility::forceIntegerInRange($cliObj->cli_args['_DEFAULT'][2], 0);
        } else {
            // Crawler is called over cli
            $pageId = MathUtility::forceIntegerInRange($cliObj->cli_args['_DEFAULT'][1], 0);
        }

        $configurationKeys = $this->getConfigurationKeys($cliObj);

        // Fall back to all configurations available for the page
        if (!is_array($configurationKeys)) {
            $configurations = $this->getUrlsForPageId($pageId);
            if (is_array($configurations)) {
                $configurationKeys = array_keys($configurations);
            } else {
                $configurationKeys = [];
            }
        }

        if ($addToQueue) {
            $reason = new Reason();
            $reason->setReason(Reason::REASON_GUI_SUBMIT);
            $reason->setDetailText('The cli script of the crawler added to the queue');

            // The event dispatcher is deprecated since crawler v6.3.0, will be removed in crawler v7.0.0.
            // Please use the Signal instead.
            // NOTE(review): $this->setID is only regenerated further below, so the value posted
            // here is whatever was set before this call - confirm this is intended.
            EventDispatcher::getInstance()->post(
                'invokeQueueChange',
                $this->setID,
                ['reason' => $reason]
            );

            // emitSignal() takes its payload by reference, so it has to be a variable and not a literal
            $signalPayload = ['reason' => $reason];
            SignalSlotUtility::emitSignal(
                __CLASS__,
                SignalSlotUtility::SIGNAL_INVOKE_QUEUE_CHANGE,
                $signalPayload
            );
        }

        if ($this->extensionSettings['cleanUpOldQueueEntries']) {
            $this->cleanUpOldQueueEntries();
        }

        $this->setID = (int) GeneralUtility::md5int(microtime());
        $this->getPageTreeAndUrls(
            $pageId,
            MathUtility::forceIntegerInRange($cliObj->cli_argValue('-d'), 0, 99),
            $this->getCurrentTime(),
            MathUtility::forceIntegerInRange($cliObj->cli_isArg('-n') ? $cliObj->cli_argValue('-n') : 30, 1, 1000),
            $addToQueue,
            $outputMode === 'url',
            GeneralUtility::trimExplode(',', $cliObj->cli_argValue('-proc'), true),
            $configurationKeys
        );

        if ($outputMode === 'url') {
            $cliObj->cli_echo(implode(chr(10), $this->downloadUrls) . chr(10), true);
        } elseif ($outputMode === 'exec') {
            $cliObj->cli_echo("Executing " . count($this->urlList) . " requests right away:\n\n");
            $cliObj->cli_echo(implode(chr(10), $this->urlList) . chr(10));
            $cliObj->cli_echo("\nProcessing:\n");

            foreach ($this->queueEntries as $queueRec) {
                $p = unserialize($queueRec['parameters']);
                $cliObj->cli_echo($p['url'] . ' (' . implode(',', $p['procInstructions']) . ') => ');

                $result = $this->readUrlFromArray($queueRec);

                // NOTE(review): unserialize() runs on content returned by readUrlFromArray(),
                // presumably a fetched response - consider restricting the allowed classes.
                $requestResult = unserialize($result['content']);
                if (is_array($requestResult)) {
                    // One log line per row, indented by two tabs
                    $resLog = is_array($requestResult['log']) ? chr(10) . chr(9) . chr(9) . implode(chr(10) . chr(9) . chr(9), $requestResult['log']) : '';
                    $cliObj->cli_echo('OK: ' . $resLog . chr(10));
                } else {
                    $cliObj->cli_echo('Error checking Crawler Result: ' . substr(preg_replace('/\s+/', ' ', strip_tags($result['content'])), 0, 30000) . '...' . chr(10));
                }
            }
        } elseif ($outputMode === 'queue') {
            $cliObj->cli_echo("Putting " . count($this->urlList) . " entries in queue:\n\n");
            $cliObj->cli_echo(implode(chr(10), $this->urlList) . chr(10));
        } else {
            $cliObj->cli_echo(count($this->urlList) . " entries found for processing. (Use -o to decide action):\n\n", true);
            $cliObj->cli_echo(implode(chr(10), $this->urlList) . chr(10), true);
        }
    }
2222
2223
    /**
     * CLI entry point working on the arguments of the FlushCommandLineController.
     *
     * Requires a page id as first default argument; without it the help text is printed
     * and the script exits. The -o option selects what is handled: 'all', 'finished' or
     * 'pending'. Any other value prints the help text and yields false.
     *
     * The actual work is delegated to getLogEntriesForPageId() with its third argument set
     * to true (presumably the "flush" switch - confirm against that method).
     *
     * @return bool false if -o held an unsupported mode or getLogEntriesForPageId() returned false
     */
    public function CLI_main_flush()
    {
        $this->setAccessMode('cli_flush');
        $cliObj = GeneralUtility::makeInstance(FlushCommandLineController::class);

        // Force user to admin state and set workspace to "Live":
        $this->backendUser->user['admin'] = 1;
        $this->backendUser->setWorkspace(0);

        // Arguments are validated in either case; without a page id only the help is shown
        $pageArgumentGiven = isset($cliObj->cli_args['_DEFAULT'][1]);
        $cliObj->cli_validateArgs();
        if (!$pageArgumentGiven) {
            $cliObj->cli_help();
            exit;
        }

        $pageId = MathUtility::forceIntegerInRange($cliObj->cli_args['_DEFAULT'][1], 0);
        // Page id 0 requests a full flush
        $fullFlush = ($pageId == 0);

        $mode = $cliObj->cli_argValue('-o');

        if ($mode == 'all') {
            // 'all' is passed on as an empty filter
            $flushResult = $this->getLogEntriesForPageId($pageId, '', true, $fullFlush);
        } elseif ($mode == 'finished' || $mode == 'pending') {
            $flushResult = $this->getLogEntriesForPageId($pageId, $mode, true, $fullFlush);
        } else {
            $cliObj->cli_validateArgs();
            $cliObj->cli_help();
            $flushResult = false;
        }

        return $flushResult !== false;
    }
2266
2267
    /**
     * Reads the comma separated list given with the -conf CLI argument.
     *
     * @param QueueCommandLineController $cliObj
     * @return array the trimmed list entries, or an empty array if -conf is empty
     *
     * @deprecated since crawler v6.3.0, will be removed in crawler v7.0.0.
     */
    protected function getConfigurationKeys(QueueCommandLineController $cliObj)
    {
        $rawKeys = trim($cliObj->cli_argValue('-conf'));

        if ($rawKeys === '') {
            return [];
        }

        return GeneralUtility::trimExplode(',', $rawKeys);
    }
2280
2281
    /**
     * Running the functionality of the CLI (crawling URLs from queue)
     *
     * Runs the CLI hooks, purges executed queue entries older than the configured
     * "purgeQueueDays", reserves up to $countInARun due queue entries for the current
     * process and reads them one by one.
     *
     * @param int $countInARun maximum number of queue entries to select for this run
     * @param int $sleepTime pause after each entry, passed to usleep() (microseconds)
     * @param int $sleepAfterFinish pause after the last entry, passed to sleep() (seconds)
     * @return int combination of the results of readUrl() and the self::CLI_STATUS_* flags
     */
    public function CLI_run($countInARun, $sleepTime, $sleepAfterFinish)
    {
        $result = 0;
        $counter = 0;

        // First, run hooks:
        $this->CLI_runHooks();

        // Clean up the queue
        $purgeQueueDays = intval($this->extensionSettings['purgeQueueDays']);
        if ($purgeQueueDays > 0) {
            $purgeDate = $this->getCurrentTime() - 24 * 60 * 60 * $purgeQueueDays;
            $del = $this->db->exec_DELETEquery(
                'tx_crawler_queue',
                'exec_time!=0 AND exec_time<' . $purgeDate
            );
            if (false == $del) {
                GeneralUtility::devLog('Records could not be deleted.', 'crawler', LogLevel::INFO);
            }
        }

        // Select entries:
        //TODO Shouldn't this reside within the transaction?
        $rows = $this->db->exec_SELECTgetRows(
            'qid,scheduled',
            'tx_crawler_queue',
            'exec_time=0
                AND process_scheduled= 0
                AND scheduled<=' . $this->getCurrentTime(),
            '',
            'scheduled, qid',
            intval($countInARun)
        );

        // The result is not guaranteed to be an array, so treat a non-array like an empty queue
        if (!is_array($rows) || count($rows) === 0) {
            $this->CLI_debug("Nothing within queue which needs to be processed (" . $this->CLI_buildProcessId() . ")");

            return $result;
        }

        $quidList = [];
        foreach ($rows as $r) {
            // Cast to int as the ids are concatenated into the WHERE clause below
            $quidList[] = (int) $r['qid'];
        }

        $processId = $this->CLI_buildProcessId();

        //reserve queue entries for process
        $this->db->sql_query('BEGIN');
        //TODO make sure we're not taking assigned queue-entires
        $this->db->exec_UPDATEquery(
            'tx_crawler_queue',
            'qid IN (' . implode(',', $quidList) . ')',
            [
                'process_scheduled' => intval($this->getCurrentTime()),
                'process_id' => $processId
            ]
        );

        //save the number of assigned queue entries to determine how many have been processed later
        $numberOfAffectedRows = $this->db->sql_affected_rows();
        $this->db->exec_UPDATEquery(
            'tx_crawler_process',
            "process_id = '" . $processId . "'",
            [
                'assigned_items_count' => intval($numberOfAffectedRows)
            ]
        );

        // Fewer updated rows than selected ids is treated as a collision with another process
        if ($numberOfAffectedRows == count($quidList)) {
            $this->db->sql_query('COMMIT');
        } else {
            $this->db->sql_query('ROLLBACK');
            $this->CLI_debug("Nothing processed due to multi-process collision (" . $this->CLI_buildProcessId() . ")");

            return ($result | self::CLI_STATUS_ABORTED);
        }

        foreach ($rows as $r) {
            $result |= $this->readUrl($r['qid']);

            $counter++;
            usleep(intval($sleepTime)); // Just to relax the system

            // if during the start and the current read url the cli has been disable we need to return from the function
            // mark the process NOT as ended.
            if ($this->getDisabled()) {
                return ($result | self::CLI_STATUS_ABORTED);
            }

            if (!$this->CLI_checkIfProcessIsActive($this->CLI_buildProcessId())) {
                $this->CLI_debug("conflict / timeout (" . $this->CLI_buildProcessId() . ")");

                //TODO might need an additional returncode
                $result |= self::CLI_STATUS_ABORTED;
                break; //possible timeout
            }
        }

        sleep(intval($sleepAfterFinish));

        $this->CLI_debug('Rows: ' . $counter . " (" . $this->CLI_buildProcessId() . ")");

        if ($counter > 0) {
            $result |= self::CLI_STATUS_PROCESSED;
        }

        return $result;
    }
2396
2397
    /**
     * Activate hooks
     *
     * Instantiates every object reference registered in
     * $GLOBALS['TYPO3_CONF_VARS']['EXTCONF']['crawler']['cli_hooks'] and calls its
     * crawler_init() method with this controller as argument.
     *
     * @return void
     */
    public function CLI_runHooks()
    {
        // isset() first: the hook registration is optional and must not raise an undefined index notice
        if (!isset($GLOBALS['TYPO3_CONF_VARS']['EXTCONF']['crawler']['cli_hooks'])
            || !is_array($GLOBALS['TYPO3_CONF_VARS']['EXTCONF']['crawler']['cli_hooks'])
        ) {
            return;
        }

        foreach ($GLOBALS['TYPO3_CONF_VARS']['EXTCONF']['crawler']['cli_hooks'] as $objRef) {
            // Plain assignment: objects are passed by handle, no reference assignment needed
            $hookObj = GeneralUtility::getUserObj($objRef);
            if (is_object($hookObj)) {
                $hookObj->crawler_init($this);
            }
        }
    }
2414
2415
    /**
     * Tries to register a new process record for the given id.
     *
     * Active processes whose ttl lies in the past are collected as orphans and released;
     * they do not count against the configured "processLimit". Processes already marked
     * as deleted are removed afterwards.
     * @todo preemption might not be the most elegant way to clean up
     *
     * @param string $id identification string for the process
     * @return boolean true if a process record was created, false if the system process id
     *                 could not be determined or the process limit is reached
     */
    public function CLI_checkAndAcquireNewProcess($id)
    {
        $systemProcessId = getmypid();
        if ($systemProcessId < 1) {
            return false;
        }

        $activeProcesses = 0;
        $expiredProcessIds = [];

        $this->db->sql_query('BEGIN');

        $res = $this->db->exec_SELECTquery('process_id,ttl', 'tx_crawler_process', 'active=1 AND deleted=0');

        $currentTime = $this->getCurrentTime();

        // Split the active records into expired (orphaned) and still running ones
        while ($row = $this->db->sql_fetch_assoc($res)) {
            if ($row['ttl'] < $currentTime) {
                $expiredProcessIds[] = $row['process_id'];
                continue;
            }
            $activeProcesses++;
        }

        $processLimit = intval($this->extensionSettings['processLimit']);
        $acquired = $activeProcesses < $processLimit;

        if ($acquired) {
            // if there are less than allowed active processes then add a new one
            $this->CLI_debug("add process " . $this->CLI_buildProcessId() . " (" . ($activeProcesses + 1) . "/" . $processLimit . ")");

            $processRecord = [
                'process_id' => $id,
                'active' => '1',
                'ttl' => ($currentTime + intval($this->extensionSettings['processMaxRunTime'])),
                'system_process_id' => $systemProcessId
            ];
            $this->db->exec_INSERTquery('tx_crawler_process', $processRecord);
        } else {
            $this->CLI_debug("Processlimit reached (" . $activeProcesses . "/" . $processLimit . ")");
        }

        // maybe this should be somehow included into the current lock
        $this->CLI_releaseProcesses($expiredProcessIds, true);
        $this->CLI_deleteProcessesMarkedDeleted();

        $this->db->sql_query('COMMIT');

        return $acquired;
    }
2479
2480
    /**
     * Release a process and the required resources
     *
     * Runs four updates in this order:
     *  1. queue entries assigned to inactive, not deleted processes are unassigned,
     *  2. inactive processes without queue entries waiting for execution are marked as deleted,
     *  3. the requested processes are marked as inactive,
     *  4. not yet executed queue entries of the requested processes are unassigned.
     * A process released here is therefore only marked as deleted by a later call.
     *
     * @param  mixed    $releaseIds   string with a single process-id or array with multiple process-ids
     * @param  boolean  $withinLock   show whether the DB-actions are included within an existing lock
     * @return boolean  false if there was nothing to release, true otherwise
     */
    public function CLI_releaseProcesses($releaseIds, $withinLock = false)
    {
        if (!is_array($releaseIds)) {
            $releaseIds = [$releaseIds];
        }

        if (count($releaseIds) === 0) {
            return false;   //nothing to release
        }

        // Escape and quote every id once; the list is used for both tables below
        $quotedIds = [];
        foreach ($releaseIds as $releaseId) {
            $quotedIds[] = $this->db->fullQuoteStr($releaseId, 'tx_crawler_process');
        }
        $quotedIdList = implode(',', $quotedIds);

        if (!$withinLock) {
            $this->db->sql_query('BEGIN');
        }

        // some kind of 2nd chance algo - this way you need at least 2 processes to have a real cleanup
        // this ensures that a single process can't mess up the entire process table

        // unassign all queue entries which still belong to inactive processes
        $this->db->exec_UPDATEquery(
            'tx_crawler_queue',
            'process_id IN (SELECT process_id FROM tx_crawler_process WHERE active=0 AND deleted=0)',
            [
                'process_scheduled' => 0,
                'process_id' => ''
            ]
        );
        // mark all processes as deleted which have no "waiting" queue-entries and which are not active
        $this->db->exec_UPDATEquery(
            'tx_crawler_process',
            'active=0 AND deleted=0
            AND NOT EXISTS (
                SELECT * FROM tx_crawler_queue
                WHERE tx_crawler_queue.process_id = tx_crawler_process.process_id
                AND tx_crawler_queue.exec_time = 0
            )',
            [
                'deleted' => '1',
                'system_process_id' => 0
            ]
        );
        // mark all requested processes as non-active
        $this->db->exec_UPDATEquery(
            'tx_crawler_process',
            'process_id IN (' . $quotedIdList . ') AND deleted=0',
            [
                'active' => '0'
            ]
        );
        // unassign the not yet executed queue entries of the requested processes
        $this->db->exec_UPDATEquery(
            'tx_crawler_queue',
            'exec_time=0 AND process_id IN (' . $quotedIdList . ')',
            [
                'process_scheduled' => 0,
                'process_id' => ''
            ]
        );

        if (!$withinLock) {
            $this->db->sql_query('COMMIT');
        }

        return true;
    }
2549
2550
    /**
     * Removes all records flagged with deleted = 1 from the process table.
     *
     * @return void
     */
    public function CLI_deleteProcessesMarkedDeleted()
    {
        $processTable = 'tx_crawler_process';
        $flaggedAsDeleted = 'deleted = 1';

        $this->db->exec_DELETEquery($processTable, $flaggedAsDeleted);
    }
2559
2560
    /**
     * Check if there are still resources left for the process with the given id
     * Used to determine timeouts and to ensure a proper cleanup if there's a timeout
     *
     * Looks up the first not deleted process record with the given id (ordered by ttl)
     * and reports whether its "active" flag is set. As this is a single read-only query
     * it is not wrapped in a transaction.
     *
     * @param  string  $pid identification string for the process
     * @return boolean determines if the process is still active / has resources
     */
    public function CLI_checkIfProcessIsActive($pid)
    {
        $res = $this->db->exec_SELECTquery(
            'process_id,active,ttl',
            'tx_crawler_process',
            // escape and quote the id instead of concatenating it raw into the statement
            'process_id = ' . $this->db->fullQuoteStr($pid, 'tx_crawler_process') . ' AND deleted=0',
            '',
            'ttl',
            '0,1'
        );

        $row = $this->db->sql_fetch_assoc($res);
        if (!$row) {
            // no matching record: the process is not (or no longer) registered
            return false;
        }

        return (int) $row['active'] === 1;
    }
2588
2589
    /**
     * Returns the id of the current process.
     *
     * The id is generated on first use via GeneralUtility::shortMD5() from the current
     * microtime and kept in $this->processID for all later calls.
     *
     * @return string  the ID
     */
    public function CLI_buildProcessId()
    {
        if ($this->processID) {
            return $this->processID;
        }

        $seed = $this->microtime(true);
        $this->processID = GeneralUtility::shortMD5($seed);

        return $this->processID;
    }
2601
2602
    /**
     * Thin wrapper around PHP's native microtime().
     *
     * @param bool $get_as_float passed through to microtime()
     *
     * @return mixed whatever microtime() returns for the given flag
     */
    protected function microtime($get_as_float = false)
    {
        $currentMicrotime = \microtime($get_as_float);

        return $currentMicrotime;
    }
2611
2612
    /**
     * Echoes a message followed by a newline and flushes the output buffer.
     * Nothing happens unless the extension setting "processDebug" is a non-zero integer.
     *
     * @param  string $msg  the message
     */
    public function CLI_debug($msg)
    {
        $debugEnabled = intval($this->extensionSettings['processDebug']) !== 0;
        if (!$debugEnabled) {
            return;
        }

        echo $msg . "\n";
        flush();
    }
2624
2625
    /**
     * Fetches the content of a URL by running the crawler's cli/bootstrap.php through the
     * configured PHP binary instead of sending the request over the network.
     *
     * The shell command receives the frontend base path, the URL and the base64 encoded,
     * serialized request headers as arguments. The URL and the elapsed time are logged.
     *
     * @param  string $url          Page URL
     * @param  int    $crawlerId    Crawler-ID
     * @return array  empty if the URL could not be parsed, otherwise the keys
     *                "request", "headers" (always empty) and "content"
     */
    protected function sendDirectRequest($url, $crawlerId)
    {
        $urlParts = parse_url($url);
        if (!is_array($urlParts)) {
            return [];
        }

        $requestHeaders = $this->buildRequestHeaderArray($urlParts, $crawlerId);

        // Every part is escaped on its own before being joined with single spaces
        $commandParts = [
            escapeshellcmd($this->extensionSettings['phpPath']),
            escapeshellarg(ExtensionManagementUtility::extPath('crawler') . 'cli/bootstrap.php'),
            escapeshellarg($this->getFrontendBasePath()),
            escapeshellarg($url),
            escapeshellarg(base64_encode(serialize($requestHeaders)))
        ];
        $cmd = implode(' ', $commandParts);

        $startTime = microtime(true);
        $content = $this->executeShellCommand($cmd);
        $this->log($url . ' ' . (microtime(true) - $startTime));

        return [
            'request' => implode("\r\n", $requestHeaders) . "\r\n\r\n",
            'headers' => '',
            'content' => $content
        ];
    }
2663
2664
    /**
     * Cleans up entries that stayed for too long in the queue. These are:
     * - processed entries older than the "cleanUpProcessedAge" setting (in days)
     * - scheduled entries older than the "cleanUpScheduledAge" setting (in days)
     *
     * The matching entries are removed through flushQueue().
     *
     * @return void
     *
     * TODO: Should be switched back to protected - TNM 2018-11-16
     */
    public function cleanUpOldQueueEntries()
    {
        $secondsPerDay = 86400; // 24*60*60 Seconds in 24 hours
        $now = time();

        // The ages may be fractional days, which makes the thresholds floats. Before PHP 8
        // a float is converted to a string using the current locale, which can put a decimal
        // comma into the SQL. Converting to whole seconds avoids that: for integer columns
        // "x < t" equals "x < ceil(t)" and "x <= t" equals "x <= floor(t)".
        $processedBefore = (int) ceil($now - $this->extensionSettings['cleanUpProcessedAge'] * $secondsPerDay);
        $scheduledBefore = (int) floor($now - $this->extensionSettings['cleanUpScheduledAge'] * $secondsPerDay);

        $condition = '(exec_time<>0 AND exec_time<' . $processedBefore . ') OR scheduled<=' . $scheduledBefore;
        $this->flushQueue($condition);
    }
2682
2683
    /**
     * Sets up $GLOBALS['TSFE'] (and $GLOBALS['TT'] if it is not an object yet) so that
     * TypoScript and TypoLink functions can be used.
     *
     * The order of the calls below is kept exactly as it was; the frontend bootstrap is
     * presumably sensitive to it.
     *
     * @param int $id page id handed to the TypoScriptFrontendController and used for the rootline
     * @param int $typeNum type number handed to the TypoScriptFrontendController
     *
     * @return void
     */
    protected function initTSFE($id = 1, $typeNum = 0)
    {
        EidUtility::initTCA();

        if (!is_object($GLOBALS['TT'])) {
            $timeTracker = new NullTimeTracker();
            // Register globally before starting, as in the original call sequence
            $GLOBALS['TT'] = $timeTracker;
            $timeTracker->start();
        }

        $frontend = GeneralUtility::makeInstance(TypoScriptFrontendController::class, $GLOBALS['TYPO3_CONF_VARS'], $id, $typeNum);
        // Publish the instance first; the following calls run against the same object
        $GLOBALS['TSFE'] = $frontend;

        $pageRepository = GeneralUtility::makeInstance(PageRepository::class);
        $frontend->sys_page = $pageRepository;
        $frontend->sys_page->init(true);

        $frontend->connectToDB();
        $frontend->initFEuser();
        $frontend->determineId();
        $frontend->initTemplate();
        $frontend->rootLine = $frontend->sys_page->getRootLine($id, '');
        $frontend->getConfigArray();

        PageGenerator::pagegenInit();
    }
2710
2711
    /**
     * Builds an md5 fingerprint of a configuration array.
     *
     * The "paramExpanded" and "URLs" entries are dropped before hashing, so
     * two configurations that differ only in those keys yield the same hash.
     *
     * @param array $configuration
     *
     * @return string md5 hash of the serialized, reduced configuration
     */
    protected function getConfigurationHash(array $configuration)
    {
        // $configuration is a by-value copy, so the caller's array is left untouched.
        unset($configuration['paramExpanded'], $configuration['URLs']);

        $serializedConfiguration = serialize($configuration);

        return md5($serializedConfiguration);
    }
2723
2724
    /**
     * Decides whether crawling should use https (true) or http (false).
     *
     * The crawler configuration value takes precedence:
     * - -1: always http
     * -  0: follow the page configuration
     * -  1: always https
     * Any other value falls back to http.
     *
     * @param int $crawlerConfiguration Protocol switch from the crawler configuration (-1, 0 or 1)
     * @param mixed $pageConfiguration Page-level https flag, only evaluated when $crawlerConfiguration is 0
     *
     * @return bool
     */
    protected function isCrawlingProtocolHttps($crawlerConfiguration, $pageConfiguration)
    {
        // The switch compares loosely on purpose so that numeric strings coming
        // from stored configuration match as well; the case order is significant.
        switch ($crawlerConfiguration) {
            case -1:
                return false;
            case 0:
                // Normalise to a real boolean: the page flag may arrive as int or string.
                return (bool)$pageConfiguration;
            case 1:
                return true;
            default:
                return false;
        }
    }
2744
}
2745