Completed
Push — dependencies/composer ( 16ed26...3d6911 )
by Christian
06:09
created

CrawlerController::sendDirectRequest()   A

Complexity

Conditions 2
Paths 2

Size

Total Lines 31

Duplication

Lines 0
Ratio 0 %

Code Coverage

Tests 20
CRAP Score 2.0004

Importance

Changes 0
Metric Value
cc 2
nc 2
nop 2
dl 0
loc 31
rs 9.424
c 0
b 0
f 0
ccs 20
cts 21
cp 0.9524
crap 2.0004
1
<?php
2
namespace AOE\Crawler\Controller;
3
4
/***************************************************************
5
 *  Copyright notice
6
 *
7
 *  (c) 2017 AOE GmbH <[email protected]>
8
 *
9
 *  All rights reserved
10
 *
11
 *  This script is part of the TYPO3 project. The TYPO3 project is
12
 *  free software; you can redistribute it and/or modify
13
 *  it under the terms of the GNU General Public License as published by
14
 *  the Free Software Foundation; either version 3 of the License, or
15
 *  (at your option) any later version.
16
 *
17
 *  The GNU General Public License can be found at
18
 *  http://www.gnu.org/copyleft/gpl.html.
19
 *
20
 *  This script is distributed in the hope that it will be useful,
21
 *  but WITHOUT ANY WARRANTY; without even the implied warranty of
22
 *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
23
 *  GNU General Public License for more details.
24
 *
25
 *  This copyright notice MUST APPEAR in all copies of the script!
26
 ***************************************************************/
27
28
use AOE\Crawler\Command\CrawlerCommandLineController;
29
use AOE\Crawler\Command\FlushCommandLineController;
30
use AOE\Crawler\Command\QueueCommandLineController;
31
use AOE\Crawler\Domain\Model\Reason;
32
use AOE\Crawler\Domain\Repository\QueueRepository;
33
use AOE\Crawler\Event\EventDispatcher;
34
use AOE\Crawler\Utility\IconUtility;
35
use AOE\Crawler\Utility\SignalSlotUtility;
36
use TYPO3\CMS\Backend\Utility\BackendUtility;
37
use TYPO3\CMS\Backend\Tree\View\PageTreeView;
38
use TYPO3\CMS\Core\Authentication\BackendUserAuthentication;
39
use TYPO3\CMS\Core\Database\DatabaseConnection;
40
use TYPO3\CMS\Core\Log\LogLevel;
41
use TYPO3\CMS\Core\TimeTracker\NullTimeTracker;
42
use TYPO3\CMS\Core\TimeTracker\TimeTracker;
43
use TYPO3\CMS\Core\Utility\DebugUtility;
44
use TYPO3\CMS\Core\Utility\ExtensionManagementUtility;
45
use TYPO3\CMS\Core\Utility\GeneralUtility;
46
use TYPO3\CMS\Core\Utility\MathUtility;
47
use TYPO3\CMS\Core\Utility\VersionNumberUtility;
48
use TYPO3\CMS\Extbase\Object\ObjectManager;
49
use TYPO3\CMS\Frontend\Controller\TypoScriptFrontendController;
50
use TYPO3\CMS\Frontend\Page\PageGenerator;
51
use TYPO3\CMS\Frontend\Page\PageRepository;
52
use TYPO3\CMS\Frontend\Utility\EidUtility;
53
use TYPO3\CMS\Lang\LanguageService;
54
55
/**
56
 * Class CrawlerController
57
 *
58
 * @package AOE\Crawler\Controller
59
 */
60
class CrawlerController
61
{
62
    const CLI_STATUS_NOTHING_PROCCESSED = 0;
63
    const CLI_STATUS_REMAIN = 1; //queue not empty
64
    const CLI_STATUS_PROCESSED = 2; //(some) queue items where processed
65
    const CLI_STATUS_ABORTED = 4; //instance didn't finish
66
    const CLI_STATUS_POLLABLE_PROCESSED = 8;
67
68
    /**
69
     * @var integer
70
     */
71
    public $setID = 0;
72
73
    /**
74
     * @var string
75
     */
76
    public $processID = '';
77
78
    /**
79
     * One hour is max stalled time for the CLI
80
     * If the process had the status "start" for 3600 seconds, it will be regarded stalled and a new process is started
81
     *
82
     * @var integer
83
     */
84
    public $max_CLI_exec_time = 3600;
85
86
    /**
87
     * @var array
88
     */
89
    public $duplicateTrack = [];
90
91
    /**
92
     * @var array
93
     */
94
    public $downloadUrls = [];
95
96
    /**
97
     * @var array
98
     */
99
    public $incomingProcInstructions = [];
100
101
    /**
102
     * @var array
103
     */
104
    public $incomingConfigurationSelection = [];
105
106
    /**
107
     * @var bool
108
     */
109
    public $registerQueueEntriesInternallyOnly = false;
110
111
    /**
112
     * @var array
113
     */
114
    public $queueEntries = [];
115
116
    /**
117
     * @var array
118
     */
119
    public $urlList = [];
120
121
    /**
122
     * @var boolean
123
     */
124
    public $debugMode = false;
125
126
    /**
127
     * @var array
128
     */
129
    public $extensionSettings = [];
130
131
    /**
132
     * Mount Point
133
     *
134
     * @var boolean
135
     */
136
    public $MP = false;
137
138
    /**
139
     * @var string
140
     */
141
    protected $processFilename;
142
143
    /**
144
     * Holds the internal access mode can be 'gui','cli' or 'cli_im'
145
     *
146
     * @var string
147
     */
148
    protected $accessMode;
149
150
    /**
151
     * @var DatabaseConnection
152
     */
153
    private $db;
154
155
    /**
156
     * @var BackendUserAuthentication
157
     */
158
    private $backendUser;
159
160
    /**
161
     * @var integer
162
     */
163
    private $scheduledTime = 0;
164
165
    /**
166
     * @var integer
167
     */
168
    private $reqMinute = 0;
169
170
    /**
171
     * @var bool
172
     */
173
    private $submitCrawlUrls = false;
174
175
    /**
176
     * @var bool
177
     */
178
    private $downloadCrawlUrls = false;
179
180
    /**
181
     * @var QueueRepository
182
     */
183
    protected  $queueRepository;
184
185
    /**
186
     * Method to get the accessMode; can be gui, cli or cli_im
187
     *
188
     * @return string
189
     */
190 1
    public function getAccessMode()
    {
        // The property is documented as holding 'gui', 'cli' or 'cli_im'; it is returned as stored.
        $currentMode = $this->accessMode;

        return $currentMode;
    }
194
195
    /**
196
     * @param string $accessMode
197
     */
198 1
    public function setAccessMode($accessMode)
    {
        // Stored verbatim: no validation against the documented values 'gui', 'cli' and 'cli_im' happens here.
        $this->accessMode = $accessMode;
    }
202
203
    /**
204
     * Set disabled status to prevent processes from being processed
205
     *
206
     * @param  bool $disabled (optional, defaults to true)
207
     * @return void
208
     */
209 3
    public function setDisabled($disabled = true)
    {
        $markerFile = $this->processFilename;

        if ($disabled) {
            // An empty marker file represents the "disabled" state (getDisabled() checks for its existence).
            GeneralUtility::writeFile($markerFile, '');

            return;
        }

        // Enabling again: remove the marker file, but only if it is actually there.
        if (is_file($markerFile)) {
            unlink($markerFile);
        }
    }
219
220
    /**
221
     * Get disable status
222
     *
223
     * @return bool true if disabled
224
     */
225 3
    public function getDisabled()
    {
        // The disabled state is represented solely by the presence of the process marker file.
        return is_file($this->processFilename);
    }
233
234
    /**
235
     * @param string $filenameWithPath
236
     *
237
     * @return void
238
     */
239 4
    public function setProcessFilename($filenameWithPath)
    {
        // Path of the marker file that setDisabled() writes/removes and getDisabled() checks.
        $this->processFilename = $filenameWithPath;
    }
243
244
    /**
245
     * @return string
246
     */
247 1
    public function getProcessFilename()
    {
        // Path of the marker file used to flag the crawler as disabled.
        $markerFile = $this->processFilename;

        return $markerFile;
    }
251
252
    /************************************
253
     *
254
     * Getting URLs based on Page TSconfig
255
     *
256
     ************************************/
257
258 28
    /**
     * Sets up the queue repository, the database connection and backend user taken
     * from $GLOBALS, the default process marker file, and the extension settings
     * (including defaults for "countInARun" and "processLimit").
     */
    public function __construct()
    {
        $this->queueRepository = GeneralUtility::makeInstance(ObjectManager::class)->get(QueueRepository::class);

        $this->db = $GLOBALS['TYPO3_DB'];
        $this->backendUser = $GLOBALS['BE_USER'];
        $this->processFilename = PATH_site . 'typo3temp/tx_crawler.proc';

        // The extension configuration is stored serialized; anything that does not
        // unserialize to an array is treated as "no settings".
        $extConf = unserialize($GLOBALS['TYPO3_CONF_VARS']['EXT']['extConf']['crawler']);
        if (!is_array($extConf)) {
            $extConf = [];
        }

        // read ext_em_conf_template settings and set
        $this->setExtensionSettings($extConf);

        // set defaults:
        $countInARun = MathUtility::convertToPositiveInteger($this->extensionSettings['countInARun']);
        if ($countInARun == 0) {
            $this->extensionSettings['countInARun'] = 100;
        }

        // processLimit is clamped to 1..99, defaulting to 1.
        $this->extensionSettings['processLimit'] = MathUtility::forceIntegerInRange(
            $this->extensionSettings['processLimit'],
            1,
            99,
            1
        );
    }
280
281
    /**
282
     * Sets the extensions settings (unserialized pendant of $TYPO3_CONF_VARS['EXT']['extConf']['crawler']).
283
     *
284
     * @param array $extensionSettings
285
     * @return void
286
     */
287 37
    public function setExtensionSettings(array $extensionSettings)
    {
        // Replaces the whole settings array; nothing is merged with previously set values.
        $this->extensionSettings = $extensionSettings;
    }
291
292
    /**
293
     * Check if the given page should be crawled
294
     *
295
     * @param array $pageRow
296
     * @return false|string false if the page should be crawled (not excluded), true / skipMessage if it should be skipped
297
     */
298 10
    public function checkIfPageShouldBeSkipped(array $pageRow)
    {
        // Each check returns its skip message as soon as it matches; later checks
        // are only reached when all earlier ones let the page pass.

        // if page is hidden
        if (!$this->extensionSettings['crawlHiddenPages'] && $pageRow['hidden']) {
            return 'Because page is hidden';
        }

        // doktypes 3 and 4 as well as everything from 199 upwards are never crawled
        if (GeneralUtility::inList('3,4', $pageRow['doktype']) || $pageRow['doktype'] >= 199) {
            return 'Because doktype is not allowed';
        }

        // doktype lists registered by other extensions
        if (is_array($GLOBALS['TYPO3_CONF_VARS']['EXTCONF']['crawler']['excludeDoktype'])) {
            foreach ($GLOBALS['TYPO3_CONF_VARS']['EXTCONF']['crawler']['excludeDoktype'] as $key => $doktypeList) {
                if (GeneralUtility::inList($doktypeList, $pageRow['doktype'])) {
                    return 'Doktype was excluded by "' . $key . '"';
                }
            }
        }

        // veto hook
        if (is_array($GLOBALS['TYPO3_CONF_VARS']['EXTCONF']['crawler']['pageVeto'])) {
            foreach ($GLOBALS['TYPO3_CONF_VARS']['EXTCONF']['crawler']['pageVeto'] as $key => $func) {
                $params = [
                    'pageRow' => $pageRow
                ];
                // expects "false" if page is ok and "true" or a skipMessage if this page should _not_ be crawled
                $veto = GeneralUtility::callUserFunction($func, $params, $this);
                if ($veto === false) {
                    continue;
                }

                // the first veto wins, remaining hooks are not executed
                return is_string($veto) ? $veto : 'Veto from hook "' . htmlspecialchars($key) . '"';
            }
        }

        return false;
    }
355
356
    /**
357
     * Wrapper method for getUrlsForPageId()
358
     * It returns an array of configurations and no urls!
359
     *
360
     * @param array $pageRow Page record with at least dok-type and uid columns.
361
     * @param string $skipMessage
362
     * @return array
363
     * @see getUrlsForPageId()
364
     */
365 6
    public function getUrlsForPageRow(array $pageRow, &$skipMessage = '')
    {
        // false means "crawl this page"; anything else is the reason for skipping it.
        $message = $this->checkIfPageShouldBeSkipped($pageRow);

        if ($message !== false) {
            $skipMessage = $message;

            return [];
        }

        // url_scheme 2 forces https. The value is cast before comparing because page
        // rows may deliver the column as the string '2', which a strict comparison
        // against the integer 2 would never match. A missing column means "not forced".
        $forceSsl = isset($pageRow['url_scheme']) && (int)$pageRow['url_scheme'] === 2;

        $res = $this->getUrlsForPageId($pageRow['uid'], $forceSsl);
        $skipMessage = '';

        return $res;
    }
380
381
    /**
382
     * This method is used to count if there are ANY unprocessed queue entries
383
     * of a given page_id and the configuration which matches a given hash.
384
     * If there are none, we can skip an inner detail check
385
     *
386
     * @param  int $uid
387
     * @param  string $configurationHash
388
     * @return boolean
389
     */
390 7
    protected function noUnprocessedQueueEntriesForPageWithConfigurationHashExist($uid, $configurationHash)
    {
        $table = 'tx_crawler_queue';

        // Unprocessed entries are those with exec_time=0; the uid is forced to an
        // integer and the hash is quoted by the database layer.
        $where = 'page_id=' . intval($uid)
            . ' AND configuration_hash=' . $this->db->fullQuoteStr($configurationHash, $table)
            . ' AND exec_time=0';

        $result = $this->db->exec_SELECTquery('count(*) as anz', $table, $where);
        $countRow = $this->db->sql_fetch_assoc($result);

        return $countRow['anz'] == 0;
    }
398
399
    /**
400
     * Creates a list of URLs from input array (and submits them to queue if asked for)
401
     * See Web > Info module script + "indexed_search"'s crawler hook-client using this!
402
     *
403
     * @param    array        Information about URLs from pageRow to crawl.
404
     * @param    array        Page row
405
     * @param    integer        Unix time to schedule indexing to, typically time()
406
     * @param    integer        Number of requests per minute (creates the interleave between requests)
407
     * @param    boolean        If set, submits the URLs to queue
408
     * @param    boolean        If set (and submitcrawlUrls is false), will fill $downloadUrls with entries
409
     * @param    array        Array which is passed by reference and contains an id per url to ensure we will not crawl duplicates
410
     * @param    array        Array which will be filled with URLS for download if flag is set.
411
     * @param    array        Array of processing instructions
412
     * @return    string        List of URLs (meant for display in backend module)
413
     *
414
     */
415 4
    public function urlListFromUrlArray(
        array $vv,
        array $pageRow,
        $scheduledTime,
        $reqMinute,
        $submitCrawlUrls,
        $downloadCrawlUrls,
        array &$duplicateTrack,
        array &$downloadUrls,
        array $incomingProcInstructions
    ) {
        $urlList = '';

        // The realurl condition is evaluated once and $urlObj gets an explicit default.
        // Static analysis flagged $urlObj as "not defined for all execution paths",
        // because it was created and used under two separately evaluated conditions.
        $useRealUrl = ExtensionManagementUtility::isLoaded('realurl') && $vv['subCfg']['realurl'];
        $urlObj = null;

        // realurl support (thanks to Ingo Renner)
        if ($useRealUrl) {
            /** @var tx_realurl $urlObj */
            $urlObj = GeneralUtility::makeInstance('tx_realurl');

            if (!empty($vv['subCfg']['baseUrl'])) {
                $urlParts = parse_url($vv['subCfg']['baseUrl']);
                $host = strtolower($urlParts['host']);
                $urlObj->host = $host;

                // First pass, finding configuration OR pointer string:
                $urlObj->extConf = isset($GLOBALS['TYPO3_CONF_VARS']['EXTCONF']['realurl'][$urlObj->host]) ? $GLOBALS['TYPO3_CONF_VARS']['EXTCONF']['realurl'][$urlObj->host] : $GLOBALS['TYPO3_CONF_VARS']['EXTCONF']['realurl']['_DEFAULT'];

                // If it turned out to be a string pointer, then look up the real config:
                if (is_string($urlObj->extConf)) {
                    $urlObj->extConf = is_array($GLOBALS['TYPO3_CONF_VARS']['EXTCONF']['realurl'][$urlObj->extConf]) ? $GLOBALS['TYPO3_CONF_VARS']['EXTCONF']['realurl'][$urlObj->extConf] : $GLOBALS['TYPO3_CONF_VARS']['EXTCONF']['realurl']['_DEFAULT'];
                }
            }

            if (!$GLOBALS['TSFE']->sys_page) {
                $GLOBALS['TSFE']->sys_page = GeneralUtility::makeInstance('TYPO3\CMS\Frontend\Page\PageRepository');
            }

            if (!$GLOBALS['TSFE']->tmpl->rootLine[0]['uid']) {
                $GLOBALS['TSFE']->tmpl->rootLine[0]['uid'] = $urlObj->extConf['pagePath']['rootpage_id'];
            }
        }

        if (!is_array($vv['URLs'])) {
            return 'ERROR - no URL generated';
        }

        $configurationHash = $this->getConfigurationHash($vv);
        $skipInnerCheck = $this->noUnprocessedQueueEntriesForPageWithConfigurationHashExist($pageRow['uid'], $configurationHash);

        // Seconds between two scheduled requests. An empty/zero rate would divide by
        // zero, so it is treated as "no interleave" instead.
        $secondsPerRequest = empty($reqMinute) ? 0 : 60 / $reqMinute;

        foreach ($vv['URLs'] as $urlQuery) {
            if (!$this->drawURLs_PIfilter($vv['subCfg']['procInstrFilter'], $incomingProcInstructions)) {
                continue;
            }

            // Calculate cHash:
            if ($vv['subCfg']['cHash']) {
                /* @var $cacheHash \TYPO3\CMS\Frontend\Page\CacheHashCalculator */
                $cacheHash = GeneralUtility::makeInstance('TYPO3\CMS\Frontend\Page\CacheHashCalculator');
                $urlQuery .= '&cHash=' . $cacheHash->generateForParameters($urlQuery);
            }

            // Create key by which to determine unique-ness:
            $uKey = $urlQuery . '|' . $vv['subCfg']['userGroups'] . '|' . $vv['subCfg']['baseUrl'] . '|' . $vv['subCfg']['procInstrFilter'];

            // realurl support (thanks to Ingo Renner)
            $urlQuery = 'index.php' . $urlQuery;
            if ($useRealUrl && $urlObj !== null) {
                $params = [
                    'LD' => [
                        'totalURL' => $urlQuery
                    ],
                    'TCEmainHook' => true
                ];
                $urlObj->encodeSpURL($params);
                $urlQuery = $params['LD']['totalURL'];
            }

            // Scheduled time, spread by the number of URLs tracked so far and rounded down to a full minute:
            $schTime = $scheduledTime + round(count($duplicateTrack) * $secondsPerRequest);
            $schTime = floor($schTime / 60) * 60;

            // NOTE(review): $urlList is assigned (not appended) in both branches below,
            // so only the entry of the last URL is returned - confirm whether that is intended.
            if (isset($duplicateTrack[$uKey])) {
                // if the url key is registered just display it and do not resubmit it
                $urlList = '<em><span class="typo3-dimmed">' . htmlspecialchars($urlQuery) . '</span></em><br/>';
            } else {
                $urlList = '[' . date('d.m.y H:i', $schTime) . '] ' . htmlspecialchars($urlQuery);
                $this->urlList[] = '[' . date('d.m.y H:i', $schTime) . '] ' . $urlQuery;

                $theUrl = ($vv['subCfg']['baseUrl'] ? $vv['subCfg']['baseUrl'] : GeneralUtility::getIndpEnv('TYPO3_SITE_URL')) . $urlQuery;

                // Submit for crawling!
                if ($submitCrawlUrls) {
                    $added = $this->addUrl(
                        $pageRow['uid'],
                        $theUrl,
                        $vv['subCfg'],
                        $scheduledTime,
                        $configurationHash,
                        $skipInnerCheck
                    );
                    if ($added === false) {
                        $urlList .= ' (Url already existed)';
                    }
                } elseif ($downloadCrawlUrls) {
                    $downloadUrls[$theUrl] = $theUrl;
                }

                $urlList .= '<br />';
            }
            $duplicateTrack[$uKey] = true;
        }

        return $urlList;
    }
528
529
    /**
530
     * Returns true if input processing instruction is among registered ones.
531
     *
532
     * @param string $piString PI to test
533
     * @param array $incomingProcInstructions Processing instructions
534
     * @return boolean
535
     */
536 5
    public function drawURLs_PIfilter($piString, array $incomingProcInstructions)
    {
        // Without incoming processing instructions nothing is filtered out.
        if (empty($incomingProcInstructions)) {
            return true;
        }

        // $piString is a comma list; one matching incoming instruction is sufficient.
        foreach ($incomingProcInstructions as $pi) {
            if (GeneralUtility::inList($piString, $pi)) {
                return true;
            }
        }

        // Explicit boolean result: previously the method fell off the end here and
        // returned null although it is documented to return a boolean.
        return false;
    }
548
549 4
    /**
     * Returns the page TSconfig for the given page, or for the mount point page when
     * $this->MP is set. Registered "getPageTSconfigForId" hooks may alter the result.
     *
     * @param integer $id Page ID
     * @return array
     */
    public function getPageTSconfigForId($id)
    {
        if ($this->MP) {
            // MP is split on "-"; the second segment is used as the page id to read TSconfig from.
            list(, $mountPointId) = explode('-', $this->MP);
            $pageTSconfig = BackendUtility::getPagesTSconfig($mountPointId);
        } else {
            $pageTSconfig = BackendUtility::getPagesTSconfig($id);
        }

        if (!is_array($GLOBALS['TYPO3_CONF_VARS']['EXTCONF']['crawler']['getPageTSconfigForId'])) {
            return $pageTSconfig;
        }

        // Call a hook to alter configuration; the TSconfig is handed over by reference.
        $params = [
            'pageId' => $id,
            'pageTSConfig' => &$pageTSconfig
        ];
        foreach ($GLOBALS['TYPO3_CONF_VARS']['EXTCONF']['crawler']['getPageTSconfigForId'] as $userFunc) {
            GeneralUtility::callUserFunction($userFunc, $params, $this);
        }

        return $pageTSconfig;
    }
571
572
    /**
573
     * This methods returns an array of configurations.
574
     * And no urls!
575
     *
576
     * @param integer $id Page ID
577
     * @param bool $forceSsl Use https
578
     * @return array
579
     *
580
     * TODO: Should be switched back to protected - TNM 2018-11-16
581
     */
582 4
    public function getUrlsForPageId($id, $forceSsl = false)
    {
        $res = [];

        /**
         * Part 1: configuration from page TSconfig (tx_crawler.crawlerCfg.paramSets.)
         */
        $pageTSconfig = $this->getPageTSconfigForId($id);

        if (is_array($pageTSconfig) && is_array($pageTSconfig['tx_crawler.']['crawlerCfg.'])) {
            $crawlerCfg = $pageTSconfig['tx_crawler.']['crawlerCfg.'];

            if (is_array($crawlerCfg['paramSets.'])) {
                foreach ($crawlerCfg['paramSets.'] as $key => $values) {
                    // Only the array entries ("name.") carry a sub configuration.
                    if (!is_array($values)) {
                        continue;
                    }

                    $key = str_replace('.', '', $key);

                    // Sub configuration for a single configuration string:
                    $subCfg = (array)$crawlerCfg['paramSets.'][$key . '.'];
                    $subCfg['key'] = $key;

                    // Normalize the filter list (trim the individual entries).
                    if (strcmp($subCfg['procInstrFilter'], '')) {
                        $subCfg['procInstrFilter'] = implode(',', GeneralUtility::trimExplode(',', $subCfg['procInstrFilter']));
                    }
                    $pidOnlyList = implode(',', GeneralUtility::trimExplode(',', $subCfg['pidsOnly'], true));

                    // Skip page-specific configurations that do not list the current page.
                    if (strcmp($subCfg['pidsOnly'], '') && !GeneralUtility::inList($pidOnlyList, $id)) {
                        continue;
                    }

                    // add trailing slash if not present
                    if (!empty($subCfg['baseUrl']) && substr($subCfg['baseUrl'], -1) != '/') {
                        $subCfg['baseUrl'] .= '/';
                    }

                    // Explode, process etc.:
                    $paramParsed = $this->parseParams($crawlerCfg['paramSets.'][$key]);
                    $paramExpanded = $this->expandParameters($paramParsed, $id);

                    // recognize MP value
                    $baseQuery = $this->MP ? ('?id=' . $id . '&MP=' . $this->MP) : ('?id=' . $id);

                    $res[$key] = [
                        'subCfg' => $subCfg,
                        'paramParsed' => $paramParsed,
                        'paramExpanded' => $paramExpanded,
                        'origin' => 'pagets',
                        'URLs' => $this->compileUrls($paramExpanded, [$baseQuery]),
                    ];
                }
            }
        }

        /**
         * Part 2: configuration from tx_crawler_configuration records along the rootline
         */
        foreach (BackendUtility::BEgetRootLine($id) as $page) {
            // NOTE: static analysis reports getRecordsByField() as deprecated since TYPO3 v8
            // (to be removed in TYPO3 v9).
            $configurationRecordsForCurrentPage = BackendUtility::getRecordsByField(
                'tx_crawler_configuration',
                'pid',
                intval($page['uid']),
                BackendUtility::BEenableFields('tx_crawler_configuration') . BackendUtility::deleteClause('tx_crawler_configuration')
            );

            if (!is_array($configurationRecordsForCurrentPage)) {
                continue;
            }

            foreach ($configurationRecordsForCurrentPage as $configurationRecord) {
                // check access to the configuration record
                $mayUseRecord = empty($configurationRecord['begroups'])
                    || $GLOBALS['BE_USER']->isAdmin()
                    || $this->hasGroupAccess($GLOBALS['BE_USER']->user['usergroup_cached_list'], $configurationRecord['begroups']);
                if (!$mayUseRecord) {
                    continue;
                }

                $pidOnlyList = implode(',', GeneralUtility::trimExplode(',', $configurationRecord['pidsonly'], true));

                // Skip page-specific configurations that do not list the current page.
                if (strcmp($configurationRecord['pidsonly'], '') && !GeneralUtility::inList($pidOnlyList, $id)) {
                    continue;
                }

                $key = $configurationRecord['name'];

                // don't overwrite previously defined paramSets
                if (isset($res[$key])) {
                    continue;
                }

                /* @var $TSparserObject \TYPO3\CMS\Core\TypoScript\Parser\TypoScriptParser */
                $TSparserObject = GeneralUtility::makeInstance('TYPO3\CMS\Core\TypoScript\Parser\TypoScriptParser');
                $TSparserObject->parse($configurationRecord['processing_instruction_parameters_ts']);

                $isCrawlingProtocolHttps = $this->isCrawlingProtocolHttps($configurationRecord['force_ssl'], $forceSsl);

                $subCfg = [
                    'procInstrFilter' => $configurationRecord['processing_instruction_filter'],
                    'procInstrParams.' => $TSparserObject->setup,
                    'baseUrl' => $this->getBaseUrlForConfigurationRecord(
                        $configurationRecord['base_url'],
                        $configurationRecord['sys_domain_base_url'],
                        $isCrawlingProtocolHttps
                    ),
                    'realurl' => $configurationRecord['realurl'],
                    'cHash' => $configurationRecord['chash'],
                    'userGroups' => $configurationRecord['fegroups'],
                    'exclude' => $configurationRecord['exclude'],
                    'rootTemplatePid' => (int) $configurationRecord['root_template_pid'],
                    'key' => $key
                ];

                // add trailing slash if not present
                if (!empty($subCfg['baseUrl']) && substr($subCfg['baseUrl'], -1) != '/') {
                    $subCfg['baseUrl'] .= '/';
                }

                // The record is ignored when the current page is part of its exclude list.
                if (in_array($id, $this->expandExcludeString($subCfg['exclude']))) {
                    continue;
                }

                $paramParsed = $this->parseParams($configurationRecord['configuration']);
                $paramExpanded = $this->expandParameters($paramParsed, $id);

                $res[$key] = [
                    'subCfg' => $subCfg,
                    'paramParsed' => $paramParsed,
                    'paramExpanded' => $paramExpanded,
                    'URLs' => $this->compileUrls($paramExpanded, ['?id=' . $id]),
                    'origin' => 'tx_crawler_configuration_' . $configurationRecord['uid'],
                ];
            }
        }

        // Registered "processUrls" hooks get the result by reference and may modify it.
        if (is_array($GLOBALS['TYPO3_CONF_VARS']['EXTCONF']['crawler']['processUrls'])) {
            foreach ($GLOBALS['TYPO3_CONF_VARS']['EXTCONF']['crawler']['processUrls'] as $func) {
                $params = [
                    'res' => &$res,
                ];
                GeneralUtility::callUserFunction($func, $params, $this);
            }
        }

        return $res;
    }
718
719
    /**
     * Resolves the base URL to use for a crawler configuration record.
     *
     * If $sysDomainUid is greater than zero, the sys_domain record with that uid is looked up
     * (restricted by the backend enable-fields and delete clauses). When such a record is found
     * and its "domainName" is not empty, the returned URL is "<scheme>://<domainName>".
     * In every other case the given $baseUrl is returned unchanged.
     *
     * @param string $baseUrl Fallback base URL
     * @param integer $sysDomainUid UID of the sys_domain record; values <= 0 skip the lookup
     * @param bool $ssl Strictly FALSE selects "http", every other value selects "https"
     * @return string
     */
    protected function getBaseUrlForConfigurationRecord($baseUrl, $sysDomainUid, $ssl = false)
    {
        $sysDomainUid = intval($sysDomainUid);
        if ($sysDomainUid <= 0) {
            return $baseUrl;
        }

        $res = $this->db->exec_SELECTquery(
            '*',
            'sys_domain',
            'uid = ' . $sysDomainUid .
            BackendUtility::BEenableFields('sys_domain') .
            BackendUtility::deleteClause('sys_domain')
        );
        $row = $this->db->sql_fetch_assoc($res);
        // Only the first row is of interest, so the result set can be released right away.
        $this->db->sql_free_result($res);

        // Without a fetched row there is nothing to read "domainName" from; fall back to $baseUrl.
        if (!is_array($row) || $row['domainName'] == '') {
            return $baseUrl;
        }

        $urlScheme = ($ssl === false) ? 'http' : 'https';

        return $urlScheme . '://' . $row['domainName'];
    }
747
748
    /**
     * Collects the names of the crawler configurations available for a page branch.
     *
     * Two sources are combined:
     * - the keys of "tx_crawler.crawlerCfg.paramSets." in the page TSconfig of $rootid
     *   (only entries whose value is an array; one trailing dot is stripped from the key)
     * - the "name" field of tx_crawler_configuration records stored on a page of the
     *   rootline of $rootid or on a page of the tree fetched below $rootid
     *
     * @param integer $rootid Page ID of the branch root
     * @param integer $depth Depth handed to PageTreeView::getTree()
     * @return array List of configuration names
     */
    public function getConfigurationsForBranch($rootid, $depth)
    {
        $configurationsForBranch = [];

        // Configurations defined in page TSconfig
        $pageTSconfig = $this->getPageTSconfigForId($rootid);
        if (is_array($pageTSconfig)
            && isset($pageTSconfig['tx_crawler.']['crawlerCfg.']['paramSets.'])
            && is_array($pageTSconfig['tx_crawler.']['crawlerCfg.']['paramSets.'])
        ) {
            foreach ($pageTSconfig['tx_crawler.']['crawlerCfg.']['paramSets.'] as $key => $value) {
                if (!is_array($value)) {
                    continue;
                }
                $configurationsForBranch[] = substr($key, -1) == '.' ? substr($key, 0, -1) : $key;
            }
        }

        // Page ids of the rootline ...
        $pids = [];
        foreach (BackendUtility::BEgetRootLine($rootid) as $node) {
            $pids[] = (int)$node['uid'];
        }

        // ... and of the pages below the root that the backend user may see
        /* @var PageTreeView $tree */
        $tree = GeneralUtility::makeInstance(PageTreeView::class);
        $perms_clause = $GLOBALS['BE_USER']->getPagePermsClause(1);
        $tree->init('AND ' . $perms_clause);
        $tree->getTree($rootid, $depth, '');
        foreach ($tree->tree as $node) {
            $pids[] = (int)$node['row']['uid'];
        }

        // "pid IN ()" is not valid SQL, so skip the record lookup when no page was collected.
        $pids = array_unique($pids);
        if (empty($pids)) {
            return $configurationsForBranch;
        }

        // Configurations defined as records
        $res = $this->db->exec_SELECTquery(
            '*',
            'tx_crawler_configuration',
            'pid IN (' . implode(',', $pids) . ') ' .
            BackendUtility::BEenableFields('tx_crawler_configuration') .
            BackendUtility::deleteClause('tx_crawler_configuration') . ' ' .
            BackendUtility::versioningPlaceholderClause('tx_crawler_configuration') . ' '
        );

        while ($row = $this->db->sql_fetch_assoc($res)) {
            $configurationsForBranch[] = $row['name'];
        }
        $this->db->sql_free_result($res);

        return $configurationsForBranch;
    }
793
794
    /**
     * Decides whether a user may access an item, based on (fe_)group UIDs
     * (e.g. get the group list of the current logged in user from $GLOBALS['TSFE']->gr_list)
     *
     * @see \TYPO3\CMS\Frontend\Page\PageRepository::getMultipleGroupsWhereClause()
     * @param  string $groupList    Comma-separated list of (fe_)group UIDs the user belongs to
     * @param  string $accessList   Comma-separated list of (fe_)group UIDs that may access the item
     * @return bool                 TRUE if the access list is empty or contains at least one of the user's group UIDs
     */
    public function hasGroupAccess($groupList, $accessList)
    {
        // An empty access list means the item is not restricted at all.
        $accessGranted = empty($accessList);

        if (!$accessGranted) {
            $userGroupUids = GeneralUtility::intExplode(',', $groupList);
            foreach ($userGroupUids as $userGroupUid) {
                if (GeneralUtility::inList($accessList, $userGroupUid)) {
                    $accessGranted = true;
                    break;
                }
            }
        }

        return $accessGranted;
    }
815
816
    /**
     * Parses the GET vars of a query string into an array of key => value pairs.
     *
     * The string is split at "&"; each part is split at the first "=". Names and values are
     * raw-URL-decoded. Parts with an empty name are skipped, parts without "=" get an empty
     * string as value. If a name occurs more than once, the last occurrence wins.
     *
     * @param string $inputQuery Input query string
     * @return array
     */
    public function parseParams($inputQuery)
    {
        $paramKeyValues = [];

        foreach (explode('&', (string)$inputQuery) as $paramAndValue) {
            // Pad to two elements so that a part without "=" does not leave the value undefined
            list($name, $value) = array_pad(explode('=', $paramAndValue, 2), 2, '');
            if (strlen($name)) {
                $paramKeyValues[rawurldecode($name)] = rawurldecode($value);
            }
        }

        return $paramKeyValues;
    }
837
838
    /**
     * Expands the parameter configuration to individual values. This follows a certain syntax of the value of each parameter.
     * Syntax of values:
     * - Basically: If the value is wrapped in [...] it will be expanded according to the following syntax, otherwise the value is taken literally
     * - Configuration is split by "|" and the parts are processed individually and finally added together
     * - For each configuration part:
     *         - "[int]-[int]" = Integer range, will be expanded to all values in between, values included, starting from low to high (max. 1000). Example "1-34" or "-40--30"
     *         - "_TABLE:[TCA table name];[_PID:[optional page id, default is current page]];[_ENABLELANG:1]" = Look up of table records from PID, filtering out deleted records. Example "_TABLE:tt_content; _PID:123"
     *           _ENABLELANG:1 picks only original records without their language overlays.
     *           Further options read by the code: _PIDFIELD (default "pid"), _FIELD (default "uid"), _WHERE and _ADDTABLE
     *         - Default: Literal value
     * - After each part the hook $GLOBALS['TYPO3_CONF_VARS']['SC_OPTIONS']['crawler/class.tx_crawler_lib.php']['expandParameters'] is called
     *
     * @param array $paramArray Array with key (GET var name) and values (value of GET var which is configuration for expansion)
     * @param integer $pid Current page ID
     * @return array Array with key (GET var name) and, for each key, the array of expanded values
     */
    public function expandParameters($paramArray, $pid)
    {
        // Traverse parameter names:
        foreach ($paramArray as $p => $v) {
            $v = trim($v);

            // Only a value encapsulated in square brackets is expanded, everything else is literal
            if (substr($v, 0, 1) !== '[' || substr($v, -1) !== ']') {
                // Set the literal value as only value in array:
                $paramArray[$p] = [$v];
                continue;
            }

            // Find the value inside the brackets and reset the paramArray value as an array.
            $v = substr($v, 1, -1);
            $paramArray[$p] = [];

            // Explode parts and traverse them:
            foreach (explode('|', $v) as $pV) {
                // Look for integer range: (fx. 1-34 or -40--30 // reads minus 40 to minus 30)
                if (preg_match('/^(-?[0-9]+)\s*-\s*(-?[0-9]+)$/', trim($pV), $reg)) {
                    // Swap if first is larger than last:
                    if ($reg[1] > $reg[2]) {
                        $temp = $reg[2];
                        $reg[2] = $reg[1];
                        $reg[1] = $temp;
                    }

                    // Traverse range, add values. The counter deliberately starts with the
                    // matched string, so the first value keeps its original notation.
                    $runAwayBrake = 1000; // Limit to size of range!
                    for ($a = $reg[1]; $a <= $reg[2]; $a++) {
                        $paramArray[$p][] = $a;
                        $runAwayBrake--;
                        if ($runAwayBrake <= 0) {
                            break;
                        }
                    }
                } elseif (substr(trim($pV), 0, 7) == '_TABLE:') {
                    // Parse parameters ("KEY:value" pairs separated by ";"):
                    $subpartParams = [];
                    foreach (GeneralUtility::trimExplode(';', $pV) as $spV) {
                        // Pad with NULL so that an option without ":" counts as "not set" below
                        list($pKey, $pVal) = array_pad(GeneralUtility::trimExplode(':', $spV), 2, null);
                        $subpartParams[$pKey] = $pVal;
                    }
                    $table = $subpartParams['_TABLE'];

                    // Table exists:
                    if (isset($GLOBALS['TCA'][$table])) {
                        $tableConfiguration = $GLOBALS['TCA'][$table];

                        $lookUpPid = isset($subpartParams['_PID']) ? intval($subpartParams['_PID']) : $pid;
                        $pidField = isset($subpartParams['_PIDFIELD']) ? trim($subpartParams['_PIDFIELD']) : 'pid';
                        $where = isset($subpartParams['_WHERE']) ? $subpartParams['_WHERE'] : '';
                        $addTable = isset($subpartParams['_ADDTABLE']) ? $subpartParams['_ADDTABLE'] : '';

                        $fieldName = !empty($subpartParams['_FIELD']) ? $subpartParams['_FIELD'] : 'uid';
                        if ($fieldName === 'uid' || !empty($tableConfiguration['columns'][$fieldName])) {
                            $andWhereLanguage = '';
                            $transOrigPointerField = !empty($tableConfiguration['ctrl']['transOrigPointerField'])
                                ? $tableConfiguration['ctrl']['transOrigPointerField']
                                : '';

                            if (!empty($subpartParams['_ENABLELANG']) && $transOrigPointerField) {
                                $andWhereLanguage = ' AND ' . $this->db->quoteStr($transOrigPointerField, $table) . ' <= 0 ';
                            }

                            $where = $this->db->quoteStr($pidField, $table) . '=' . intval($lookUpPid) . ' ' .
                                $andWhereLanguage . $where;

                            // The rows are indexed by $fieldName, so the array keys are the values looked for
                            $rows = $this->db->exec_SELECTgetRows(
                                $fieldName,
                                $table . $addTable,
                                $where . BackendUtility::deleteClause($table),
                                '',
                                '',
                                '',
                                $fieldName
                            );

                            if (is_array($rows)) {
                                $paramArray[$p] = array_merge($paramArray[$p], array_keys($rows));
                            }
                        }
                    }
                } else { // Just add value:
                    $paramArray[$p][] = $pV;
                }

                // Hook for processing own expandParameters place holder
                if (isset($GLOBALS['TYPO3_CONF_VARS']['SC_OPTIONS']['crawler/class.tx_crawler_lib.php']['expandParameters'])
                    && is_array($GLOBALS['TYPO3_CONF_VARS']['SC_OPTIONS']['crawler/class.tx_crawler_lib.php']['expandParameters'])
                ) {
                    $_params = [
                        'pObj' => &$this,
                        'paramArray' => &$paramArray,
                        'currentKey' => $p,
                        'currentValue' => $pV,
                        'pid' => $pid
                    ];
                    foreach ($GLOBALS['TYPO3_CONF_VARS']['SC_OPTIONS']['crawler/class.tx_crawler_lib.php']['expandParameters'] as $_funcRef) {
                        GeneralUtility::callUserFunction($_funcRef, $_params, $this);
                    }
                }
            }

            // Make unique set of values and sort array by key:
            $paramArray[$p] = array_unique($paramArray[$p]);
            ksort($paramArray);
        }

        return $paramArray;
    }
963
964
    /**
     * Compiles URLs from a parameter array (output of expandParameters()).
     *
     * Every URL in $urls is combined with every value of the first parameter, then the method
     * recurses with the remaining parameters. An empty value adds nothing to the URL. Without
     * a limit, the number of URLs is the product of the number of values of all parameters;
     * each recursion level stops as soon as the extension setting "maxCompileUrls"
     * (forced into the range 1 - 1000000000, default 10000) is reached.
     *
     * @param array $paramArray Output of expandParameters(): Array with keys (GET var names) and for each an array of values
     * @param array $urls URLs accumulated in this array (for recursion)
     * @return array
     */
    public function compileUrls($paramArray, $urls = [])
    {
        if (!count($paramArray) || !is_array($urls)) {
            return $urls;
        }

        // The limit does not change while looping, so determine it once.
        $maxUrls = MathUtility::forceIntegerInRange($this->extensionSettings['maxCompileUrls'], 1, 1000000000, 10000);

        // shift first off stack:
        reset($paramArray);
        $varName = key($paramArray);
        $valueSet = array_shift($paramArray);

        // Traverse value set:
        $newUrls = [];
        foreach ($urls as $url) {
            foreach ($valueSet as $val) {
                $newUrls[] = $url . (strcmp($val, '') ? '&' . rawurlencode($varName) . '=' . rawurlencode($val) : '');

                // Leave BOTH loops; stopping only the inner one would keep adding a URL per base URL.
                if (count($newUrls) >= $maxUrls) {
                    break 2;
                }
            }
        }

        return $this->compileUrls($paramArray, $newUrls);
    }
997
998
    /************************************
999
     *
1000
     * Crawler log
1001
     *
1002
     ************************************/
1003
1004
    /**
     * Returns the records of the crawler queue for a page ID - or deletes them.
     *
     * @param integer $id Page ID for which to look up log entries.
     * @param string $filter Filter: "pending" => all that is not yet run, "finished" => all complete ones, anything else => all entries
     * @param boolean $doFlush If TRUE, the matching entries are DELETED(!) instead of selected and an empty array is returned
     * @param boolean $doFullFlush Only used with $doFlush: if TRUE, the deletion is not restricted to the page ID (the filter still applies)
     * @param integer $itemsPerPage Limit of the number of selected entries, default is 10; values <= 0 disable the limit
     * @return array
     */
    public function getLogEntriesForPageId($id, $filter = '', $doFlush = false, $doFullFlush = false, $itemsPerPage = 10)
    {
        if ($filter == 'pending') {
            $addWhere = ' AND exec_time=0';
        } elseif ($filter == 'finished') {
            $addWhere = ' AND exec_time>0';
        } else {
            $addWhere = '';
        }

        // FIXME: Write unit test that ensures that the right records are deleted.
        if ($doFlush) {
            $flushScope = $doFullFlush ? '1=1' : ('page_id=' . intval($id));
            $this->flushQueue($flushScope . $addWhere);

            return [];
        }

        $limit = intval($itemsPerPage) > 0 ? intval($itemsPerPage) : '';

        return $this->db->exec_SELECTgetRows(
            '*',
            'tx_crawler_queue',
            'page_id=' . intval($id) . $addWhere,
            '',
            'scheduled DESC',
            $limit
        );
    }
1043
1044
    /**
     * Returns the records of the crawler queue for a set ID - or deletes them.
     *
     * @param integer $set_id Set ID for which to look up log entries.
     * @param string $filter Filter: "pending" => all that is not yet run, "finished" => all complete ones, anything else => all entries
     * @param boolean $doFlush If TRUE, the matching entries are DELETED(!) instead of selected and an empty array is returned
     * @param boolean $doFullFlush Only used with $doFlush: if TRUE, flushQueue() is called without any condition (the filter is not applied either)
     * @param integer $itemsPerPage Limit of the number of selected entries, default is 10; values <= 0 disable the limit
     * @return array
     */
    public function getLogEntriesForSetId($set_id, $filter = '', $doFlush = false, $doFullFlush = false, $itemsPerPage = 10)
    {
        // FIXME: Write Unit tests for Filters
        if ($filter == 'pending') {
            $addWhere = ' AND exec_time=0';
        } elseif ($filter == 'finished') {
            $addWhere = ' AND exec_time>0';
        } else {
            $addWhere = '';
        }

        // FIXME: Write unit test that ensures that the right records are deleted.
        if ($doFlush) {
            $flushWhere = $doFullFlush ? '' : ('set_id=' . intval($set_id) . $addWhere);
            $this->flushQueue($flushWhere);

            return [];
        }

        $limit = intval($itemsPerPage) > 0 ? intval($itemsPerPage) : '';

        return $this->db->exec_SELECTgetRows(
            '*',
            'tx_crawler_queue',
            'set_id=' . intval($set_id) . $addWhere,
            '',
            'scheduled DESC',
            $limit
        );
    }
1082
1083
    /**
     * Removes queue entries.
     *
     * Before the entries are deleted, the "queueEntryFlush" event and/or the
     * SIGNAL_QUEUE_ENTRY_FLUSH signal are dispatched once per affected set_id - but only if an
     * observer / a slot is registered for them. Both receive the rows that are about to be deleted.
     *
     * @param string $where SQL related filter for the entries which should be removed; an empty string removes all entries
     * @return void
     */
    protected function flushQueue($where = '')
    {
        $realWhere = strlen($where) > 0 ? $where : '1=1';

        $hasObserver = EventDispatcher::getInstance()->hasObserver('queueEntryFlush');
        $hasSignal = SignalSlotUtility::hasSignal(__CLASS__, SignalSlotUtility::SIGNAL_QUEUE_ENTRY_FLUSH);

        if ($hasObserver || $hasSignal) {
            $groups = $this->db->exec_SELECTgetRows('DISTINCT set_id', 'tx_crawler_queue', $realWhere);
            if (is_array($groups)) {
                foreach ($groups as $group) {
                    // NOTE(review): the other queries of this class address queue rows by "qid";
                    // confirm that tx_crawler_queue really has a "uid" column.
                    $signalInputArray = $this->db->exec_SELECTgetRows(
                        'uid, set_id',
                        'tx_crawler_queue',
                        '(' . $realWhere . ') AND set_id=' . intval($group['set_id'])
                    );
                    // The lookup may not deliver an array; the listeners are handed an array in any case.
                    if (!is_array($signalInputArray)) {
                        $signalInputArray = [];
                    }

                    // The event dispatcher is deprecated since crawler v6.4.0, will be removed in crawler v7.0.0.
                    // Please use the Signal instead.
                    if ($hasObserver) {
                        EventDispatcher::getInstance()->post(
                            'queueEntryFlush',
                            $group['set_id'],
                            $signalInputArray
                        );
                    }

                    if ($hasSignal) {
                        SignalSlotUtility::emitSignal(
                            __CLASS__,
                            SignalSlotUtility::SIGNAL_QUEUE_ENTRY_FLUSH,
                            $signalInputArray
                        );
                    }
                }
            }
        }

        $GLOBALS['TYPO3_DB']->exec_DELETEquery('tx_crawler_queue', $realWhere);
    }
1122
1123
    /**
     * Adds a call back entry to the queue (called from hooks typically, see indexed search class "class.crawler.php")
     *
     * @param integer $setId Set ID
     * @param array $params Parameters to pass to call back function; anything but an array is replaced by an empty array
     * @param string $callBack Call back object reference, eg. 'EXT:indexed_search/class.crawler.php:&tx_indexedsearch_crawler'
     * @param integer $page_id Page ID to attach it to
     * @param integer $schedule Time at which to activate; 0 means the current time
     * @return void
     */
    public function addQueueEntry_callBack($setId, $params, $callBack, $page_id = 0, $schedule = 0)
    {
        $callBackParameters = is_array($params) ? $params : [];
        $callBackParameters['_CALLBACKOBJ'] = $callBack;

        $scheduledAt = intval($schedule);
        if (!$scheduledAt) {
            $scheduledAt = $this->getCurrentTime();
        }

        // Insert the compiled values:
        $this->db->exec_INSERTquery(
            'tx_crawler_queue',
            [
                'page_id' => intval($page_id),
                'parameters' => serialize($callBackParameters),
                'scheduled' => $scheduledAt,
                'exec_time' => 0,
                'set_id' => intval($setId),
                'result_data' => '',
            ]
        );
    }
1152
1153
    /************************************
1154
     *
1155
     * URL setting
1156
     *
1157
     ************************************/
1158
1159
    /**
     * Puts a URL into the crawler queue.
     *
     * Depending on the state of the object, the entry is either only registered internally
     * (in $this->queueEntries) or inserted into tx_crawler_queue - the latter only if no
     * duplicate entry is found. Event and signal are dispatched for added URLs as well as
     * for detected duplicates.
     *
     * @param integer $id Page ID
     * @param string $url Complete URL
     * @param array $subCfg Sub configuration array (from TS config)
     * @param integer $tstamp Scheduled-time
     * @param string $configurationHash (optional) configuration hash
     * @param bool $skipInnerDuplicationCheck (optional) skip inner duplication check
     * @return bool TRUE only if a new entry was inserted into the database
     */
    public function addUrl(
        $id,
        $url,
        array $subCfg,
        $tstamp,
        $configurationHash = '',
        $skipInnerDuplicationCheck = false
    ) {
        // Creating parameters (the order of the keys matters, the serialized array gets hashed):
        $parameters = ['url' => $url];

        // fe user group simulation:
        $feUserGroups = array_unique(GeneralUtility::intExplode(',', $subCfg['userGroups'], true));
        $uGs = implode(',', $feUserGroups);
        if ($uGs) {
            $parameters['feUserGroupList'] = $uGs;
        }

        // Setting processing instructions
        $parameters['procInstructions'] = GeneralUtility::trimExplode(',', $subCfg['procInstrFilter']);
        if (is_array($subCfg['procInstrParams.'])) {
            $parameters['procInstrParams'] = $subCfg['procInstrParams.'];
        }

        // Possible TypoScript Template Parents
        $parameters['rootTemplatePid'] = $subCfg['rootTemplatePid'];

        // Compile value array:
        $parameters_serialized = serialize($parameters);
        $fieldArray = [
            'page_id' => intval($id),
            'parameters' => $parameters_serialized,
            'parameters_hash' => GeneralUtility::shortMD5($parameters_serialized),
            'configuration_hash' => $configurationHash,
            'scheduled' => $tstamp,
            'exec_time' => 0,
            'set_id' => intval($this->setID),
            'result_data' => '',
            'configuration' => $subCfg['key'],
        ];

        if ($this->registerQueueEntriesInternallyOnly) {
            // the entries will only be registered and not stored to the database
            $this->queueEntries[] = $fieldArray;

            return false;
        }

        // check if there is already an equal entry
        $duplicates = [];
        if (!$skipInnerDuplicationCheck) {
            $duplicates = $this->getDuplicateRowsIfExist($tstamp, $fieldArray);
        }

        if (count($duplicates) != 0) {
            $eventPayload = ['rows' => $duplicates, 'fieldArray' => $fieldArray];
            $signalPayload = $eventPayload;

            // The event dispatcher is deprecated since crawler v6.4.0, will be removed in crawler v7.0.0.
            // Please use the Signal instead.
            EventDispatcher::getInstance()->post('duplicateUrlInQueue', $this->setID, $eventPayload);

            SignalSlotUtility::emitSignal(
                __CLASS__,
                SignalSlotUtility::SIGNAL_DUPLICATE_URL_IN_QUEUE,
                $signalPayload
            );

            return false;
        }

        $this->db->exec_INSERTquery('tx_crawler_queue', $fieldArray);
        $uid = $this->db->sql_insert_id();

        $eventPayload = ['uid' => $uid, 'fieldArray' => $fieldArray];
        $signalPayload = $eventPayload;

        // The event dispatcher is deprecated since crawler v6.4.0, will be removed in crawler v7.0.0.
        // Please use the Signal instead.
        EventDispatcher::getInstance()->post('urlAddedToQueue', $this->setID, $eventPayload);

        SignalSlotUtility::emitSignal(
            __CLASS__,
            SignalSlotUtility::SIGNAL_URL_ADDED_TO_QUEUE,
            $signalPayload
        );

        return true;
    }
1257
1258
    /**
     * Determines the duplicates of a queue entry, i.e. not yet executed and not yet assigned
     * entries of the same page with the same parameters hash.
     *
     * If the timestamp is not in the future, every such entry scheduled up to now counts as
     * duplicate (with the extension setting "enableTimeslot" additionally every entry scheduled
     * within +/- 100 seconds around now).
     * If the timestamp is in the future, only entries with exactly the same schedule time count.
     *
     * @param int $tstamp Schedule time of the entry to check
     * @param array $fieldArray Queue field array; "page_id" and "parameters_hash" are read
     *
     * @return array List of the qid values of the duplicate entries
     */
    protected function getDuplicateRowsIfExist($tstamp, $fieldArray)
    {
        // The timestamp becomes part of the SQL statement, so make sure it is an integer.
        $tstamp = intval($tstamp);
        $currentTime = $this->getCurrentTime();

        if ($tstamp > $currentTime) {
            // entry with a timestamp in the future need to have the same schedule time
            $where = 'scheduled = ' . $tstamp;
        } elseif ($this->extensionSettings['enableTimeslot']) {
            // this entry is scheduled with "now": accept a time slot around the current time as well
            $timeBegin = $currentTime - 100;
            $timeEnd = $currentTime + 100;
            $where = ' ((scheduled BETWEEN ' . $timeBegin . ' AND ' . $timeEnd . ' ) OR scheduled <= ' . $currentTime . ') ';
        } else {
            // this entry is scheduled with "now"
            $where = 'scheduled <= ' . $currentTime;
        }

        $result = $this->db->exec_SELECTgetRows(
            'qid',
            'tx_crawler_queue',
            $where .
            ' AND NOT exec_time' .
            ' AND NOT process_id ' .
            ' AND page_id=' . intval($fieldArray['page_id']) .
            ' AND parameters_hash = ' . $this->db->fullQuoteStr($fieldArray['parameters_hash'], 'tx_crawler_queue')
        );

        $rows = [];
        if (is_array($result)) {
            foreach ($result as $value) {
                $rows[] = $value['qid'];
            }
        }

        return $rows;
    }
1308
1309
    /**
     * Delivers the current system time as Unix timestamp.
     *
     * @return int Value of time()
     */
    public function getCurrentTime()
    {
        $now = time();

        return $now;
    }
1318
1319
    /************************************
1320
     *
1321
     * URL reading
1322
     *
1323
     ************************************/
1324
1325
    /**
     * Reads the URL of a single queue entry.
     *
     * The entry is locked by setting its exec_time, then processed with readUrl_exec() and
     * finally the serialized result is stored in its result_data field. The signals
     * SIGNAL_QUEUEITEM_PREPROCESS and SIGNAL_QUEUEITEM_POSTPROCESS are emitted before the
     * entry is locked and before the result is stored.
     *
     * @param integer $queueId
     * @param boolean $force If set, will process even if exec_time has been set!
     * @return integer 0, or self::CLI_STATUS_POLLABLE_PROCESSED if a registered "pollSuccess" instruction reported success.
     *                 0 is also returned when no matching queue entry was found.
     */
    public function readUrl($queueId, $force = false)
    {
        $ret = 0;
        if ($this->debugMode) {
            GeneralUtility::devlog('crawler-readurl start ' . microtime(true), __FUNCTION__);
        }

        // Get entry:
        $queueRows = $this->db->exec_SELECTgetRows(
            '*',
            'tx_crawler_queue',
            'qid=' . intval($queueId) . ($force ? '' : ' AND exec_time=0 AND process_scheduled > 0')
        );
        $queueRec = is_array($queueRows) ? reset($queueRows) : false;

        if (!is_array($queueRec)) {
            // Nothing to process; keep the documented integer return type.
            return $ret;
        }

        $parameters = unserialize($queueRec['parameters']);
        if (is_array($parameters) && !empty($parameters['rootTemplatePid'])) {
            $this->initTSFE((int)$parameters['rootTemplatePid']);
        } else {
            GeneralUtility::sysLog(
                'Page with (' . $queueRec['page_id'] . ') could not be crawled, please check your crawler configuration. Perhaps no Root Template Pid is set',
                'crawler',
                GeneralUtility::SYSLOG_SEVERITY_WARNING
            );
        }

        $signalPayload = [$queueId, $queueRec];
        SignalSlotUtility::emitSignal(
            __CLASS__,
            SignalSlotUtility::SIGNAL_QUEUEITEM_PREPROCESS,
            $signalPayload
        );

        // Set exec_time to lock record:
        $field_array = ['exec_time' => $this->getCurrentTime()];

        if (isset($this->processID)) {
            // if multiprocessing is used we need to store the id of the process which has handled this entry
            $field_array['process_id_completed'] = $this->processID;
        }
        $this->db->exec_UPDATEquery('tx_crawler_queue', 'qid=' . intval($queueId), $field_array);

        $result = $this->readUrl_exec($queueRec);
        $resultData = unserialize($result['content']);

        // atm there's no need to point to specific pollable extensions
        if (isset($GLOBALS['TYPO3_CONF_VARS']['EXTCONF']['crawler']['pollSuccess'])
            && is_array($GLOBALS['TYPO3_CONF_VARS']['EXTCONF']['crawler']['pollSuccess'])
            && is_array($resultData)
            && isset($resultData['parameters']['procInstructions'])
            && is_array($resultData['parameters']['procInstructions'])
        ) {
            foreach ($GLOBALS['TYPO3_CONF_VARS']['EXTCONF']['crawler']['pollSuccess'] as $pollable) {
                // only check the success value if the instruction is running
                // it is important to name the pollSuccess key same as the procInstructions key
                if (in_array($pollable, $resultData['parameters']['procInstructions'])
                    && !empty($resultData['success'][$pollable])
                ) {
                    $ret |= self::CLI_STATUS_POLLABLE_PROCESSED;
                }
            }
        }

        // Set result in log which also denotes the end of the processing of this entry.
        $field_array = ['result_data' => serialize($result)];

        $signalPayload = [$queueId, $field_array];
        SignalSlotUtility::emitSignal(
            __CLASS__,
            SignalSlotUtility::SIGNAL_QUEUEITEM_POSTPROCESS,
            $signalPayload
        );

        $this->db->exec_UPDATEquery('tx_crawler_queue', 'qid=' . intval($queueId), $field_array);

        if ($this->debugMode) {
            GeneralUtility::devlog('crawler-readurl stop ' . microtime(true), __FUNCTION__);
        }

        return $ret;
    }
1414
1415
    /**
     * Crawls a log entry that has not been stored in the queue table yet.
     *
     * The given field array is inserted into tx_crawler_queue (with exec_time set,
     * which locks the record), executed through readUrl_exec() and finally the
     * serialized result is written back to the freshly inserted record.
     *
     * @param array $field_array Queue field array (row to insert into tx_crawler_queue)
     *
     * @return array|string|bool The unmodified return value of readUrl_exec()
     */
    public function readUrlFromArray($field_array)
    {
        // Set exec_time to lock record:
        $field_array['exec_time'] = $this->getCurrentTime();
        $this->db->exec_INSERTquery('tx_crawler_queue', $field_array);

        $insertedQueueId = $this->db->sql_insert_id();
        $field_array['qid'] = $insertedQueueId;

        $crawlResult = $this->readUrl_exec($field_array);

        // Storing result_data also denotes the end of the processing of this entry.
        $resultFields = ['result_data' => serialize($crawlResult)];

        $signalPayload = [$insertedQueueId, $resultFields];
        SignalSlotUtility::emitSignal(
            __CLASS__,
            SignalSlotUtility::SIGNAL_QUEUEITEM_POSTPROCESS,
            $signalPayload
        );

        $this->db->exec_UPDATEquery(
            'tx_crawler_queue',
            'qid=' . intval($insertedQueueId),
            $resultFields
        );

        return $crawlResult;
    }
1446
1447
    /**
     * Executes a single queue record and returns the outcome.
     *
     * The serialized "parameters" field of the record decides what happens:
     * - if it names a callback object (_CALLBACKOBJ), crawler_execute() is called
     *   on that object and its serialized return value becomes the content,
     * - otherwise the URL from the parameters is fetched via requestUrl() and the
     *   "urlCrawled" event / SIGNAL_URL_CRAWLED signal are emitted.
     *
     * @param array $queueRec Queue record
     * @return array|string|bool 'ERROR' if the parameters do not unserialize to an array,
     *                           an array with a "content" key for callback objects,
     *                           otherwise whatever requestUrl() returned
     */
    public function readUrl_exec($queueRec)
    {
        // Decode parameters:
        // NOTE(review): unserialize() runs on data read from the queue table - confirm
        // that nothing untrusted can write to tx_crawler_queue.parameters.
        $decodedParameters = unserialize($queueRec['parameters']);
        if (!is_array($decodedParameters)) {
            return 'ERROR';
        }

        // Calling object:
        if ($decodedParameters['_CALLBACKOBJ']) {
            $objRef = $decodedParameters['_CALLBACKOBJ'];
            $callBackObj = &GeneralUtility::getUserObj($objRef);

            if (!is_object($callBackObj)) {
                return ['content' => 'No object: ' . $objRef];
            }

            // The callback must not see its own reference in the parameters.
            unset($decodedParameters['_CALLBACKOBJ']);

            return ['content' => serialize($callBackObj->crawler_execute($decodedParameters, $this))];
        }

        // Regular FE request.
        // Prepare: the crawler id is "<qid>:<hash>", the hash binds qid and set_id to the encryption key.
        $verificationHash = md5(
            $queueRec['qid'] . '|' . $queueRec['set_id'] . '|' . $GLOBALS['TYPO3_CONF_VARS']['SYS']['encryptionKey']
        );
        $crawlerId = $queueRec['qid'] . ':' . $verificationHash;

        // Get result:
        $requestedUrl = $decodedParameters['url'];
        $result = $this->requestUrl($requestedUrl, $crawlerId);

        // The event dispatcher is deprecated since crawler v6.4.0, will be removed in crawler v7.0.0.
        // Please use the Signal instead.
        EventDispatcher::getInstance()->post(
            'urlCrawled',
            $queueRec['set_id'],
            ['url' => $requestedUrl, 'result' => $result]
        );

        $signalPayload = ['url' => $requestedUrl, 'result' => $result];
        SignalSlotUtility::emitSignal(
            __CLASS__,
            SignalSlotUtility::SIGNAL_URL_CRAWLED,
            $signalPayload
        );

        return $result;
    }
1491
1492
    /**
     * Gets the content of a URL.
     *
     * Depending on the extension settings the URL is either handled by
     * sendDirectRequest() or fetched with a plain HTTP/1.0 GET over a socket
     * (optionally through the configured proxy). If "follow30x" is enabled and
     * the response is a redirect, the target is requested recursively and the
     * redirecting response is kept under the "parentRequest" key.
     *
     * @param string $originalUrl URL to read
     * @param string $crawlerId Crawler ID string (qid + hash to verify)
     * @param integer $timeout Timeout time (connection timeout passed to fsockopen())
     * @param integer $recursion Recursion limiter for 302 redirects
     * @return array|bool Array with the keys "request", "headers" and "content"
     *                    (plus "parentRequest" after a redirect); false if the
     *                    recursion limit is exhausted, the URL is not usable or
     *                    the connection could not be opened
     */
    public function requestUrl($originalUrl, $crawlerId, $timeout = 2, $recursion = 10)
    {
        if (!$recursion) {
            return false;
        }

        // Parse URL, checking for scheme:
        $url = parse_url($originalUrl);

        if ($url === false) {
            if (TYPO3_DLOG) {
                // Log the URL string; $url is false here and must not be formatted.
                GeneralUtility::devLog(sprintf('Could not parse_url() for string "%s"', $originalUrl), 'crawler', 4, ['crawlerId' => $crawlerId]);
            }
            return false;
        }

        // parse_url() omits keys for absent components, so read them defensively.
        $scheme = isset($url['scheme']) ? $url['scheme'] : '';

        if (!in_array($scheme, ['', 'http', 'https'], true)) {
            if (TYPO3_DLOG) {
                GeneralUtility::devLog(sprintf('Scheme does not match for url "%s"', $originalUrl), 'crawler', 4, ['crawlerId' => $crawlerId]);
            }
            return false;
        }

        // direct request
        if ($this->extensionSettings['makeDirectRequests']) {
            return $this->sendDirectRequest($originalUrl, $crawlerId);
        }

        $reqHeaders = $this->buildRequestHeaderArray($url, $crawlerId);

        // thanks to Pierrick Caillon for adding proxy support
        $rurl = $url;

        if ($this->extensionSettings['curlUse'] && $this->extensionSettings['curlProxyServer']) {
            // Connect to the proxy and send the absolute target URL in the request line.
            $rurl = parse_url($this->extensionSettings['curlProxyServer']);
            $targetPort = (isset($url['port']) && $url['port'] > 0) ? ':' . $url['port'] : '';
            $targetPath = isset($url['path']) ? $url['path'] : '';
            $url['path'] = $scheme . '://' . $url['host'] . $targetPort . $targetPath;
            $reqHeaders = $this->buildRequestHeaderArray($url, $crawlerId);
        }

        $host = $rurl['host'];
        $defaultPort = 80;

        if ($scheme === 'https') {
            $host = 'ssl://' . $host;
            $defaultPort = 443;
        }
        $port = (isset($rurl['port']) && $rurl['port'] > 0) ? $rurl['port'] : $defaultPort;

        $startTime = microtime(true);
        $fp = fsockopen($host, $port, $errno, $errstr, $timeout);

        if (!$fp) {
            if (TYPO3_DLOG) {
                GeneralUtility::devLog(sprintf('Error while opening "%s"', $originalUrl), 'crawler', 4, ['crawlerId' => $crawlerId]);
            }
            return false;
        }

        // Request message:
        $msg = implode("\r\n", $reqHeaders) . "\r\n\r\n";
        fputs($fp, $msg);

        // Read response:
        $d = $this->getHttpResponseFromStream($fp);
        fclose($fp);

        $time = microtime(true) - $startTime;
        $this->log($originalUrl . ' ' . $time);

        // Implode content and headers:
        $result = [
            'request' => $msg,
            'headers' => implode('', $d['headers']),
            'content' => implode('', (array)$d['content'])
        ];

        $user = isset($url['user']) ? $url['user'] : '';
        $pass = isset($url['pass']) ? $url['pass'] : '';

        if ($this->extensionSettings['follow30x'] && ($newUrl = $this->getRequestUrlFrom302Header($d['headers'], $user, $pass))) {
            // Follow the redirect exactly once per hop, keeping the timeout and
            // decrementing the recursion limiter so that redirect loops terminate.
            $newRequestUrl = $this->requestUrl($newUrl, $crawlerId, $timeout, $recursion - 1);

            if (!is_array($newRequestUrl)) {
                if (TYPO3_DLOG) {
                    GeneralUtility::devLog(sprintf('Error while opening "%s"', $newUrl), 'crawler', 4, ['crawlerId' => $crawlerId]);
                }
                return false;
            }

            $result = array_merge(['parentRequest' => $result], $newRequestUrl);
        }

        return $result;
    }
1594
1595
    /**
     * Determines the base path of the website frontend, e.g. "/cms/" when the
     * site is reachable at http://mydomain.com/cms/index.php.
     *
     * Sources, in order of precedence:
     * 1. the "frontendBasePath" extension setting,
     * 2. the absRefPrefix of the current TSFE,
     * 3. TYPO3_SITE_PATH from the environment (only when not running in CLI mode),
     * 4. "/" as fallback.
     *
     * @return string Base path of the website frontend, always of the form "/" or "/<pathSegments>/"
     */
    protected function getFrontendBasePath()
    {
        if (!empty($this->extensionSettings['frontendBasePath'])) {
            // Explicitly configured in the extension settings
            $basePath = $this->extensionSettings['frontendBasePath'];
        } elseif (!empty($GLOBALS['TSFE']->absRefPrefix)) {
            // Fall back to config.absRefPrefix
            $basePath = $GLOBALS['TSFE']->absRefPrefix;
        } elseif (!(defined('TYPO3_REQUESTTYPE_CLI') && TYPO3_REQUESTTYPE_CLI)) {
            // Outside CLI mode the $_SERVER environment knows the site path
            $basePath = GeneralUtility::getIndpEnv('TYPO3_SITE_PATH');
        } else {
            $basePath = '/';
        }

        // Normalise to exactly one leading and one trailing slash
        if ($basePath != '/') {
            $basePath = rtrim('/' . ltrim($basePath, '/'), '/') . '/';
        }

        return $basePath;
    }
1625
1626
    /**
     * Runs the given command through the shell.
     *
     * Thin wrapper around shell_exec().
     *
     * @param string $command Shell command to be executed
     * @return string Outputted result of the command execution (as returned by shell_exec())
     */
    protected function executeShellCommand($command)
    {
        return shell_exec($command);
    }
1637
1638
    /**
     * Reads HTTP response from the given stream.
     *
     * Lines up to the first empty line are collected (trimmed) as headers, all
     * remaining lines are collected unmodified as content. If the argument is
     * not a resource, both lists stay empty.
     *
     * @param  resource $streamPointer  Pointer to connection stream.
     * @return array                    Associative array with the following items:
     *                                  headers <array> Response headers sent by server.
     *                                  content <array> Content, with each line as an array item.
     */
    protected function getHttpResponseFromStream($streamPointer)
    {
        $response = ['headers' => [], 'content' => []];

        if (!is_resource($streamPointer)) {
            return $response;
        }

        // Read headers. fgets() signals the end of the stream with false; comparing
        // strictly keeps falsy lines such as "0" from ending the loop early.
        while (($line = fgets($streamPointer, 2048)) !== false) {
            $line = trim($line);
            if ($line === '') {
                // The empty line separates headers from content.
                break;
            }
            $response['headers'][] = $line;
        }

        // Read content
        while (($line = fgets($streamPointer, 2048)) !== false) {
            $response['content'][] = $line;
        }

        return $response;
    }
1669
1670
    /**
     * Appends a timestamped line to the crawler log file.
     *
     * Nothing happens unless the "logFileName" extension setting is filled. If
     * the file cannot be written, a devLog entry is created instead.
     *
     * @param string $message Text to append after the timestamp
     * @return void
     */
    protected function log($message)
    {
        if (empty($this->extensionSettings['logFileName'])) {
            return;
        }

        $logFileName = $this->extensionSettings['logFileName'];
        $logLine = date('Ymd His') . ' ' . $message . PHP_EOL;

        // Write errors are suppressed on purpose and reported through devLog below.
        if (!@file_put_contents($logFileName, $logLine, FILE_APPEND)) {
            GeneralUtility::devLog('File "' . $logFileName . '" could not be written, please check file permissions.', 'crawler', LogLevel::INFO);
        }
    }
1682
1683
    /**
     * Builds HTTP request headers.
     *
     * Produces the lines of an HTTP/1.0 GET request for the given URL parts.
     * Components missing from $url are treated as empty; an empty path is sent
     * as "/" so that the request line stays valid.
     *
     * @param array $url URL components as returned by parse_url()
     *                   (keys used: path, query, host, user, pass)
     * @param string $crawlerId Value for the X-T3crawler header
     *
     * @return array List of request header lines, starting with the request line
     */
    protected function buildRequestHeaderArray(array $url, $crawlerId)
    {
        // parse_url() leaves out keys for absent components, so default them.
        $path = (isset($url['path']) && $url['path'] !== '') ? $url['path'] : '/';
        $query = isset($url['query']) ? (string)$url['query'] : '';
        $host = isset($url['host']) ? $url['host'] : '';
        $user = isset($url['user']) ? (string)$url['user'] : '';
        $pass = isset($url['pass']) ? (string)$url['pass'] : '';

        $reqHeaders = [];
        $reqHeaders[] = 'GET ' . $path . ($query !== '' ? '?' . $query : '') . ' HTTP/1.0';
        $reqHeaders[] = 'Host: ' . $host;

        // Workspace preview requests get a backend user cookie.
        if (stripos($query, 'ADMCMD_previewWS') !== false) {
            $reqHeaders[] = 'Cookie: $Version="1"; be_typo_user="1"; $Path=/';
        }

        $reqHeaders[] = 'Connection: close';

        if ($user !== '') {
            $reqHeaders[] = 'Authorization: Basic ' . base64_encode($user . ':' . $pass);
        }

        $reqHeaders[] = 'X-T3crawler: ' . $crawlerId;
        $reqHeaders[] = 'User-Agent: TYPO3 crawler';

        return $reqHeaders;
    }
1707
1708
    /**
     * Check if the submitted HTTP-Header contains a redirect location and built new crawler-url
     *
     * Only responses whose status line contains "301 Moved", "302 Found" or
     * "302 Moved" are treated as redirects. The Location header is matched
     * case-insensitively. If a user is given, the credentials are injected
     * into the redirect URL; an explicit port of the target is preserved.
     *
     * @param array $headers HTTP Header (list of header lines, status line first)
     * @param string $user HTTP Auth. User
     * @param string $pass HTTP Auth. Password
     * @return bool|string The URL to request next, or false if there is no usable redirect
     */
    protected function getRequestUrlFrom302Header($headers, $user = '', $pass = '')
    {
        if (!is_array($headers) || !isset($headers[0])) {
            return false;
        }

        $statusLine = $headers[0];
        $isRedirect = stristr($statusLine, '301 Moved')
            || stristr($statusLine, '302 Found')
            || stristr($statusLine, '302 Moved');
        if (!$isRedirect) {
            return false;
        }

        // Find the first Location header. Splitting at the first colon only keeps
        // the URL intact and skips lines without a colon (e.g. the status line).
        $location = null;
        foreach ($headers as $headerLine) {
            $parts = explode(':', $headerLine, 2);
            if (count($parts) === 2 && strcasecmp(trim($parts[0]), 'Location') === 0) {
                $location = trim($parts[1]);
                break;
            }
        }
        if ($location === null) {
            return false;
        }

        if ((string)$user === '') {
            return $location;
        }

        // Re-inject the HTTP auth credentials into the redirect target.
        $tmp = parse_url($location);
        if (!$tmp) {
            return false;
        }

        $newUrl = (isset($tmp['scheme']) ? $tmp['scheme'] : '') . '://'
            . $user . ':' . $pass . '@'
            . (isset($tmp['host']) ? $tmp['host'] : '');
        if (isset($tmp['port'])) {
            $newUrl .= ':' . $tmp['port'];
        }
        $newUrl .= isset($tmp['path']) ? $tmp['path'] : '';
        if (isset($tmp['query']) && $tmp['query'] != '') {
            $newUrl .= '?' . $tmp['query'];
        }

        return $newUrl;
    }
1750
1751
    /**************************
1752
     *
1753
     * tslib_fe hooks:
1754
     *
1755
     **************************/
1756
1757
    /**
     * Initialization hook (called after database connection)
     * Takes the "HTTP_X_T3CRAWLER" header and looks up queue record and verifies if the session comes from the system (by comparing hashes)
     *
     * The header has the form "<qid>:<hash>". On a match the crawler state is
     * stored in $params['pObj']->applicationData['tx_crawler']; on a mismatch the
     * request is terminated. Requests without the header are left untouched.
     *
     * @param array $params Parameters from frontend
     * @param object $ref TSFE object (reference under PHP5)
     * @return void
     *
     * FIXME: Look like this is not used, in commit 9910d3f40cce15f4e9b7bcd0488bf21f31d53ebc it's added as public,
     * FIXME: I think this can be removed. (TNM)
     */
    public function fe_init(&$params, $ref)
    {
        // Authenticate crawler request:
        if (!isset($_SERVER['HTTP_X_T3CRAWLER'])) {
            return;
        }

        // Pad the parts so a header without ":" yields an empty hash instead of a notice.
        list($queueId, $hash) = array_pad(explode(':', $_SERVER['HTTP_X_T3CRAWLER'], 2), 2, '');

        // exec_SELECTgetSingleRow() hands back the row itself (associative array),
        // so it must be used directly rather than unpacked with list().
        $queueRec = $this->db->exec_SELECTgetSingleRow('*', 'tx_crawler_queue', 'qid=' . intval($queueId));

        // If a crawler record was found and hash was matching, set it up:
        if (is_array($queueRec) && $hash === md5($queueRec['qid'] . '|' . $queueRec['set_id'] . '|' . $GLOBALS['TYPO3_CONF_VARS']['SYS']['encryptionKey'])) {
            $params['pObj']->applicationData['tx_crawler']['running'] = true;
            // NOTE(review): unserialize() on queue table content - confirm only trusted code writes this field.
            $params['pObj']->applicationData['tx_crawler']['parameters'] = unserialize($queueRec['parameters']);
            $params['pObj']->applicationData['tx_crawler']['log'] = [];
        } else {
            die('No crawler entry found!');
        }
    }
1785
1786
    /*****************************
1787
     *
1788
     * Compiling URLs to crawl - tools
1789
     *
1790
     *****************************/
1791
1792
    /**
     * Walks the page tree below the given page and compiles the crawler URLs of
     * every page found, returning the HTML table rows that describe them.
     *
     * Mount point pages (doktype 7) additionally contribute the rows of the
     * mounted subtree.
     *
     * @param integer $id Root page id to start from.
     * @param integer $depth Depth of tree, 0=only id-page, 1= on sublevel, 99 = infinite
     * @param integer $scheduledTime Unix Time when the URL is timed to be visited when put in queue
     * @param integer $reqMinute Number of requests per minute (creates the interleave between requests)
     * @param boolean $submitCrawlUrls If set, submits the URLs to queue in database (real crawling)
     * @param boolean $downloadCrawlUrls If set (and submitcrawlUrls is false) will fill $downloadUrls with entries)
     * @param array $incomingProcInstructions Array of processing instructions
     * @param array $configurationSelection Array of configuration keys
     * @return string HTML rows as produced by drawURLs_addRowsForPage()
     */
    public function getPageTreeAndUrls(
        $id,
        $depth,
        $scheduledTime,
        $reqMinute,
        $submitCrawlUrls,
        $downloadCrawlUrls,
        array $incomingProcInstructions,
        array $configurationSelection
    ) {
        global $BACK_PATH, $LANG;

        if (!is_object($LANG)) {
            $LANG = GeneralUtility::makeInstance(LanguageService::class);
            $LANG->init(0);
        }

        // Remember the request settings; drawURLs_addRowsForPage() reads them.
        $this->scheduledTime = $scheduledTime;
        $this->reqMinute = $reqMinute;
        $this->submitCrawlUrls = $submitCrawlUrls;
        $this->downloadCrawlUrls = $downloadCrawlUrls;
        $this->incomingProcInstructions = $incomingProcInstructions;
        $this->incomingConfigurationSelection = $configurationSelection;

        // Start with empty accumulators
        $this->duplicateTrack = [];
        $this->downloadUrls = [];

        // Drawing tree:
        /* @var PageTreeView $pageTree */
        $pageTree = GeneralUtility::makeInstance(PageTreeView::class);
        $permsClause = $GLOBALS['BE_USER']->getPagePermsClause(1);
        $pageTree->init('AND ' . $permsClause);

        // Set root row (only if the page is accessible):
        $rootPageInfo = BackendUtility::readPageAccess($id, $permsClause);
        if (is_array($rootPageInfo)) {
            $pageTree->tree[] = [
                'row' => $rootPageInfo,
                'HTML' => IconUtility::getIconForRecord('pages', $rootPageInfo)
            ];
        }

        // Get branch beneath:
        if ($depth) {
            $pageTree->getTree($id, $depth, '');
        }

        // Traverse page tree:
        $rowsHtml = '';

        foreach ($pageTree->tree as $treeItem) {
            $this->MP = false;

            // recognize mount points
            if ($treeItem['row']['doktype'] == 7) {
                $mountPageRows = $this->db->exec_SELECTgetRows('*', 'pages', 'uid = ' . $treeItem['row']['uid']);
                $mountPid = $mountPageRows[0]['mount_pid'];

                // fetch mounted pages
                $this->MP = $mountPid . '-' . $treeItem['row']['uid'];

                $mountedTree = GeneralUtility::makeInstance(PageTreeView::class);
                $mountedTree->init('AND ' . $permsClause);
                $mountedTree->getTree($mountPid, $depth, '');

                foreach ($mountedTree->tree as $mountedItem) {
                    $mountedTitle = $mountedItem['HTML'] . BackendUtility::getRecordTitle('pages', $mountedItem['row'], true);
                    $rowsHtml .= $this->drawURLs_addRowsForPage($mountedItem['row'], $mountedTitle);
                }

                if ($mountPageRows[0]['mount_pid_ol']) {
                    // replace page when mount_pid_ol is enabled
                    $treeItem['row']['uid'] = $mountPid;
                } else {
                    // if the mount_pid_ol is not set the MP must not be used for the mountpoint page
                    $this->MP = false;
                }
            }

            $pageTitle = $treeItem['HTML'] . BackendUtility::getRecordTitle('pages', $treeItem['row'], true);
            $rowsHtml .= $this->drawURLs_addRowsForPage($treeItem['row'], $pageTitle);
        }

        return $rowsHtml;
    }
1890
1891
    /**
     * Expands exclude string
     *
     * The string is a comma separated list of "<pid>" or "<pid>+<depth>" parts.
     * A part without depth excludes only the page itself, a trailing "+" without
     * a number means depth 99. Results (and the page trees fetched for them) are
     * cached in static variables for the rest of the request.
     *
     * @param string $excludeString Exclude string
     * @return array List of unique page ids
     */
    public function expandExcludeString($excludeString)
    {
        // internal static caches;
        static $expandedCache;
        static $subTreeCache;

        if (!empty($expandedCache[$excludeString])) {
            return $expandedCache[$excludeString];
        }

        $pidList = [];

        if (!empty($excludeString)) {
            /** @var PageTreeView $tree */
            $tree = GeneralUtility::makeInstance(PageTreeView::class);
            $tree->init('AND ' . $this->backendUser->getPagePermsClause(1));

            foreach (GeneralUtility::trimExplode(',', $excludeString) as $excludePart) {
                list($pid, $depth) = GeneralUtility::trimExplode('+', $excludePart);

                // default is "page only" = "depth=0"
                if (empty($depth)) {
                    $depth = strpos($excludePart, '+') !== false ? 99 : 0;
                }

                $pidList[] = $pid;

                if (!($depth > 0)) {
                    continue;
                }

                // Fetch the subtree once per pid/depth combination
                if (empty($subTreeCache[$pid][$depth])) {
                    $tree->reset();
                    $tree->getTree($pid, $depth);
                    $subTreeCache[$pid][$depth] = $tree->tree;
                }

                foreach ($subTreeCache[$pid][$depth] as $subTreeItem) {
                    $pidList[] = $subTreeItem['row']['uid'];
                }
            }
        }

        $expandedCache[$excludeString] = array_unique($pidList);

        return $expandedCache[$excludeString];
    }
1942
1943
    /**
     * Create the rows for display of the page tree
     * For each page a number of rows are shown displaying GET variable configuration
     *
     * One row is rendered per crawler configuration of the page. Configurations
     * that are not part of the incoming configuration selection are dropped
     * first. For configurations that exclude the page a short "No entries" row
     * is rendered; for all others the URL list is compiled through
     * urlListFromUrlArray().
     *
     * @param    array        $pageRow Page row
     * @param    string       $pageTitleAndIcon Page icon and title for row
     * @return    string        HTML <tr> content (one or more)
     */
    public function drawURLs_addRowsForPage(array $pageRow, $pageTitleAndIcon)
    {
        $skipMessage = '';

        // Get list of configurations
        $configurations = $this->getUrlsForPageRow($pageRow, $skipMessage);

        // remove configuration that does not match the current selection
        if (count($this->incomingConfigurationSelection) > 0) {
            foreach ($configurations as $confKey => $unusedConfArray) {
                if (!in_array($confKey, $this->incomingConfigurationSelection)) {
                    unset($configurations[$confKey]);
                }
            }
        }

        // Nothing configured for this page: a single informational row
        if (!count($configurations)) {
            $message = !empty($skipMessage) ? ' (' . $skipMessage . ')' : '';

            // Compile row:
            return '
                <tr class="bgColor-20" style="border-bottom: 1px solid black;">
                    <td>' . $pageTitleAndIcon . '</td>
                    <td colspan="6"><em>No entries</em>' . $message . '</td>
                </tr>';
        }

        // Labels of the sub configuration options shown in the options column
        $optionLabels = [
            'userGroups' => 'User Groups: ',
            'baseUrl' => 'Base Url: ',
            'procInstrFilter' => 'ProcInstr: ',
        ];

        // Traverse parameter combinations:
        $rowIndex = 0;
        $content = '';

        foreach ($configurations as $confKey => $confArray) {
            // Title column (spans all rows, therefore only in the first one):
            $titleClm = $rowIndex
                ? ''
                : '<td rowspan="' . count($configurations) . '">' . $pageTitleAndIcon . '</td>';
            $rowClassSuffix = $rowIndex % 2 ? '-20' : '-10';

            if (in_array($pageRow['uid'], $this->expandExcludeString($confArray['subCfg']['exclude']))) {
                $content .= '<tr class="bgColor' . $rowClassSuffix . '">
                            ' . $titleClm . '
                            <td>' . htmlspecialchars($confKey) . '</td>
                            <td colspan="5"><em>No entries</em> (Page is excluded in this configuration)</td>
                        </tr>';
            } else {
                // URL list:
                $urlList = $this->urlListFromUrlArray(
                    $confArray,
                    $pageRow,
                    $this->scheduledTime,
                    $this->reqMinute,
                    $this->submitCrawlUrls,
                    $this->downloadCrawlUrls,
                    $this->duplicateTrack,
                    $this->downloadUrls,
                    $this->incomingProcInstructions // if empty the urls won't be filtered by processing instructions
                );

                // Expanded parameters:
                $paramRows = '';
                $valueCounts = [];
                $combinations = 1;
                foreach ($confArray['paramExpanded'] as $paramName => $paramValues) {
                    $valueCount = count($paramValues);
                    $paramRows .= '
                            <tr>
                                <td class="bgColor4-20">' . htmlspecialchars('&' . $paramName . '=') . '<br/>' .
                                                '(' . $valueCount . ')' .
                                                '</td>
                                <td class="bgColor4" nowrap="nowrap">' . nl2br(htmlspecialchars(implode(chr(10), $paramValues))) . '</td>
                            </tr>
                        ';
                    $combinations *= $valueCount;
                    $valueCounts[] = $valueCount;
                }
                $paramExpanded = '<table class="lrPadding c-list param-expanded">' . $paramRows . '</table>';
                $paramExpanded .= 'Comb: ' . implode('*', $valueCounts) . '=' . $combinations;

                // Options
                $optionValues = '';
                foreach ($optionLabels as $optionKey => $optionLabel) {
                    if ($confArray['subCfg'][$optionKey]) {
                        $optionValues .= $optionLabel . $confArray['subCfg'][$optionKey] . '<br/>';
                    }
                }

                // One parameter per line for the "parsed parameters" column
                $parsedParameters = GeneralUtility::implodeArrayForUrl('', $confArray['paramParsed']);
                $parsedParameters = rawurldecode(trim(str_replace('&', chr(10) . '&', $parsedParameters)));

                // Compile row:
                $content .= '
                        <tr class="bgColor' . $rowClassSuffix . '">
                            ' . $titleClm . '
                            <td>' . htmlspecialchars($confKey) . '</td>
                            <td>' . nl2br(htmlspecialchars($parsedParameters)) . '</td>
                            <td>' . $paramExpanded . '</td>
                            <td nowrap="nowrap">' . $urlList . '</td>
                            <td nowrap="nowrap">' . $optionValues . '</td>
                            <td nowrap="nowrap">' . DebugUtility::viewArray($confArray['subCfg']['procInstrParams.']) . '</td>
                        </tr>';
            }

            $rowIndex++;
        }

        return $content;
    }
2060
2061
    /*****************************
2062
     *
2063
     * CLI functions
2064
     *
2065
     *****************************/
2066
2067
    /**
     * Main function for running from Command Line PHP script (cron job)
     * See ext/crawler/cli/crawler_cli.phpsh for details
     *
     * Prints the help text and terminates the script when -h/--help is given.
     * Otherwise, if the crawler is not disabled and a process slot could be
     * acquired, the queue is processed via CLI_run(), process records without
     * assigned items are removed and the own process is released again.
     *
     * @return int bitmask composed of the self::CLI_STATUS_* constants
     */
    public function CLI_main()
    {
        $this->setAccessMode('cli');
        $result = self::CLI_STATUS_NOTHING_PROCCESSED;
        $cliObj = GeneralUtility::makeInstance(CrawlerCommandLineController::class);

        if (isset($cliObj->cli_args['-h']) || isset($cliObj->cli_args['--help'])) {
            $cliObj->cli_validateArgs();
            $cliObj->cli_help();
            exit;
        }

        if (!$this->getDisabled() && $this->CLI_checkAndAcquireNewProcess($this->CLI_buildProcessId())) {
            // Command line values take precedence over the extension settings
            $countInARun = $cliObj->cli_argValue('--countInARun') ? intval($cliObj->cli_argValue('--countInARun')) : $this->extensionSettings['countInARun'];
            // Seconds
            $sleepAfterFinish = $cliObj->cli_argValue('--sleepAfterFinish') ? intval($cliObj->cli_argValue('--sleepAfterFinish')) : $this->extensionSettings['sleepAfterFinish'];
            // Milliseconds
            $sleepTime = $cliObj->cli_argValue('--sleepTime') ? intval($cliObj->cli_argValue('--sleepTime')) : $this->extensionSettings['sleepTime'];

            try {
                // Run process:
                $result = $this->CLI_run($countInARun, $sleepTime, $sleepAfterFinish);
            } catch (\Exception $e) {
                // A failing run must not prevent the cleanup below
                $this->CLI_debug(get_class($e) . ': ' . $e->getMessage());
                $result = self::CLI_STATUS_ABORTED;
            }

            // Cleanup
            $this->db->exec_DELETEquery('tx_crawler_process', 'assigned_items_count = 0');

            //TODO can't we do that in a clean way?
            $this->CLI_releaseProcesses($this->CLI_buildProcessId());

            // Query the remaining items once so that the debug output and the
            // returned status are based on the same number.
            $unprocessedItems = $this->queueRepository->countUnprocessedItems();
            $this->CLI_debug("Unprocessed Items remaining:" . $unprocessedItems . " (" . $this->CLI_buildProcessId() . ")");
            $result |= ($unprocessedItems > 0 ? self::CLI_STATUS_REMAIN : self::CLI_STATUS_NOTHING_PROCCESSED);
        } else {
            $result |= self::CLI_STATUS_ABORTED;
        }

        return $result;
    }
2114
2115
    /**
     * Entry point of the queue building command line script (access mode "cli_im").
     *
     * Collects the URLs for the page given as command line argument and,
     * depending on the value of the "-o" option, prints the download URLs
     * ("url"), reports the entries put into the queue ("queue"), executes the
     * requests right away ("exec") or only lists what was found (anything else).
     * Prints the help text and terminates the script if no page argument is given.
     *
     * @return void
     */
    public function CLI_main_im()
    {
        $this->setAccessMode('cli_im');

        $cliObj = GeneralUtility::makeInstance(QueueCommandLineController::class);

        // Force user to admin state and set workspace to "Live":
        $this->backendUser->user['admin'] = 1;
        $this->backendUser->setWorkspace(0);

        // Print help
        if (!isset($cliObj->cli_args['_DEFAULT'][1])) {
            $cliObj->cli_validateArgs();
            $cliObj->cli_help();
            exit;
        }

        $cliObj->cli_validateArgs();

        // The output mode drives most decisions below, so resolve it once
        $outputMode = $cliObj->cli_argValue('-o');
        $isExecMode = $outputMode === 'exec';
        $isUrlMode = $outputMode === 'url';
        $submitsToQueue = $outputMode === 'queue' || $isExecMode;

        if ($isExecMode) {
            $this->registerQueueEntriesInternallyOnly = true;
        }

        // With a second default argument the crawler is called over TYPO3 BE,
        // otherwise it is called over cli
        $pageArgument = isset($cliObj->cli_args['_DEFAULT'][2])
            ? $cliObj->cli_args['_DEFAULT'][2]
            : $cliObj->cli_args['_DEFAULT'][1];
        $pageId = MathUtility::forceIntegerInRange($pageArgument, 0);

        $configurationKeys = $this->getConfigurationKeys($cliObj);

        if (!is_array($configurationKeys)) {
            $configurations = $this->getUrlsForPageId($pageId);
            $configurationKeys = is_array($configurations) ? array_keys($configurations) : [];
        }

        if ($submitsToQueue) {
            $reason = new Reason();
            $reason->setReason(Reason::REASON_GUI_SUBMIT);
            $reason->setDetailText('The cli script of the crawler added to the queue');

            // The event dispatcher is deprecated since crawler v6.4.0, will be removed in crawler v7.0.0.
            // Please use the Signal instead.
            EventDispatcher::getInstance()->post(
                'invokeQueueChange',
                $this->setID,
                ['reason' => $reason]
            );

            $signalPayload = ['reason' => $reason];
            SignalSlotUtility::emitSignal(
                __CLASS__,
                SignalSlotUtility::SIGNAL_INVOKE_QUEUE_CHANGE,
                $signalPayload
            );
        }

        if ($this->extensionSettings['cleanUpOldQueueEntries']) {
            $this->cleanUpOldQueueEntries();
        }

        $this->setID = (int) GeneralUtility::md5int(microtime());
        $this->getPageTreeAndUrls(
            $pageId,
            MathUtility::forceIntegerInRange($cliObj->cli_argValue('-d'), 0, 99),
            $this->getCurrentTime(),
            MathUtility::forceIntegerInRange($cliObj->cli_isArg('-n') ? $cliObj->cli_argValue('-n') : 30, 1, 1000),
            $submitsToQueue,
            $isUrlMode,
            GeneralUtility::trimExplode(',', $cliObj->cli_argValue('-proc'), true),
            $configurationKeys
        );

        $lineBreak = chr(10);

        if ($isUrlMode) {
            $cliObj->cli_echo(implode($lineBreak, $this->downloadUrls) . $lineBreak, true);
            return;
        }

        if ($isExecMode) {
            $cliObj->cli_echo("Executing " . count($this->urlList) . " requests right away:\n\n");
            $cliObj->cli_echo(implode($lineBreak, $this->urlList) . $lineBreak);
            $cliObj->cli_echo("\nProcessing:\n");

            $logIndent = $lineBreak . chr(9) . chr(9);

            foreach ($this->queueEntries as $queueEntry) {
                $parameters = unserialize($queueEntry['parameters']);
                $cliObj->cli_echo($parameters['url'] . ' (' . implode(',', $parameters['procInstructions']) . ') => ');

                $result = $this->readUrlFromArray($queueEntry);

                $requestResult = unserialize($result['content']);
                if (!is_array($requestResult)) {
                    // The response could not be unserialized into an array: print its stripped text instead
                    $cliObj->cli_echo('Error checking Crawler Result: ' . substr(preg_replace('/\s+/', ' ', strip_tags($result['content'])), 0, 30000) . '...' . $lineBreak);
                    continue;
                }

                $resLog = '';
                if (is_array($requestResult['log'])) {
                    $resLog = $logIndent . implode($logIndent, $requestResult['log']);
                }
                $cliObj->cli_echo('OK: ' . $resLog . $lineBreak);
            }
            return;
        }

        if ($outputMode === 'queue') {
            $cliObj->cli_echo("Putting " . count($this->urlList) . " entries in queue:\n\n");
            $cliObj->cli_echo(implode($lineBreak, $this->urlList) . $lineBreak);
            return;
        }

        $cliObj->cli_echo(count($this->urlList) . " entries found for processing. (Use -o to decide action):\n\n", true);
        $cliObj->cli_echo(implode($lineBreak, $this->urlList) . $lineBreak, true);
    }
2229
2230
    /**
     * Entry point of the flush command line script (access mode "cli_flush").
     *
     * The first default argument is the page id; a page id of 0 requests a
     * full flush. The "-o" option selects the entries: "all", "finished" or
     * "pending". Any other mode only prints the help text. The script
     * terminates after printing the help if no page argument is given.
     *
     * @return bool false if the mode was unknown or getLogEntriesForPageId() returned false, true otherwise
     */
    public function CLI_main_flush()
    {
        $this->setAccessMode('cli_flush');
        $cliObj = GeneralUtility::makeInstance(FlushCommandLineController::class);

        // Force user to admin state and set workspace to "Live":
        $this->backendUser->user['admin'] = 1;
        $this->backendUser->setWorkspace(0);

        // Print help
        if (!isset($cliObj->cli_args['_DEFAULT'][1])) {
            $cliObj->cli_validateArgs();
            $cliObj->cli_help();
            exit;
        }

        $cliObj->cli_validateArgs();
        $pageId = MathUtility::forceIntegerInRange($cliObj->cli_args['_DEFAULT'][1], 0);
        $fullFlush = ($pageId === 0);

        $mode = $cliObj->cli_argValue('-o');

        // Translate the mode into the filter handed to getLogEntriesForPageId()
        switch ($mode) {
            case 'all':
                $filter = '';
                break;
            case 'finished':
            case 'pending':
                $filter = $mode;
                break;
            default:
                $cliObj->cli_validateArgs();
                $cliObj->cli_help();
                return false;
        }

        return $this->getLogEntriesForPageId($pageId, $filter, true, $fullFlush) !== false;
    }
2273
2274
    /**
     * Reads the comma separated value of the "-conf" CLI option and returns
     * its trimmed parts; an empty option yields an empty array.
     *
     * @param QueueCommandLineController $cliObj
     * @return array
     *
     * @deprecated since crawler v6.3.0, will be removed in crawler v7.0.0.
     */
    protected function getConfigurationKeys(QueueCommandLineController $cliObj)
    {
        $rawKeyList = trim($cliObj->cli_argValue('-conf'));
        if ($rawKeyList === '') {
            return [];
        }

        return GeneralUtility::trimExplode(',', $rawKeyList);
    }
2287
2288
    /**
     * Running the functionality of the CLI (crawling URLs from queue)
     *
     * Runs the CLI hooks, purges old queue entries (if "purgeQueueDays" is
     * set), reserves up to $countInARun due queue entries for the current
     * process and reads them one after the other.
     *
     * @param int $countInARun maximum number of queue entries to select for this run
     * @param int $sleepTime value passed to usleep() after each processed entry
     * @param int $sleepAfterFinish value passed to sleep() after the last entry
     * @return int bitmask composed of the readUrl() results and the self::CLI_STATUS_* constants
     */
    public function CLI_run($countInARun, $sleepTime, $sleepAfterFinish)
    {
        $result = 0;
        $counter = 0;

        // First, run hooks:
        $this->CLI_runHooks();

        // Clean up the queue
        if (intval($this->extensionSettings['purgeQueueDays']) > 0) {
            $purgeDate = $this->getCurrentTime() - 24 * 60 * 60 * intval($this->extensionSettings['purgeQueueDays']);
            $del = $this->db->exec_DELETEquery(
                'tx_crawler_queue',
                'exec_time!=0 AND exec_time<' . $purgeDate
            );
            if (!$del) {
                GeneralUtility::devLog('Records could not be deleted.', 'crawler', LogLevel::INFO);
            }
        }

        // Select entries:
        //TODO Shouldn't this reside within the transaction?
        $rows = $this->db->exec_SELECTgetRows(
            'qid,scheduled',
            'tx_crawler_queue',
            'exec_time=0
                AND process_scheduled= 0
                AND scheduled<=' . $this->getCurrentTime(),
            '',
            'scheduled, qid',
            intval($countInARun)
        );

        // The select may not return an array (e.g. on a failed query), so
        // guard before counting and iterating.
        if (is_array($rows) && count($rows) > 0) {
            $quidList = [];

            foreach ($rows as $r) {
                $quidList[] = $r['qid'];
            }

            $processId = $this->CLI_buildProcessId();

            // reserve queue entries for process
            $this->db->sql_query('BEGIN');
            //TODO make sure we're not taking assigned queue-entries
            $this->db->exec_UPDATEquery(
                'tx_crawler_queue',
                'qid IN (' . implode(',', $quidList) . ')',
                [
                    'process_scheduled' => intval($this->getCurrentTime()),
                    'process_id' => $processId
                ]
            );

            // save the number of assigned queue entries to determine how many have been processed later
            $numberOfAffectedRows = $this->db->sql_affected_rows();
            $this->db->exec_UPDATEquery(
                'tx_crawler_process',
                "process_id = '" . $processId . "'",
                [
                    'assigned_items_count' => intval($numberOfAffectedRows)
                ]
            );

            // Only continue if every selected entry could be reserved
            if ($numberOfAffectedRows == count($quidList)) {
                $this->db->sql_query('COMMIT');
            } else {
                $this->db->sql_query('ROLLBACK');
                $this->CLI_debug("Nothing processed due to multi-process collision (" . $this->CLI_buildProcessId() . ")");
                return ($result | self::CLI_STATUS_ABORTED);
            }

            foreach ($rows as $r) {
                $result |= $this->readUrl($r['qid']);

                $counter++;
                usleep(intval($sleepTime)); // Just to relax the system

                // if during the start and the current read url the cli has been disable we need to return from the function
                // mark the process NOT as ended.
                if ($this->getDisabled()) {
                    return ($result | self::CLI_STATUS_ABORTED);
                }

                if (!$this->CLI_checkIfProcessIsActive($this->CLI_buildProcessId())) {
                    $this->CLI_debug("conflict / timeout (" . $this->CLI_buildProcessId() . ")");

                    //TODO might need an additional returncode
                    $result |= self::CLI_STATUS_ABORTED;
                    break; //possible timeout
                }
            }

            sleep(intval($sleepAfterFinish));

            $msg = 'Rows: ' . $counter;
            $this->CLI_debug($msg . " (" . $this->CLI_buildProcessId() . ")");
        } else {
            $this->CLI_debug("Nothing within queue which needs to be processed (" . $this->CLI_buildProcessId() . ")");
        }

        if ($counter > 0) {
            $result |= self::CLI_STATUS_PROCESSED;
        }

        return $result;
    }
2403
2404
    /**
     * Activate hooks
     *
     * Instantiates every object reference registered in
     * $TYPO3_CONF_VARS['EXTCONF']['crawler']['cli_hooks'] and calls its
     * crawler_init() method with this controller as argument. Does nothing
     * if no hooks are registered.
     *
     * @return void
     */
    public function CLI_runHooks()
    {
        global $TYPO3_CONF_VARS;

        // Guard with isset() so that a missing registration does not raise a notice
        if (!isset($TYPO3_CONF_VARS['EXTCONF']['crawler']['cli_hooks'])
            || !is_array($TYPO3_CONF_VARS['EXTCONF']['crawler']['cli_hooks'])
        ) {
            return;
        }

        foreach ($TYPO3_CONF_VARS['EXTCONF']['crawler']['cli_hooks'] as $objRef) {
            // Objects are handles already; assigning a function result by
            // reference is not needed and only triggers a PHP notice.
            $hookObj = GeneralUtility::getUserObj($objRef);
            if (is_object($hookObj)) {
                $hookObj->crawler_init($this);
            }
        }
    }
2421
2422
    /**
     * Tries to register a new process record with the given id.
     *
     * Counts the active, non-deleted processes whose ttl has not expired yet;
     * if that number is below the "processLimit" setting a new record is
     * inserted. Processes with an expired ttl are released and records marked
     * as deleted are removed, regardless of whether a slot was available.
     * @todo preemption might not be the most elegant way to clean up
     *
     * @param string $id identification string for the process
     * @return boolean true if the process record was created, false otherwise
     */
    public function CLI_checkAndAcquireNewProcess($id)
    {
        $systemProcessId = getmypid();
        if ($systemProcessId < 1) {
            return false;
        }

        $runningProcesses = 0;
        $expiredProcessIds = [];

        $this->db->sql_query('BEGIN');

        $res = $this->db->exec_SELECTquery(
            'process_id,ttl',
            'tx_crawler_process',
            'active=1 AND deleted=0'
        );

        $now = $this->getCurrentTime();

        // Split the active processes into expired ones (orphans) and running ones
        while ($row = $this->db->sql_fetch_assoc($res)) {
            if ($row['ttl'] < $now) {
                $expiredProcessIds[] = $row['process_id'];
                continue;
            }
            $runningProcesses++;
        }

        // if there are less than allowed active processes then add a new one
        $slotAvailable = $runningProcesses < intval($this->extensionSettings['processLimit']);
        if ($slotAvailable) {
            $this->CLI_debug("add process " . $this->CLI_buildProcessId() . " (" . ($runningProcesses + 1) . "/" . intval($this->extensionSettings['processLimit']) . ")");

            // create new process record
            $processRecord = [
                'process_id' => $id,
                'active' => '1',
                'ttl' => ($now + intval($this->extensionSettings['processMaxRunTime'])),
                'system_process_id' => $systemProcessId
            ];
            $this->db->exec_INSERTquery('tx_crawler_process', $processRecord);
        } else {
            $this->CLI_debug("Processlimit reached (" . ($runningProcesses) . "/" . intval($this->extensionSettings['processLimit']) . ")");
        }

        // maybe this should be somehow included into the current lock
        $this->CLI_releaseProcesses($expiredProcessIds, true);
        $this->CLI_deleteProcessesMarkedDeleted();

        $this->db->sql_query('COMMIT');

        return $slotAvailable;
    }
2486
2487
    /**
     * Release a process and the required resources
     *
     * Besides deactivating the requested processes and freeing their
     * unprocessed queue entries, this also cleans up after processes that
     * were deactivated by an earlier call.
     *
     * @param  mixed    $releaseIds   string with a single process-id or array with multiple process-ids
     * @param  boolean  $withinLock   show whether the DB-actions are included within an existing lock
     * @return boolean  false if there was nothing to release, true otherwise
     */
    public function CLI_releaseProcesses($releaseIds, $withinLock = false)
    {
        if (!is_array($releaseIds)) {
            $releaseIds = [$releaseIds];
        }

        if (count($releaseIds) === 0) {
            return false;   //nothing to release
        }

        // Quote the ids once, with single quotes, so that both statements
        // below use the same standard SQL string literal syntax.
        // NOTE(review): the ids are not escaped; they are assumed to be
        // internally generated process ids - confirm against the callers.
        $quotedIdList = "'" . implode("','", $releaseIds) . "'";

        if (!$withinLock) {
            $this->db->sql_query('BEGIN');
        }

        // some kind of 2nd chance algo - this way you need at least 2 processes to have a real cleanup
        // this ensures that a single process can't mess up the entire process table

        // free all queue entries which are still assigned to processes that are not active anymore
        $this->db->exec_UPDATEquery(
            'tx_crawler_queue',
            'process_id IN (SELECT process_id FROM tx_crawler_process WHERE active=0 AND deleted=0)',
            [
                'process_scheduled' => 0,
                'process_id' => ''
            ]
        );
        // mark all processes as deleted which have no "waiting" queue-entries and which are not active
        $this->db->exec_UPDATEquery(
            'tx_crawler_process',
            'active=0 AND deleted=0
            AND NOT EXISTS (
                SELECT * FROM tx_crawler_queue
                WHERE tx_crawler_queue.process_id = tx_crawler_process.process_id
                AND tx_crawler_queue.exec_time = 0
            )',
            [
                'deleted' => '1',
                'system_process_id' => 0
            ]
        );
        // mark all requested processes as non-active
        $this->db->exec_UPDATEquery(
            'tx_crawler_process',
            'process_id IN (' . $quotedIdList . ') AND deleted=0',
            [
                'active' => '0'
            ]
        );
        // free the unprocessed queue entries of the requested processes
        $this->db->exec_UPDATEquery(
            'tx_crawler_queue',
            'exec_time=0 AND process_id IN (' . $quotedIdList . ')',
            [
                'process_scheduled' => 0,
                'process_id' => ''
            ]
        );

        if (!$withinLock) {
            $this->db->sql_query('COMMIT');
        }

        return true;
    }
2556
2557
    /**
     * Removes all records flagged with deleted = 1 from tx_crawler_process.
     *
     * @return void
     */
    public function CLI_deleteProcessesMarkedDeleted()
    {
        $processTable = 'tx_crawler_process';
        $flaggedAsDeleted = 'deleted = 1';

        $this->db->exec_DELETEquery($processTable, $flaggedAsDeleted);
    }
2566
2567
    /**
     * Looks up the non-deleted process record with the given id and reports
     * whether it is flagged as active.
     * Used to determine timeouts and to ensure a proper cleanup if there's a timeout
     *
     * @param  string  $pid  identification string for the process
     * @return boolean true if a record was found and its "active" field is 1, false otherwise
     *
     * FIXME: Please remove Transaction, not needed as only a select query.
     */
    public function CLI_checkIfProcessIsActive($pid)
    {
        $this->db->sql_query('BEGIN');
        $res = $this->db->exec_SELECTquery(
            'process_id,active,ttl',
            'tx_crawler_process',
            'process_id = \'' . $pid . '\'  AND deleted=0',
            '',
            'ttl',
            '0,1'
        );
        $processRecord = $this->db->sql_fetch_assoc($res);
        $this->db->sql_query('COMMIT');

        if (!$processRecord) {
            // no matching process record
            return false;
        }

        return (int) $processRecord['active'] === 1;
    }
2595
2596
    /**
     * Returns the id of the current process. The id is generated from the
     * current microtime on the first call and reused afterwards.
     *
     * @return string  the ID
     */
    public function CLI_buildProcessId()
    {
        if ($this->processID) {
            return $this->processID;
        }

        $this->processID = GeneralUtility::shortMD5($this->microtime(true));

        return $this->processID;
    }
2608
2609
    /**
     * Thin wrapper around microtime().
     *
     * @param bool $get_as_float passed through to microtime()
     *
     * @return mixed the result of microtime()
     */
    protected function microtime($get_as_float = false)
    {
        $currentMicrotime = microtime($get_as_float);

        return $currentMicrotime;
    }
2618
2619
    /**
     * Echoes a message followed by a line break and flushes the output,
     * but only if the "processDebug" extension setting is enabled.
     *
     * @param  string $msg  the message
     */
    public function CLI_debug($msg)
    {
        if (!intval($this->extensionSettings['processDebug'])) {
            return;
        }

        echo $msg . "\n";
        flush();
    }
2631
2632
    /**
     * Fetches the content of a URL by running the crawler's cli/bootstrap.php
     * through the configured PHP binary instead of doing an HTTP request.
     *
     * The frontend base path, the URL and the serialized, base64 encoded
     * request headers are handed over as shell arguments. The URL together
     * with the elapsed time is written to the log.
     *
     * @param  string $url          Page URL
     * @param  int    $crawlerId    Crawler-ID
     * @return array  empty if the URL cannot be parsed, otherwise the keys "request", "headers" (always empty) and "content"
     */
    protected function sendDirectRequest($url, $crawlerId)
    {
        $urlParts = parse_url($url);
        if (!is_array($urlParts)) {
            return [];
        }

        $requestHeaders = $this->buildRequestHeaderArray($urlParts, $crawlerId);

        // Every part is escaped on its own before the command line is assembled
        $commandParts = [
            escapeshellcmd($this->extensionSettings['phpPath']),
            escapeshellarg(ExtensionManagementUtility::extPath('crawler') . 'cli/bootstrap.php'),
            escapeshellarg($this->getFrontendBasePath()),
            escapeshellarg($url),
            escapeshellarg(base64_encode(serialize($requestHeaders)))
        ];
        $cmd = implode(' ', $commandParts);

        $startTime = microtime(true);
        $content = $this->executeShellCommand($cmd);
        $this->log($url . ' ' . (microtime(true) - $startTime));

        return [
            'request' => implode("\r\n", $requestHeaders) . "\r\n\r\n",
            'headers' => '',
            'content' => $content
        ];
    }
2670
2671
    /**
     * Flushes entries that stayed for too long in the queue. These are:
     * - processed entries older than "cleanUpProcessedAge" days
     * - entries scheduled more than "cleanUpScheduledAge" days ago
     * Both ages are taken from the extension settings.
     *
     * @return void
     *
     * TODO: Should be switched back to protected - TNM 2018-11-16
     */
    public function cleanUpOldQueueEntries()
    {
        $secondsPerDay = 86400; // 24*60*60 Seconds in 24 hours
        $now = time();

        $processedBefore = $now - $this->extensionSettings['cleanUpProcessedAge'] * $secondsPerDay;
        $scheduledUntil = $now - $this->extensionSettings['cleanUpScheduledAge'] * $secondsPerDay;

        $this->flushQueue(
            '(exec_time<>0 AND exec_time<' . $processedBefore . ') OR scheduled<=' . $scheduledUntil
        );
    }
2689
2690
    /**
     * Initializes a TypoScript Frontend necessary for using TypoScript and TypoLink functions
     *
     * Starts a time tracker, creates the frontend controller for the given
     * page and type, registers it as $GLOBALS['TSFE'] and runs its
     * initialization steps in the order required below.
     *
     * @param int $id
     * @param int $typeNum
     *
     * @throws \TYPO3\CMS\Core\Error\Http\ServiceUnavailableException
     *
     * @return void
     */
    protected function initTSFE($id = 1, $typeNum = 0)
    {
        EidUtility::initTCA();

        $runsOnVersion7 = VersionNumberUtility::convertVersionNumberToInteger(TYPO3_version) < 8000000;
        if (!$runsOnVersion7 || is_object($GLOBALS['TT'])) {
            $timeTracker = GeneralUtility::makeInstance(TimeTracker::class);
            $timeTracker->start();
        } else {
            // Version 7 without a global time tracker object: register a null tracker
            /** @var NullTimeTracker $GLOBALS['TT'] */
            $GLOBALS['TT'] = new NullTimeTracker();
            $GLOBALS['TT']->start();
        }

        /** @var TypoScriptFrontendController $frontendController */
        $frontendController = GeneralUtility::makeInstance(TypoScriptFrontendController::class, $GLOBALS['TYPO3_CONF_VARS'], $id, $typeNum);
        $GLOBALS['TSFE'] = $frontendController;

        $frontendController->sys_page = GeneralUtility::makeInstance(PageRepository::class);
        $frontendController->sys_page->init(true);
        $frontendController->connectToDB();
        $frontendController->initFEuser();
        $frontendController->determineId();
        $frontendController->initTemplate();
        $frontendController->rootLine = $frontendController->sys_page->getRootLine($id, '');
        $frontendController->getConfigArray();
        PageGenerator::pagegenInit();
    }
2725
2726
    /**
2727
     * Returns an MD5 hash of the serialized configuration array, ignoring its 'paramExpanded' and 'URLs' entries.
2728
     *
2729
     * @param array $configuration
2730
     *
2731
     * @return string
2732
     */
2733 9
    protected function getConfigurationHash(array $configuration)
    {
        // The expanded parameters and the generated URL list are excluded
        // from the hash; every other entry contributes to it.
        $ignoredKeys = ['paramExpanded' => true, 'URLs' => true];
        $hashableConfiguration = array_diff_key($configuration, $ignoredKeys);

        $serialized = serialize($hashableConfiguration);

        return md5($serialized);
    }
2738
2739
    /**
2740
     * Check whether the Crawling Protocol should be http or https
2741
     *
2742
     * @param $crawlerConfiguration
2743
     * @param $pageConfiguration
2744
     *
2745
     * @return bool
2746
     */
2747 6
    protected function isCrawlingProtocolHttps($crawlerConfiguration, $pageConfiguration)
    {
        // Comparisons are intentionally loose (==) and checked in the order
        // -1, 0, 1 so that the outcome matches a switch statement.

        // -1: the crawler configuration forces plain http.
        if ($crawlerConfiguration == -1) {
            return false;
        }

        // 0: defer to the page's own setting, returned unchanged.
        if ($crawlerConfiguration == 0) {
            return $pageConfiguration;
        }

        // 1: the crawler configuration forces https.
        if ($crawlerConfiguration == 1) {
            return true;
        }

        // Any other value falls back to http.
        return false;
    }
2759
}
2760