Completed
Push — test-coverage ( f79ab3 )
by Tomas Norre
06:05
created

CrawlerController::addUrl()   B

Complexity

Conditions 6
Paths 20

Size

Total Lines 87

Duplication

Lines 0
Ratio 0 %

Code Coverage

Tests 43
CRAP Score 6

Importance

Changes 0
Metric Value
cc 6
nc 20
nop 6
dl 0
loc 87
ccs 43
cts 43
cp 1
crap 6
rs 7.6614
c 0
b 0
f 0

How to fix   Long Method   

Long Method

Small methods make your code easier to understand, in particular if combined with a good name. Besides, if your method is small, finding a good name is usually much easier.

For example, if you find yourself adding comments to a method's body, this is usually a good sign to extract the commented part to a new method, and use the comment as a starting point when coming up with a good name for this new method.

Commonly applied refactorings include:

1
<?php
2
namespace AOE\Crawler\Controller;
3
4
/***************************************************************
5
 *  Copyright notice
6
 *
7
 *  (c) 2017 AOE GmbH <[email protected]>
8
 *
9
 *  All rights reserved
10
 *
11
 *  This script is part of the TYPO3 project. The TYPO3 project is
12
 *  free software; you can redistribute it and/or modify
13
 *  it under the terms of the GNU General Public License as published by
14
 *  the Free Software Foundation; either version 3 of the License, or
15
 *  (at your option) any later version.
16
 *
17
 *  The GNU General Public License can be found at
18
 *  http://www.gnu.org/copyleft/gpl.html.
19
 *
20
 *  This script is distributed in the hope that it will be useful,
21
 *  but WITHOUT ANY WARRANTY; without even the implied warranty of
22
 *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
23
 *  GNU General Public License for more details.
24
 *
25
 *  This copyright notice MUST APPEAR in all copies of the script!
26
 ***************************************************************/
27
28
use AOE\Crawler\Command\CrawlerCommandLineController;
29
use AOE\Crawler\Command\FlushCommandLineController;
30
use AOE\Crawler\Command\QueueCommandLineController;
31
use AOE\Crawler\Domain\Model\Configuration;
32
use AOE\Crawler\Domain\Model\Reason;
33
use AOE\Crawler\Domain\Repository\ConfigurationRepository;
34
use AOE\Crawler\Domain\Repository\QueueRepository;
35
use AOE\Crawler\Event\EventDispatcher;
36
use AOE\Crawler\Utility\IconUtility;
37
use AOE\Crawler\Utility\SignalSlotUtility;
38
use TYPO3\CMS\Backend\Utility\BackendUtility;
39
use TYPO3\CMS\Backend\Tree\View\PageTreeView;
40
use TYPO3\CMS\Core\Authentication\BackendUserAuthentication;
41
use TYPO3\CMS\Core\Database\DatabaseConnection;
42
use TYPO3\CMS\Core\Log\LogLevel;
43
use TYPO3\CMS\Core\TimeTracker\NullTimeTracker;
44
use TYPO3\CMS\Core\TimeTracker\TimeTracker;
45
use TYPO3\CMS\Core\Utility\DebugUtility;
46
use TYPO3\CMS\Core\Utility\ExtensionManagementUtility;
47
use TYPO3\CMS\Core\Utility\GeneralUtility;
48
use TYPO3\CMS\Core\Utility\MathUtility;
49
use TYPO3\CMS\Core\Utility\VersionNumberUtility;
50
use TYPO3\CMS\Extbase\Object\ObjectManager;
51
use TYPO3\CMS\Frontend\Controller\TypoScriptFrontendController;
52
use TYPO3\CMS\Frontend\Page\PageGenerator;
53
use TYPO3\CMS\Frontend\Page\PageRepository;
54
use TYPO3\CMS\Frontend\Utility\EidUtility;
55
use TYPO3\CMS\Lang\LanguageService;
56
57
/**
58
 * Class CrawlerController
59
 *
60
 * @package AOE\Crawler\Controller
61
 */
62
class CrawlerController
63
{
64
    const CLI_STATUS_NOTHING_PROCCESSED = 0;
65
    const CLI_STATUS_REMAIN = 1; //queue not empty
66
    const CLI_STATUS_PROCESSED = 2; //(some) queue items where processed
67
    const CLI_STATUS_ABORTED = 4; //instance didn't finish
68
    const CLI_STATUS_POLLABLE_PROCESSED = 8;
69
70
    /**
71
     * @var integer
72
     */
73
    public $setID = 0;
74
75
    /**
76
     * @var string
77
     */
78
    public $processID = '';
79
80
    /**
81
     * One hour is max stalled time for the CLI
82
     * If the process has had the status "start" for 3600 seconds, it is regarded as stalled and a new process is started
83
     *
84
     * @var integer
85
     */
86
    public $max_CLI_exec_time = 3600;
87
88
    /**
89
     * @var array
90
     */
91
    public $duplicateTrack = [];
92
93
    /**
94
     * @var array
95
     */
96
    public $downloadUrls = [];
97
98
    /**
99
     * @var array
100
     */
101
    public $incomingProcInstructions = [];
102
103
    /**
104
     * @var array
105
     */
106
    public $incomingConfigurationSelection = [];
107
108
    /**
109
     * @var bool
110
     */
111
    public $registerQueueEntriesInternallyOnly = false;
112
113
    /**
114
     * @var array
115
     */
116
    public $queueEntries = [];
117
118
    /**
119
     * @var array
120
     */
121
    public $urlList = [];
122
123
    /**
124
     * @var boolean
125
     */
126
    public $debugMode = false;
127
128
    /**
129
     * @var array
130
     */
131
    public $extensionSettings = [];
132
133
    /**
134
     * Mount Point
135
     *
136
     * @var boolean
137
     */
138
    public $MP = false;
139
140
    /**
141
     * @var string
142
     */
143
    protected $processFilename;
144
145
    /**
146
     * Holds the internal access mode; can be 'gui', 'cli' or 'cli_im'
147
     *
148
     * @var string
149
     */
150
    protected $accessMode;
151
152
    /**
153
     * @var DatabaseConnection
154
     */
155
    private $db;
156
157
    /**
158
     * @var BackendUserAuthentication
159
     */
160
    private $backendUser;
161
162
    /**
163
     * @var integer
164
     */
165
    private $scheduledTime = 0;
166
167
    /**
168
     * @var integer
169
     */
170
    private $reqMinute = 0;
171
172
    /**
173
     * @var bool
174
     */
175
    private $submitCrawlUrls = false;
176
177
    /**
178
     * @var bool
179
     */
180
    private $downloadCrawlUrls = false;
181
182
    /**
183
     * @var QueueRepository
184
     */
185
    protected  $queueRepository;
186
187
    /**
188
     * @var ConfigurationRepository
189
     */
190
    protected $configurationRepository;
191
192
    /**
193
     * Method to set the accessMode can be gui, cli or cli_im
194
     *
195
     * @return string
196
     */
197 1
    public function getAccessMode()
198
    {
199 1
        return $this->accessMode;
200
    }
201
202
    /**
     * Stores the access mode for later retrieval via getAccessMode().
     *
     * @param string $accessMode documented values are 'gui', 'cli' or 'cli_im'; the value is stored unchecked
     * @return void
     */
    public function setAccessMode($accessMode)
    {
        $this->accessMode = $accessMode;
    }
209
210
    /**
     * Switches the "disabled" marker file on or off.
     *
     * Disabling writes an empty file to $this->processFilename; enabling removes that file
     * if it is present. getDisabled() reports the state by checking for the same file.
     *
     * @param  bool $disabled (optional, defaults to true)
     * @return void
     */
    public function setDisabled($disabled = true)
    {
        if ($disabled) {
            GeneralUtility::writeFile($this->processFilename, '');

            return;
        }

        if (is_file($this->processFilename)) {
            unlink($this->processFilename);
        }
    }
226
227
    /**
     * Reports whether the "disabled" marker file exists.
     *
     * @return bool true if $this->processFilename is an existing file, false otherwise
     */
    public function getDisabled()
    {
        return is_file($this->processFilename);
    }
240
241
    /**
     * Overrides the path of the marker file used by setDisabled() and getDisabled().
     *
     * @param string $filenameWithPath file name including its path
     *
     * @return void
     */
    public function setProcessFilename($filenameWithPath)
    {
        $this->processFilename = $filenameWithPath;
    }
250
251
    /**
     * Returns the path of the marker file used by setDisabled() and getDisabled().
     *
     * @return string
     */
    public function getProcessFilename()
    {
        return $this->processFilename;
    }
258
259
    /************************************
260
     *
261
     * Getting URLs based on Page TSconfig
262
     *
263
     ************************************/
264
265 43
    /**
     * Wires up the repositories and global services and loads the extension settings.
     *
     * The serialized settings are read from $GLOBALS['TYPO3_CONF_VARS']['EXT']['extConf']['crawler'].
     * "countInARun" falls back to 100 when it does not convert to a positive integer, and
     * "processLimit" is forced into the range 1..99 with 1 as default.
     */
    public function __construct()
    {
        $objectManager = GeneralUtility::makeInstance(ObjectManager::class);
        $this->queueRepository = $objectManager->get(QueueRepository::class);
        $this->configurationRepository = $objectManager->get(ConfigurationRepository::class);

        $this->db = $GLOBALS['TYPO3_DB'];
        $this->backendUser = $GLOBALS['BE_USER'];
        $this->processFilename = PATH_site . 'typo3temp/tx_crawler.proc';

        // A missing configuration entry results in empty settings instead of an undefined-index notice
        $settings = [];
        if (isset($GLOBALS['TYPO3_CONF_VARS']['EXT']['extConf']['crawler'])) {
            $settings = unserialize($GLOBALS['TYPO3_CONF_VARS']['EXT']['extConf']['crawler']);
        }
        $settings = is_array($settings) ? $settings : [];

        // read ext_em_conf_template settings and set
        $this->setExtensionSettings($settings);

        // set defaults (unset keys are treated like the values they would have converted to before):
        $countInARun = isset($this->extensionSettings['countInARun']) ? $this->extensionSettings['countInARun'] : 0;
        if (MathUtility::convertToPositiveInteger($countInARun) == 0) {
            $this->extensionSettings['countInARun'] = 100;
        }

        $processLimit = isset($this->extensionSettings['processLimit']) ? $this->extensionSettings['processLimit'] : 1;
        $this->extensionSettings['processLimit'] = MathUtility::forceIntegerInRange($processLimit, 1, 99, 1);
    }
288
289
    /**
     * Replaces the extension settings held by this controller.
     *
     * The array is the unserialized counterpart of $TYPO3_CONF_VARS['EXT']['extConf']['crawler'];
     * it is stored as given, without applying defaults.
     *
     * @param array $extensionSettings
     * @return void
     */
    public function setExtensionSettings(array $extensionSettings)
    {
        $this->extensionSettings = $extensionSettings;
    }
299
300
    /**
     * Decides whether the given page must be left out of crawling.
     *
     * The checks run in this order and the first match wins:
     * hidden page (unless the "crawlHiddenPages" setting is on), disallowed doktype
     * (3, 4 or >= 199), doktype excluded via EXTCONF "excludeDoktype", veto from an
     * EXTCONF "pageVeto" hook.
     *
     * @param array $pageRow page record; the 'hidden' and 'doktype' columns are read
     * @return false|string false if the page should be crawled, otherwise the skip message
     */
    public function checkIfPageShouldBeSkipped(array $pageRow)
    {
        // hidden pages are only crawled when the extension setting allows it
        if (!$this->extensionSettings['crawlHiddenPages'] && $pageRow['hidden']) {
            return 'Because page is hidden';
        }

        if (GeneralUtility::inList('3,4', $pageRow['doktype']) || $pageRow['doktype'] >= 199) {
            return 'Because doktype is not allowed';
        }

        $excludedDoktypes = $GLOBALS['TYPO3_CONF_VARS']['EXTCONF']['crawler']['excludeDoktype'];
        if (is_array($excludedDoktypes)) {
            foreach ($excludedDoktypes as $listName => $doktypeList) {
                if (GeneralUtility::inList($doktypeList, $pageRow['doktype'])) {
                    return 'Doktype was excluded by "' . $listName . '"';
                }
            }
        }

        // veto hook
        $vetoHooks = $GLOBALS['TYPO3_CONF_VARS']['EXTCONF']['crawler']['pageVeto'];
        if (is_array($vetoHooks)) {
            foreach ($vetoHooks as $hookName => $hookFunction) {
                $hookParameters = [
                    'pageRow' => $pageRow,
                ];
                // expects "false" if page is ok and "true" or a skipMessage if this page should _not_ be crawled
                $veto = GeneralUtility::callUserFunction($hookFunction, $hookParameters, $this);
                if ($veto === false) {
                    continue;
                }

                // the first veto wins, remaining hooks are not executed
                return is_string($veto) ? $veto : 'Veto from hook "' . htmlspecialchars($hookName) . '"';
            }
        }

        return false;
    }
363
364
    /**
     * Wrapper method for getUrlsForPageId()
     * It returns an array of configurations and no urls!
     *
     * The page is first run through checkIfPageShouldBeSkipped(); a skipped page yields an
     * empty array and its skip message is written to $skipMessage.
     *
     * @param array $pageRow Page record with at least dok-type and uid columns.
     * @param string $skipMessage passed by reference; set to the skip reason, or '' if the page is crawled
     * @return array
     * @see getUrlsForPageId()
     */
    public function getUrlsForPageRow(array $pageRow, &$skipMessage = '')
    {
        $message = $this->checkIfPageShouldBeSkipped($pageRow);

        if ($message !== false) {
            $skipMessage = $message;

            return [];
        }

        // url_scheme 2 requests https. Rows fetched from the database usually carry numeric
        // columns as strings, so the value is normalised before the strict comparison.
        $forceSsl = isset($pageRow['url_scheme']) && (int)$pageRow['url_scheme'] === 2;
        $res = $this->getUrlsForPageId($pageRow['uid'], $forceSsl);
        $skipMessage = '';

        return $res;
    }
388
389
    /**
     * Tells whether the queue holds NO unprocessed entry (exec_time=0) for the given
     * page_id and configuration hash.
     * If there is none, the caller can skip the more detailed per-URL check.
     *
     * @param  int $uid page uid, cast to integer before it enters the query
     * @param  string $configurationHash
     * @return boolean true if the count of matching rows equals 0
     */
    protected function noUnprocessedQueueEntriesForPageWithConfigurationHashExist($uid, $configurationHash)
    {
        $queueTable = 'tx_crawler_queue';
        $quotedHash = $this->db->fullQuoteStr($configurationHash, $queueTable);
        $whereClause = 'page_id=' . intval($uid) . ' AND configuration_hash=' . $quotedHash . ' AND exec_time=0';

        $result = $this->db->exec_SELECTquery('count(*) as anz', $queueTable, $whereClause);
        $countRow = $this->db->sql_fetch_assoc($result);

        return $countRow['anz'] == 0;
    }
406
407
    /**
     * Creates a list of URLs from input array (and submits them to queue if asked for)
     * See Web > Info module script + "indexed_search"'s crawler hook-client using this!
     *
     * NOTE(review): the returned string is re-assigned for every URL, so it only holds the
     * markup of the last processed URL; $this->urlList collects all of them. This looks
     * unintended - confirm with the callers before changing it.
     *
     * @param array $vv Information about URLs from pageRow to crawl (the 'URLs' and 'subCfg' entries are read)
     * @param array $pageRow Page row (the 'uid' column is read)
     * @param integer $scheduledTime Unix time to schedule indexing to, typically time()
     * @param integer $reqMinute Number of requests per minute (creates the interleave between requests); 0 disables the interleave
     * @param boolean $submitCrawlUrls If set, submits the URLs to queue
     * @param boolean $downloadCrawlUrls If set (and $submitCrawlUrls is false) will fill $downloadUrls with entries
     * @param array $duplicateTrack Passed by reference; contains one key per URL to make sure duplicates are not crawled
     * @param array $downloadUrls Passed by reference; filled with URLs for download if the flag is set
     * @param array $incomingProcInstructions Array of processing instructions
     * @return string List of URLs (meant for display in backend module)
     */
    public function urlListFromUrlArray(
        array $vv,
        array $pageRow,
        $scheduledTime,
        $reqMinute,
        $submitCrawlUrls,
        $downloadCrawlUrls,
        array &$duplicateTrack,
        array &$downloadUrls,
        array $incomingProcInstructions
    ) {
        $urlList = '';

        // Evaluated once, so that $urlObj is set up on exactly the paths that use it further down
        $useRealUrl = ExtensionManagementUtility::isLoaded('realurl') && $vv['subCfg']['realurl'];
        $urlObj = null;

        // realurl support (thanks to Ingo Renner)
        if ($useRealUrl) {
            /** @var tx_realurl $urlObj */
            $urlObj = GeneralUtility::makeInstance('tx_realurl');

            if (!empty($vv['subCfg']['baseUrl'])) {
                $urlParts = parse_url($vv['subCfg']['baseUrl']);
                $host = strtolower($urlParts['host']);
                $urlObj->host = $host;

                // First pass, finding configuration OR pointer string:
                $urlObj->extConf = isset($GLOBALS['TYPO3_CONF_VARS']['EXTCONF']['realurl'][$urlObj->host]) ? $GLOBALS['TYPO3_CONF_VARS']['EXTCONF']['realurl'][$urlObj->host] : $GLOBALS['TYPO3_CONF_VARS']['EXTCONF']['realurl']['_DEFAULT'];

                // If it turned out to be a string pointer, then look up the real config:
                if (is_string($urlObj->extConf)) {
                    $urlObj->extConf = is_array($GLOBALS['TYPO3_CONF_VARS']['EXTCONF']['realurl'][$urlObj->extConf]) ? $GLOBALS['TYPO3_CONF_VARS']['EXTCONF']['realurl'][$urlObj->extConf] : $GLOBALS['TYPO3_CONF_VARS']['EXTCONF']['realurl']['_DEFAULT'];
                }
            }

            if (!$GLOBALS['TSFE']->sys_page) {
                $GLOBALS['TSFE']->sys_page = GeneralUtility::makeInstance('TYPO3\CMS\Frontend\Page\PageRepository');
            }

            if (!$GLOBALS['TSFE']->tmpl->rootLine[0]['uid']) {
                $GLOBALS['TSFE']->tmpl->rootLine[0]['uid'] = $urlObj->extConf['pagePath']['rootpage_id'];
            }
        }

        if (!is_array($vv['URLs'])) {
            return 'ERROR - no URL generated';
        }

        $configurationHash = $this->getConfigurationHash($vv);
        $skipInnerCheck = $this->noUnprocessedQueueEntriesForPageWithConfigurationHashExist($pageRow['uid'], $configurationHash);

        // Seconds between two requests; a rate of 0 means "no interleave" instead of a division by zero
        $secondsPerRequest = ((float)$reqMinute == 0.0) ? 0 : 60 / $reqMinute;

        foreach ($vv['URLs'] as $urlQuery) {
            if (!$this->drawURLs_PIfilter($vv['subCfg']['procInstrFilter'], $incomingProcInstructions)) {
                continue;
            }

            // Calculate cHash:
            if ($vv['subCfg']['cHash']) {
                /* @var $cacheHash \TYPO3\CMS\Frontend\Page\CacheHashCalculator */
                $cacheHash = GeneralUtility::makeInstance('TYPO3\CMS\Frontend\Page\CacheHashCalculator');
                $urlQuery .= '&cHash=' . $cacheHash->generateForParameters($urlQuery);
            }

            // Create key by which to determine unique-ness:
            $uKey = $urlQuery . '|' . $vv['subCfg']['userGroups'] . '|' . $vv['subCfg']['baseUrl'] . '|' . $vv['subCfg']['procInstrFilter'];

            // realurl support (thanks to Ingo Renner)
            $urlQuery = 'index.php' . $urlQuery;
            if ($useRealUrl) {
                $params = [
                    'LD' => [
                        'totalURL' => $urlQuery,
                    ],
                    'TCEmainHook' => true,
                ];
                $urlObj->encodeSpURL($params);
                $urlQuery = $params['LD']['totalURL'];
            }

            // Scheduled time, rounded down to the full minute:
            $schTime = $scheduledTime + round(count($duplicateTrack) * $secondsPerRequest);
            $schTime = floor($schTime / 60) * 60;

            if (isset($duplicateTrack[$uKey])) {
                // if the url key is already registered just display it and do not resubmit it
                $urlList = '<em><span class="typo3-dimmed">' . htmlspecialchars($urlQuery) . '</span></em><br/>';
            } else {
                $urlList = '[' . date('d.m.y H:i', $schTime) . '] ' . htmlspecialchars($urlQuery);
                $this->urlList[] = '[' . date('d.m.y H:i', $schTime) . '] ' . $urlQuery;

                $theUrl = ($vv['subCfg']['baseUrl'] ? $vv['subCfg']['baseUrl'] : GeneralUtility::getIndpEnv('TYPO3_SITE_URL')) . $urlQuery;

                // Submit for crawling!
                if ($submitCrawlUrls) {
                    $added = $this->addUrl(
                        $pageRow['uid'],
                        $theUrl,
                        $vv['subCfg'],
                        $scheduledTime,
                        $configurationHash,
                        $skipInnerCheck
                    );
                    if ($added === false) {
                        $urlList .= ' (Url already existed)';
                    }
                } elseif ($downloadCrawlUrls) {
                    $downloadUrls[$theUrl] = $theUrl;
                }

                $urlList .= '<br />';
            }
            $duplicateTrack[$uKey] = true;
        }

        return $urlList;
    }
536
537
    /**
     * Returns true if input processing instruction is among registered ones.
     *
     * An empty $incomingProcInstructions array acts as "no filter" and always yields true.
     *
     * @param string $piString comma separated list of processing instructions to test against
     * @param array $incomingProcInstructions Processing instructions
     * @return boolean true if no instructions were given or one of them is contained in $piString, false otherwise
     */
    public function drawURLs_PIfilter($piString, array $incomingProcInstructions)
    {
        if (empty($incomingProcInstructions)) {
            return true;
        }

        foreach ($incomingProcInstructions as $pi) {
            if (GeneralUtility::inList($piString, $pi)) {
                return true;
            }
        }

        // Explicit result instead of falling off the end (which returned null)
        return false;
    }
556
557 5
    /**
     * Loads the page TSconfig for a page and lets registered hooks alter it.
     *
     * When $this->MP is set, the TSconfig is loaded for the second "-"-separated part of
     * $this->MP instead of $id. Hooks registered under EXTCONF crawler "getPageTSconfigForId"
     * receive the original $id and a reference to the loaded configuration.
     *
     * @param integer $id Page ID
     * @return array presumably the array returned by BackendUtility::getPagesTSconfig(), possibly altered by hooks
     */
    public function getPageTSconfigForId($id)
    {
        $tsConfigPageId = $id;
        if ($this->MP) {
            $mountPointParts = explode('-', $this->MP);
            $tsConfigPageId = $mountPointParts[1];
        }
        $pageTSconfig = BackendUtility::getPagesTSconfig($tsConfigPageId);

        // Call a hook to alter configuration
        $hooks = $GLOBALS['TYPO3_CONF_VARS']['EXTCONF']['crawler']['getPageTSconfigForId'];
        if (is_array($hooks)) {
            $params = [
                'pageId' => $id,
                'pageTSConfig' => &$pageTSconfig,
            ];
            foreach ($hooks as $userFunc) {
                GeneralUtility::callUserFunction($userFunc, $params, $this);
            }
        }

        return $pageTSconfig;
    }
579
580
    /**
     * Collects the crawler configurations that apply to a page.
     * It returns an array of configurations and no urls!
     *
     * Two sources are merged, keyed by configuration name:
     * 1. page TSconfig (tx_crawler.crawlerCfg.paramSets), marked with origin "pagets"
     * 2. configuration records found along the rootline of the page; these never overwrite
     *    an entry with the same name that was already collected
     * Finally the EXTCONF crawler "processUrls" hooks may alter the result by reference.
     *
     * @param integer $id Page ID
     * @param bool $forceSsl Use https
     * @return array
     *
     * TODO: Should be switched back to protected - TNM 2018-11-16
     */
    public function getUrlsForPageId($id, $forceSsl = false)
    {
        // Get page TSconfig for page ID:
        $pageTSconfig = $this->getPageTSconfigForId($id);

        $res = [];

        /**
         * Get configuration from tsConfig
         */
        if (is_array($pageTSconfig) && is_array($pageTSconfig['tx_crawler.']['crawlerCfg.'])) {
            $crawlerCfg = $pageTSconfig['tx_crawler.']['crawlerCfg.'];
            $paramSets = is_array($crawlerCfg['paramSets.']) ? $crawlerCfg['paramSets.'] : [];

            foreach ($paramSets as $rawKey => $values) {
                if (!is_array($values)) {
                    continue;
                }

                $key = str_replace('.', '', $rawKey);
                // Sub configuration for a single configuration string:
                $subCfg = (array)$paramSets[$key . '.'];
                $subCfg['key'] = $key;

                if (strcmp($subCfg['procInstrFilter'], '')) {
                    $subCfg['procInstrFilter'] = implode(',', GeneralUtility::trimExplode(',', $subCfg['procInstrFilter']));
                }
                $pidOnlyList = implode(',', GeneralUtility::trimExplode(',', $subCfg['pidsOnly'], true));

                // process configuration only if it is not page-specific or if the specific page is the current page:
                $appliesToPage = !strcmp($subCfg['pidsOnly'], '') || GeneralUtility::inList($pidOnlyList, $id);
                if (!$appliesToPage) {
                    continue;
                }

                // add trailing slash if not present
                if (!empty($subCfg['baseUrl']) && substr($subCfg['baseUrl'], -1) != '/') {
                    $subCfg['baseUrl'] .= '/';
                }

                // Explode, process etc.:
                $paramParsed = $this->parseParams($paramSets[$key]);
                $paramExpanded = $this->expandParameters($paramParsed, $id);

                // recognize MP value
                $urlPrefix = '?id=' . $id;
                if ($this->MP) {
                    $urlPrefix .= '&MP=' . $this->MP;
                }

                $res[$key] = [
                    'subCfg' => $subCfg,
                    'paramParsed' => $paramParsed,
                    'paramExpanded' => $paramExpanded,
                    'origin' => 'pagets',
                    'URLs' => $this->compileUrls($paramExpanded, [$urlPrefix]),
                ];
            }
        }

        /**
         * Get configuration from tx_crawler_configuration records
         */

        // get records along the rootline
        $rootLine = BackendUtility::BEgetRootLine($id);

        foreach ($rootLine as $page) {
            $configurationRecordsForCurrentPage = $this->configurationRepository->getConfigurationRecordsPageUid($page['uid'])->toArray();

            if (!is_array($configurationRecordsForCurrentPage)) {
                continue;
            }

            /** @var Configuration $configurationRecord */
            foreach ($configurationRecordsForCurrentPage as $configurationRecord) {
                // check access to the configuration record
                $hasAccess = empty($configurationRecord->getBeGroups())
                    || $GLOBALS['BE_USER']->isAdmin()
                    || $this->hasGroupAccess($GLOBALS['BE_USER']->user['usergroup_cached_list'], $configurationRecord->getBeGroups());
                if (!$hasAccess) {
                    continue;
                }

                $pidOnlyList = implode(',', GeneralUtility::trimExplode(',', $configurationRecord->getPidsOnly(), true));

                // skip configurations that are page-specific but do not list the current page:
                if (strcmp($configurationRecord->getPidsOnly(), '') && !GeneralUtility::inList($pidOnlyList, $id)) {
                    continue;
                }

                $key = $configurationRecord->getName();

                // don't overwrite previously defined paramSets
                if (isset($res[$key])) {
                    continue;
                }

                /* @var $TSparserObject \TYPO3\CMS\Core\TypoScript\Parser\TypoScriptParser */
                $TSparserObject = GeneralUtility::makeInstance('TYPO3\CMS\Core\TypoScript\Parser\TypoScriptParser');
                // Todo: Check where the field processing_instructions_parameters_ts comes from.
                $TSparserObject->parse($configurationRecord->getProcessingInstructionFilter()); //['processing_instruction_parameters_ts']);

                $isCrawlingProtocolHttps = $this->isCrawlingProtocolHttps($configurationRecord->isForceSsl(), $forceSsl);

                $subCfg = [
                    'procInstrFilter' => $configurationRecord->getProcessingInstructionFilter(),
                    'procInstrParams.' => $TSparserObject->setup,
                    'baseUrl' => $this->getBaseUrlForConfigurationRecord(
                        $configurationRecord->getBaseUrl(),
                        $configurationRecord->getSysDomainBaseUrl(),
                        $isCrawlingProtocolHttps
                    ),
                    'realurl' => $configurationRecord->getRealUrl(),
                    'cHash' => $configurationRecord->getCHash(),
                    'userGroups' => $configurationRecord->getFeGroups(),
                    'exclude' => $configurationRecord->getExcludeText(),
                    'rootTemplatePid' => (int) $configurationRecord->getRootTemplatePid(),
                    'key' => $key,
                ];

                // add trailing slash if not present
                if (!empty($subCfg['baseUrl']) && substr($subCfg['baseUrl'], -1) != '/') {
                    $subCfg['baseUrl'] .= '/';
                }

                // pages listed in the exclude string of the record are left out
                if (in_array($id, $this->expandExcludeString($subCfg['exclude']))) {
                    continue;
                }

                $paramParsed = $this->parseParams($configurationRecord->getConfiguration());
                $paramExpanded = $this->expandParameters($paramParsed, $id);

                $res[$key] = [
                    'subCfg' => $subCfg,
                    'paramParsed' => $paramParsed,
                    'paramExpanded' => $paramExpanded,
                    'URLs' => $this->compileUrls($paramExpanded, ['?id=' . $id]),
                    'origin' => 'tx_crawler_configuration_' . $configurationRecord->getUid(),
                ];
            }
        }

        if (is_array($GLOBALS['TYPO3_CONF_VARS']['EXTCONF']['crawler']['processUrls'])) {
            foreach ($GLOBALS['TYPO3_CONF_VARS']['EXTCONF']['crawler']['processUrls'] as $func) {
                $params = [
                    'res' => &$res,
                ];
                GeneralUtility::callUserFunction($func, $params, $this);
            }
        }

        return $res;
    }
723
724
    /**
725
     * Checks if a domain record exist and returns the base-url based on the record. If not the given baseUrl string is used.
726
     *
727
     * @param string $baseUrl
728
     * @param integer $sysDomainUid
729
     * @param bool $ssl
730
     * @return string
731
     */
732 4
    protected function getBaseUrlForConfigurationRecord($baseUrl, $sysDomainUid, $ssl = false)
    {
        $sysDomainUid = intval($sysDomainUid);
        // Only a strict boolean false selects "http"; any other value selects "https".
        $urlScheme = ($ssl === false) ? 'http' : 'https';

        if ($sysDomainUid > 0) {
            $res = $this->db->exec_SELECTquery(
                '*',
                'sys_domain',
                'uid = ' . $sysDomainUid .
                BackendUtility::BEenableFields('sys_domain') .
                BackendUtility::deleteClause('sys_domain')
            );

            // Skip fetching when the query did not produce a result handle.
            if ($res) {
                $row = $this->db->sql_fetch_assoc($res);
                $this->db->sql_free_result($res);

                // When no record matched, $row is not an array and must not be indexed.
                if (is_array($row) && isset($row['domainName']) && $row['domainName'] != '') {
                    return $urlScheme . '://' . $row['domainName'];
                }
            }
        }

        // No usable domain record: fall back to the base URL given by the caller.
        return $baseUrl;
    }
752
753 1
    /**
     * Collects the names of the crawler configurations that apply to a page branch.
     *
     * Names come from two sources:
     * - the keys of "tx_crawler.crawlerCfg.paramSets." in the page TSconfig of $rootid
     * - the "name" field of tx_crawler_configuration records stored on the pages returned by
     *   BackendUtility::BEgetRootLine($rootid) or by PageTreeView::getTree($rootid, $depth)
     *
     * @param integer $rootid Page ID the branch starts at
     * @param integer $depth Depth passed on to PageTreeView::getTree()
     * @return array List of configuration names
     */
    public function getConfigurationsForBranch($rootid, $depth)
    {
        $configurationsForBranch = [];

        $pageTSconfig = $this->getPageTSconfigForId($rootid);
        if (is_array($pageTSconfig)
            && isset($pageTSconfig['tx_crawler.']['crawlerCfg.']['paramSets.'])
            && is_array($pageTSconfig['tx_crawler.']['crawlerCfg.']['paramSets.'])
        ) {
            foreach ($pageTSconfig['tx_crawler.']['crawlerCfg.']['paramSets.'] as $key => $value) {
                // Only array values (sub-configurations) describe a parameter set.
                if (!is_array($value)) {
                    continue;
                }
                // Strip the trailing dot of the TypoScript key to get the plain name.
                $configurationsForBranch[] = substr($key, -1) == '.' ? substr($key, 0, -1) : $key;
            }
        }

        $pids = [];
        $rootLine = BackendUtility::BEgetRootLine($rootid);
        foreach ($rootLine as $node) {
            $pids[] = intval($node['uid']);
        }

        /* @var PageTreeView $tree */
        $tree = GeneralUtility::makeInstance(PageTreeView::class);
        $perms_clause = $GLOBALS['BE_USER']->getPagePermsClause(1);
        $tree->init('AND ' . $perms_clause);
        $tree->getTree($rootid, $depth, '');
        foreach ($tree->tree as $node) {
            $pids[] = intval($node['row']['uid']);
        }

        // "pid IN ()" is not valid SQL, so stop here when no page was collected.
        $pids = array_unique($pids);
        if (empty($pids)) {
            return $configurationsForBranch;
        }

        $res = $this->db->exec_SELECTquery(
            '*',
            'tx_crawler_configuration',
            'pid IN (' . implode(',', $pids) . ') ' .
            BackendUtility::BEenableFields('tx_crawler_configuration') .
            BackendUtility::deleteClause('tx_crawler_configuration') . ' ' .
            BackendUtility::versioningPlaceholderClause('tx_crawler_configuration') . ' '
        );

        if ($res) {
            while ($row = $this->db->sql_fetch_assoc($res)) {
                $configurationsForBranch[] = $row['name'];
            }
            $this->db->sql_free_result($res);
        }

        return $configurationsForBranch;
    }
798
799
    /**
800
     * Check if a user has access to an item
801
     * (e.g. get the group list of the current logged in user from $GLOBALS['TSFE']->gr_list)
802
     *
803
     * @see \TYPO3\CMS\Frontend\Page\PageRepository::getMultipleGroupsWhereClause()
804
     * @param  string $groupList    Comma-separated list of (fe_)group UIDs from a user
805
     * @param  string $accessList   Comma-separated list of (fe_)group UIDs of the item to access
806
     * @return bool                 TRUE if at least one of the users group UIDs is in the access list or the access list is empty
807
     */
808 3
    public function hasGroupAccess($groupList, $accessList)
    {
        // An empty access list does not restrict the item at all.
        if (empty($accessList)) {
            return true;
        }

        $accessGranted = false;
        $userGroupUids = GeneralUtility::intExplode(',', $groupList);
        foreach ($userGroupUids as $candidateUid) {
            // The first group of the user that appears in the access list is sufficient.
            if (GeneralUtility::inList($accessList, $candidateUid)) {
                $accessGranted = true;
                break;
            }
        }

        return $accessGranted;
    }
820
821
    /**
822
     * Parse GET vars of input Query into array with key=>value pairs
823
     *
824
     * @param string $inputQuery Input query string
825
     * @return array
826
     */
827 7
    public function parseParams($inputQuery)
    {
        // Extract all GET parameters into an ARRAY:
        $paramKeyValues = [];

        foreach (explode('&', (string)$inputQuery) as $paramAndValue) {
            // A part without "=" (e.g. "flag") has no value element; pad it so that
            // the value becomes an empty string instead of an undefined offset.
            list($name, $value) = array_pad(explode('=', $paramAndValue, 2), 2, '');

            // Parts with an empty name (e.g. from "&&" or "=x") are dropped.
            if (strlen($name)) {
                $paramKeyValues[rawurldecode($name)] = rawurldecode($value);
            }
        }

        return $paramKeyValues;
    }
842
843
    /**
844
     * Will expand the parameters configuration to individual values. This follows a certain syntax of the value of each parameter.
845
     * Syntax of values:
846
     * - Basically: If the value is wrapped in [...] it will be expanded according to the following syntax, otherwise the value is taken literally
847
     * - Configuration is split by "|" and the parts are processed individually and finally added together
848
     * - For each configuration part:
849
     *         - "[int]-[int]" = Integer range, will be expanded to all values in between, values included, starting from low to high (max. 1000). Example "1-34" or "-40--30"
850
     *         - "_TABLE:[TCA table name];[_PID:[optional page id, default is current page]];[_ENABLELANG:1]" = Look up of table records from PID, filtering out deleted records. Example "_TABLE:tt_content; _PID:123"
851
     *        _ENABLELANG:1 picks only original records without their language overlays
852
     *         - Default: Literal value
853
     *
854
     * @param array $paramArray Array with key (GET var name) and values (value of GET var which is configuration for expansion)
855
     * @param integer $pid Current page ID
856
     * @return array
857
     */
858 4
    public function expandParameters($paramArray, $pid)
    {
        global $TCA;

        // Traverse parameter names:
        foreach ($paramArray as $p => $v) {
            $v = trim($v);

            // If value is encapsulated in square brackets it means there are some ranges of values to find, otherwise the value is literal
            if (substr($v, 0, 1) === '[' && substr($v, -1) === ']') {
                // So, find the value inside brackets and reset the paramArray value as an array.
                $v = substr($v, 1, -1);
                $paramArray[$p] = [];

                // Explode parts and traverse them:
                $parts = explode('|', $v);
                foreach ($parts as $pV) {

                    // Look for integer range: (fx. 1-34 or -40--30 // reads minus 40 to minus 30)
                    if (preg_match('/^(-?[0-9]+)\s*-\s*(-?[0-9]+)$/', trim($pV), $reg)) {

                        // Swap if first is larger than last:
                        if ($reg[1] > $reg[2]) {
                            $temp = $reg[2];
                            $reg[2] = $reg[1];
                            $reg[1] = $temp;
                        }

                        // Traverse range, add values:
                        $runAwayBrake = 1000; // Limit to size of range!
                        for ($a = $reg[1]; $a <= $reg[2]; $a++) {
                            $paramArray[$p][] = $a;
                            $runAwayBrake--;
                            if ($runAwayBrake <= 0) {
                                break;
                            }
                        }
                    } elseif (substr(trim($pV), 0, 7) == '_TABLE:') {

                        // Parse parameters ("_KEY:value" pairs separated by ";"):
                        $subparts = GeneralUtility::trimExplode(';', $pV);
                        $subpartParams = [];
                        foreach ($subparts as $spV) {
                            // A sub-part without ":" has no value; pad with null instead of reading an undefined offset.
                            list($pKey, $pVal) = array_pad(GeneralUtility::trimExplode(':', $spV), 2, null);
                            $subpartParams[$pKey] = $pVal;
                        }

                        // Table exists:
                        if (isset($subpartParams['_TABLE']) && isset($TCA[$subpartParams['_TABLE']])) {
                            $tableName = $subpartParams['_TABLE'];
                            $lookUpPid = isset($subpartParams['_PID']) ? intval($subpartParams['_PID']) : $pid;
                            $pidField = isset($subpartParams['_PIDFIELD']) ? trim($subpartParams['_PIDFIELD']) : 'pid';
                            // NOTE(review): _WHERE and _ADDTABLE are concatenated into the SQL statement
                            // unescaped; they must only ever come from trusted configuration.
                            $where = isset($subpartParams['_WHERE']) ? $subpartParams['_WHERE'] : '';
                            $addTable = isset($subpartParams['_ADDTABLE']) ? $subpartParams['_ADDTABLE'] : '';

                            $fieldName = !empty($subpartParams['_FIELD']) ? $subpartParams['_FIELD'] : 'uid';
                            if ($fieldName === 'uid' || !empty($TCA[$tableName]['columns'][$fieldName])) {
                                $andWhereLanguage = '';
                                $transOrigPointerField = isset($TCA[$tableName]['ctrl']['transOrigPointerField'])
                                    ? $TCA[$tableName]['ctrl']['transOrigPointerField']
                                    : '';

                                // _ENABLELANG restricts the lookup to records whose translation pointer is <= 0.
                                if (!empty($subpartParams['_ENABLELANG']) && $transOrigPointerField) {
                                    $andWhereLanguage = ' AND ' . $this->db->quoteStr($transOrigPointerField, $tableName) . ' <= 0 ';
                                }

                                $where = $this->db->quoteStr($pidField, $tableName) . '=' . intval($lookUpPid) . ' ' .
                                    $andWhereLanguage . $where;

                                // Rows are indexed by $fieldName, so the array keys are the values to expand to.
                                $rows = $this->db->exec_SELECTgetRows(
                                    $fieldName,
                                    $tableName . $addTable,
                                    $where . BackendUtility::deleteClause($tableName),
                                    '',
                                    '',
                                    '',
                                    $fieldName
                                );

                                if (is_array($rows)) {
                                    $paramArray[$p] = array_merge($paramArray[$p], array_keys($rows));
                                }
                            }
                        }
                    } else { // Just add value:
                        $paramArray[$p][] = $pV;
                    }

                    // Hook for processing own expandParameters place holder
                    if (isset($GLOBALS['TYPO3_CONF_VARS']['SC_OPTIONS']['crawler/class.tx_crawler_lib.php']['expandParameters'])
                        && is_array($GLOBALS['TYPO3_CONF_VARS']['SC_OPTIONS']['crawler/class.tx_crawler_lib.php']['expandParameters'])
                    ) {
                        $_params = [
                            'pObj' => &$this,
                            'paramArray' => &$paramArray,
                            'currentKey' => $p,
                            'currentValue' => $pV,
                            'pid' => $pid,
                        ];
                        foreach ($GLOBALS['TYPO3_CONF_VARS']['SC_OPTIONS']['crawler/class.tx_crawler_lib.php']['expandParameters'] as $_funcRef) {
                            GeneralUtility::callUserFunction($_funcRef, $_params, $this);
                        }
                    }
                }

                // Make unique set of values; note that ksort() orders the outer
                // parameter array by parameter name, not the value list itself.
                $paramArray[$p] = array_unique($paramArray[$p]);
                ksort($paramArray);
            } else {
                // Set the literal value as only value in array:
                $paramArray[$p] = [$v];
            }
        }

        return $paramArray;
    }
968
969
    /**
970
     * Compiling URLs from parameter array (output of expandParameters())
971
     * The number of URLs will be the multiplication of the number of parameter values for each key
972
     *
973
     * @param array $paramArray Output of expandParameters(): Array with keys (GET var names) and for each an array of values
974
     * @param array $urls URLs accumulated in this array (for recursion)
975
     * @return array
976
     */
977 7
    public function compileUrls($paramArray, $urls = [])
    {
        // Nothing (left) to combine, or no URL list to extend: return the input unchanged.
        if (!is_array($paramArray) || !count($paramArray) || !is_array($urls)) {
            return $urls;
        }

        // The limit does not change while compiling, so resolve it once.
        $maxUrls = MathUtility::forceIntegerInRange($this->extensionSettings['maxCompileUrls'], 1, 1000000000, 10000);

        // Shift first parameter off the stack:
        reset($paramArray);
        $varName = key($paramArray);
        $valueSet = array_shift($paramArray);

        // Combine every URL collected so far with every value of the current parameter:
        $newUrls = [];
        foreach ($urls as $url) {
            foreach ($valueSet as $val) {
                // Empty values leave the URL untouched instead of adding "&name=".
                $newUrls[] = $url . (strcmp($val, '') ? '&' . rawurlencode($varName) . '=' . rawurlencode($val) : '');

                // Leave BOTH loops once the limit is reached; breaking only the inner
                // loop would let every remaining URL add further entries.
                if (count($newUrls) >= $maxUrls) {
                    break 2;
                }
            }
        }

        // Continue with the remaining parameters.
        return $this->compileUrls($paramArray, $newUrls);
    }
1002
1003
    /************************************
1004
     *
1005
     * Crawler log
1006
     *
1007
     ************************************/
1008
1009
    /**
1010
     * Return array of records from crawler queue for input page ID
1011
     *
1012
     * @param integer $id Page ID for which to look up log entries.
1013
     * @param string $filter Filter: "all" => all entries, "pending" => all that is not yet run, "finished" => all complete ones
1014
     * @param boolean $doFlush If TRUE, the selected entries are DELETED(!) instead of returned!
1015
     * @param boolean $doFullFlush If TRUE (together with $doFlush), entries of all pages are deleted, still restricted by $filter
1016
     * @param integer $itemsPerPage Limit the amount of entries per page default is 10
1017
     * @return array
1018
     */
1019 4
    public function getLogEntriesForPageId($id, $filter = '', $doFlush = false, $doFullFlush = false, $itemsPerPage = 10)
    {
        // Translate the filter keyword into an additional condition on exec_time.
        if ($filter == 'pending') {
            $addWhere = ' AND exec_time=0';
        } elseif ($filter == 'finished') {
            $addWhere = ' AND exec_time>0';
        } else {
            $addWhere = '';
        }

        $pageId = intval($id);

        if (!$doFlush) {
            // Read mode: newest scheduled entries first, optionally limited.
            $limit = intval($itemsPerPage);
            return $this->db->exec_SELECTgetRows(
                '*',
                'tx_crawler_queue',
                'page_id=' . $pageId . $addWhere,
                '',
                'scheduled DESC',
                ($limit > 0 ? $limit : '')
            );
        }

        // FIXME: Write unit test that ensures that the right records are deleted.
        // Flush mode: a full flush drops the page restriction but keeps the filter.
        $flushScope = $doFullFlush ? '1=1' : ('page_id=' . $pageId);
        $this->flushQueue($flushScope . $addWhere);

        return [];
    }
1048
1049
    /**
1050
     * Return array of records from crawler queue for input set ID
1051
     *
1052
     * @param integer $set_id Set ID for which to look up log entries.
1053
     * @param string $filter Filter: "all" => all entries, "pending" => all that is not yet run, "finished" => all complete ones
1054
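     * @param boolean $doFlush If TRUE, the selected entries are DELETED(!) instead of returned!
     * @param boolean $doFullFlush If TRUE (together with $doFlush), the whole queue is deleted, regardless of set ID and $filter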
1055
     * @param integer $itemsPerPage Limit the amount of entries per page default is 10
1056
     * @return array
1057
     */
1058 6
    public function getLogEntriesForSetId($set_id, $filter = '', $doFlush = false, $doFullFlush = false, $itemsPerPage = 10)
    {
        // FIXME: Write Unit tests for Filters
        // Translate the filter keyword into an additional condition on exec_time.
        if ($filter == 'pending') {
            $addWhere = ' AND exec_time=0';
        } elseif ($filter == 'finished') {
            $addWhere = ' AND exec_time>0';
        } else {
            $addWhere = '';
        }

        $setId = intval($set_id);

        if (!$doFlush) {
            // Read mode: newest scheduled entries first, optionally limited.
            $limit = intval($itemsPerPage);
            return $this->db->exec_SELECTgetRows(
                '*',
                'tx_crawler_queue',
                'set_id=' . $setId . $addWhere,
                '',
                'scheduled DESC',
                ($limit > 0 ? $limit : '')
            );
        }

        // FIXME: Write unit test that ensures that the right records are deleted.
        // Flush mode: a full flush passes an empty condition, i.e. neither the
        // set ID nor the filter restricts what is removed.
        $flushWhere = $doFullFlush ? '' : ('set_id=' . $setId . $addWhere);
        $this->flushQueue($flushWhere);

        return [];
    }
1087
1088
    /**
1089
     * Removes queue entries
1090
     *
1091
     * @param string $where SQL related filter for the entries which should be removed
1092
     * @return void
1093
     */
1094 10
    protected function flushQueue($where = '')
    {
        // An empty condition means "everything".
        $realWhere = strlen($where) > 0 ? $where : '1=1';

        // The event dispatcher is deprecated since crawler v6.4.0, will be removed in crawler v7.0.0.
        // Please use the Signal instead.
        $hasObserver = EventDispatcher::getInstance()->hasObserver('queueEntryFlush');
        $hasSignal = SignalSlotUtility::hasSignal(__CLASS__, SignalSlotUtility::SIGNAL_QUEUE_ENTRY_FLUSH);

        // Listeners are told, per set, which entries are about to be removed.
        if ($hasObserver || $hasSignal) {
            $groups = $this->db->exec_SELECTgetRows('DISTINCT set_id', 'tx_crawler_queue', $realWhere);
            if (is_array($groups)) {
                foreach ($groups as $group) {
                    // Fetch the affected entries once and share them between both notifications.
                    // $realWhere is parenthesised so an OR inside it cannot bypass the set restriction.
                    // NOTE(review): other queries in this class address queue rows by "qid";
                    // confirm that a "uid" column exists on tx_crawler_queue.
                    $flushedEntries = $this->db->exec_SELECTgetRows(
                        'uid, set_id',
                        'tx_crawler_queue',
                        '(' . $realWhere . ') AND set_id=' . intval($group['set_id'])
                    );
                    // The query helper may return a non-array on failure; listeners expect an array.
                    if (!is_array($flushedEntries)) {
                        $flushedEntries = [];
                    }

                    if ($hasObserver) {
                        EventDispatcher::getInstance()->post(
                            'queueEntryFlush',
                            $group['set_id'],
                            $flushedEntries
                        );
                    }

                    if ($hasSignal) {
                        SignalSlotUtility::emitSignal(
                            __CLASS__,
                            SignalSlotUtility::SIGNAL_QUEUE_ENTRY_FLUSH,
                            $flushedEntries
                        );
                    }
                }
            }
        }

        $GLOBALS['TYPO3_DB']->exec_DELETEquery('tx_crawler_queue', $realWhere);
    }
1127
1128
    /**
1129
     * Adding call back entries to log (called from hooks typically, see indexed search class "class.crawler.php"
1130
     *
1131
     * @param integer $setId Set ID
1132
     * @param array $params Parameters to pass to call back function
1133
     * @param string $callBack Call back object reference, eg. 'EXT:indexed_search/class.crawler.php:&tx_indexedsearch_crawler'
1134
     * @param integer $page_id Page ID to attach it to
1135
     * @param integer $schedule Time at which to activate
1136
     * @return void
1137
     */
1138
    public function addQueueEntry_callBack($setId, $params, $callBack, $page_id = 0, $schedule = 0)
    {
        // Anything that is not an array is replaced by an empty parameter set.
        $callbackParams = is_array($params) ? $params : [];
        $callbackParams['_CALLBACKOBJ'] = $callBack;

        // A schedule of 0 (or a non-numeric value) means "now".
        $scheduledAt = intval($schedule);
        if (!$scheduledAt) {
            $scheduledAt = $this->getCurrentTime();
        }

        // Store the entry as pending (exec_time 0, no result yet).
        $this->db->exec_INSERTquery(
            'tx_crawler_queue',
            [
                'page_id' => intval($page_id),
                'parameters' => serialize($callbackParams),
                'scheduled' => $scheduledAt,
                'exec_time' => 0,
                'set_id' => intval($setId),
                'result_data' => '',
            ]
        );
    }
1157
1158
    /************************************
1159
     *
1160
     * URL setting
1161
     *
1162
     ************************************/
1163
1164
    /**
1165
     * Setting a URL for crawling:
1166
     *
1167
     * @param integer $id Page ID
1168
     * @param string $url Complete URL
1169
     * @param array $subCfg Sub configuration array (from TS config)
1170
     * @param integer $tstamp Scheduled-time
1171
     * @param string $configurationHash (optional) configuration hash
1172
     * @param bool $skipInnerDuplicationCheck (optional) skip inner duplication check
1173
     * @return bool
1174
     */
1175 8
    public function addUrl(
        $id,
        $url,
        array $subCfg,
        $tstamp,
        $configurationHash = '',
        $skipInnerDuplicationCheck = false
    ) {
        // Build the parameter set stored with the queue entry, starting with the URL.
        $parameters = ['url' => $url];

        // fe user group simulation: unique list of integer group IDs, only stored when non-empty.
        $feGroupList = implode(',', array_unique(GeneralUtility::intExplode(',', $subCfg['userGroups'], true)));
        if ($feGroupList) {
            $parameters['feUserGroupList'] = $feGroupList;
        }

        // Processing instructions and their optional parameters.
        $parameters['procInstructions'] = GeneralUtility::trimExplode(',', $subCfg['procInstrFilter']);
        if (is_array($subCfg['procInstrParams.'])) {
            $parameters['procInstrParams'] = $subCfg['procInstrParams.'];
        }

        // Possible TypoScript Template Parents
        $parameters['rootTemplatePid'] = $subCfg['rootTemplatePid'];

        // Compile the queue record; the hash is derived from the serialized parameters.
        $serializedParameters = serialize($parameters);
        $fieldArray = [
            'page_id' => intval($id),
            'parameters' => $serializedParameters,
            'parameters_hash' => GeneralUtility::shortMD5($serializedParameters),
            'configuration_hash' => $configurationHash,
            'scheduled' => $tstamp,
            'exec_time' => 0,
            'set_id' => intval($this->setID),
            'result_data' => '',
            'configuration' => $subCfg['key'],
        ];

        // Internal-only mode: remember the entry in memory, do not touch the database.
        if ($this->registerQueueEntriesInternallyOnly) {
            $this->queueEntries[] = $fieldArray;
            return false;
        }

        // Look for an equal entry unless the caller asked to skip that check.
        $duplicates = [];
        if (!$skipInnerDuplicationCheck) {
            $duplicates = $this->getDuplicateRowsIfExist($tstamp, $fieldArray);
        }

        if (count($duplicates) != 0) {
            // The event dispatcher is deprecated since crawler v6.4.0, will be removed in crawler v7.0.0.
            // Please use the Signal instead.
            EventDispatcher::getInstance()->post(
                'duplicateUrlInQueue',
                $this->setID,
                ['rows' => $duplicates, 'fieldArray' => $fieldArray]
            );

            $duplicatePayload = ['rows' => $duplicates, 'fieldArray' => $fieldArray];
            SignalSlotUtility::emitSignal(
                __CLASS__,
                SignalSlotUtility::SIGNAL_DUPLICATE_URL_IN_QUEUE,
                $duplicatePayload
            );

            return false;
        }

        // No duplicate: insert the entry and announce it.
        $this->db->exec_INSERTquery('tx_crawler_queue', $fieldArray);
        $uid = $this->db->sql_insert_id();

        // The event dispatcher is deprecated since crawler v6.4.0, will be removed in crawler v7.0.0.
        // Please use the Signal instead.
        EventDispatcher::getInstance()->post(
            'urlAddedToQueue',
            $this->setID,
            ['uid' => $uid, 'fieldArray' => $fieldArray]
        );

        $addedPayload = ['uid' => $uid, 'fieldArray' => $fieldArray];
        SignalSlotUtility::emitSignal(
            __CLASS__,
            SignalSlotUtility::SIGNAL_URL_ADDED_TO_QUEUE,
            $addedPayload
        );

        return true;
    }
1262
1263
    /**
1264
     * This method determines duplicates for a queue entry with the same parameters and this timestamp.
1265
     * If the timestamp is in the past, it will check if there is any unprocessed queue entry in the past.
1266
     * If the timestamp is in the future it will check, if the queued entry has exactly the same timestamp
1267
     *
1268
     * @param int $tstamp
1269
     * @param array $fieldArray
1270
     *
1271
     * @return array
1272
     */
1273 9
    protected function getDuplicateRowsIfExist($tstamp, $fieldArray)
    {
        $rows = [];

        // Both values are concatenated into SQL below, so force them to integers.
        $tstamp = intval($tstamp);
        $currentTime = intval($this->getCurrentTime());

        if ($tstamp <= $currentTime) {
            // Entry is scheduled with "now" (or in the past).
            if (!empty($this->extensionSettings['enableTimeslot'])) {
                $timeBegin = $currentTime - 100;
                $timeEnd = $currentTime + 100;
                $where = ' ((scheduled BETWEEN ' . $timeBegin . ' AND ' . $timeEnd . ' ) OR scheduled <= ' . $currentTime . ') ';
            } else {
                $where = 'scheduled <= ' . $currentTime;
            }
        } else {
            // Entries with a timestamp in the future need to have the same schedule time.
            $where = 'scheduled = ' . $tstamp;
        }

        // Only entries that are neither executed nor assigned to a process count as duplicates.
        $result = $this->db->exec_SELECTgetRows(
            'qid',
            'tx_crawler_queue',
            $where .
            ' AND NOT exec_time' .
            ' AND NOT process_id ' .
            ' AND page_id=' . intval($fieldArray['page_id']) .
            ' AND parameters_hash = ' . $this->db->fullQuoteStr($fieldArray['parameters_hash'], 'tx_crawler_queue')
        );

        if (is_array($result)) {
            foreach ($result as $value) {
                $rows[] = $value['qid'];
            }
        }

        return $rows;
    }
1313
1314
    /**
1315
     * Returns the current system time
1316
     *
1317
     * @return int
1318
     */
1319
    public function getCurrentTime()
    {
        // Single access point for "now", so callers do not call time() themselves.
        $now = time();

        return $now;
    }
1323
1324
    /************************************
1325
     *
1326
     * URL reading
1327
     *
1328
     ************************************/
1329
1330
    /**
1331
     * Read URL for single queue entry
1332
     *
1333
     * @param integer $queueId
1334
     * @param boolean $force If set, will process even if exec_time has been set!
1335
     * @return integer
1336
     */
1337
    public function readUrl($queueId, $force = false)
    {
        $ret = 0;
        if ($this->debugMode) {
            GeneralUtility::devlog('crawler-readurl start ' . microtime(true), __FUNCTION__);
        }

        // Get entry; unless forced, only entries that are scheduled for a process and not yet executed.
        $queueRows = $this->db->exec_SELECTgetRows(
            '*',
            'tx_crawler_queue',
            'qid=' . intval($queueId) . ($force ? '' : ' AND exec_time=0 AND process_scheduled > 0')
        );
        $queueRec = (is_array($queueRows) && isset($queueRows[0])) ? $queueRows[0] : null;

        // Nothing to process: return the (empty) status so callers always get an integer.
        if (!is_array($queueRec)) {
            return $ret;
        }

        $parameters = unserialize($queueRec['parameters']);
        if (is_array($parameters) && !empty($parameters['rootTemplatePid'])) {
            $this->initTSFE((int)$parameters['rootTemplatePid']);
        } else {
            GeneralUtility::sysLog(
                'Page with (' . $queueRec['page_id'] . ') could not be crawled, please check your crawler configuration. Perhaps no Root Template Pid is set',
                'crawler',
                GeneralUtility::SYSLOG_SEVERITY_WARNING
            );
        }

        $signalPayload = [$queueId, $queueRec];
        SignalSlotUtility::emitSignal(
            __CLASS__,
            SignalSlotUtility::SIGNAL_QUEUEITEM_PREPROCESS,
            $signalPayload
        );

        // Set exec_time to lock record:
        $field_array = ['exec_time' => $this->getCurrentTime()];

        if (isset($this->processID)) {
            // If multiprocessing is used we need to store the id of the process which has handled this entry
            $field_array['process_id_completed'] = $this->processID;
        }
        $this->db->exec_UPDATEquery('tx_crawler_queue', 'qid=' . intval($queueId), $field_array);

        $result = $this->readUrl_exec($queueRec);
        // NOTE(review): if $result['content'] can hold a response body fetched from a URL,
        // unserialize() on it processes external data - confirm the source and harden if so.
        $resultData = unserialize($result['content']);

        // atm there's no need to point to specific pollable extensions
        if (is_array($resultData)
            && isset($GLOBALS['TYPO3_CONF_VARS']['EXTCONF']['crawler']['pollSuccess'])
            && is_array($GLOBALS['TYPO3_CONF_VARS']['EXTCONF']['crawler']['pollSuccess'])
        ) {
            foreach ($GLOBALS['TYPO3_CONF_VARS']['EXTCONF']['crawler']['pollSuccess'] as $pollable) {
                // only check the success value if the instruction is running
                // it is important to name the pollSuccess key same as the procInstructions key
                if (isset($resultData['parameters']['procInstructions'])
                    && is_array($resultData['parameters']['procInstructions'])
                    && in_array($pollable, $resultData['parameters']['procInstructions'])
                ) {
                    if (!empty($resultData['success'][$pollable])) {
                        $ret |= self::CLI_STATUS_POLLABLE_PROCESSED;
                    }
                }
            }
        }

        // Set result in log which also denotes the end of the processing of this entry.
        $field_array = ['result_data' => serialize($result)];

        $signalPayload = [$queueId, $field_array];
        SignalSlotUtility::emitSignal(
            __CLASS__,
            SignalSlotUtility::SIGNAL_QUEUEITEM_POSTPROCESS,
            $signalPayload
        );

        $this->db->exec_UPDATEquery('tx_crawler_queue', 'qid=' . intval($queueId), $field_array);

        if ($this->debugMode) {
            GeneralUtility::devlog('crawler-readurl stop ' . microtime(true), __FUNCTION__);
        }

        return $ret;
    }
1419
1420
    /**
     * Creates a new queue entry from the given fields, executes it right away
     * and stores the outcome in the entry's "result_data" field.
     *
     * The record is inserted with "exec_time" already set, which marks it as
     * locked from the moment it exists.
     *
     * @param array $field_array Queue field array
     *
     * @return array|string Whatever readUrl_exec() returned for the new entry
     */
    public function readUrlFromArray($field_array)
    {
        // Insert the entry with exec_time set, so it is locked immediately:
        $field_array['exec_time'] = $this->getCurrentTime();
        $this->db->exec_INSERTquery('tx_crawler_queue', $field_array);

        // The generated qid is needed by the executor and for the update below:
        $insertedQueueId = $this->db->sql_insert_id();
        $field_array['qid'] = $insertedQueueId;

        $result = $this->readUrl_exec($field_array);

        // Writing the result marks the end of the processing of this entry.
        $field_array = ['result_data' => serialize($result)];

        $signalPayload = [$insertedQueueId, $field_array];
        SignalSlotUtility::emitSignal(
            __CLASS__,
            SignalSlotUtility::SIGNAL_QUEUEITEM_POSTPROCESS,
            $signalPayload
        );

        $this->db->exec_UPDATEquery(
            'tx_crawler_queue',
            'qid=' . intval($insertedQueueId),
            $field_array
        );

        return $result;
    }
1451
1452
    /**
     * Executes a single queue record.
     *
     * The record's serialized "parameters" decide what happens:
     * - with a "_CALLBACKOBJ" entry the referenced object is instantiated and its
     *   crawler_execute() method is called with the remaining parameters;
     * - otherwise the "url" parameter is requested as a regular frontend request
     *   and the "urlCrawled" event/signal is emitted afterwards.
     *
     * @param array $queueRec Queue record
     * @return array|string Result array (callback results carry a "content" key), or the
     *                      string "ERROR" if the parameters could not be decoded to an array
     */
    public function readUrl_exec($queueRec)
    {
        // Decode parameters:
        $parameters = unserialize($queueRec['parameters']);
        if (!is_array($parameters)) {
            return 'ERROR';
        }

        // !empty() instead of a plain read: regular FE requests carry no _CALLBACKOBJ key at all.
        if (!empty($parameters['_CALLBACKOBJ'])) { // Calling object:
            $objRef = $parameters['_CALLBACKOBJ'];
            // No reference assignment here: assigning a function result by reference raises a notice.
            $callBackObj = GeneralUtility::getUserObj($objRef);
            if (is_object($callBackObj)) {
                unset($parameters['_CALLBACKOBJ']);
                $result = ['content' => serialize($callBackObj->crawler_execute($parameters, $this))];
            } else {
                $result = ['content' => 'No object: ' . $objRef];
            }

            return $result;
        }

        // Regular FE request:

        // Prepare:
        $crawlerId = $queueRec['qid'] . ':' . md5($queueRec['qid'] . '|' . $queueRec['set_id'] . '|' . $GLOBALS['TYPO3_CONF_VARS']['SYS']['encryptionKey']);

        // Get result:
        $result = $this->requestUrl($parameters['url'], $crawlerId);

        // The event dispatcher is deprecated since crawler v6.4.0, will be removed in crawler v7.0.0.
        // Please use the Signal instead.
        EventDispatcher::getInstance()->post('urlCrawled', $queueRec['set_id'], ['url' => $parameters['url'], 'result' => $result]);

        $signalPayload = ['url' => $parameters['url'], 'result' => $result];
        SignalSlotUtility::emitSignal(
            __CLASS__,
            SignalSlotUtility::SIGNAL_URL_CRAWLED,
            $signalPayload
        );

        return $result;
    }
1496
1497
    /**
     * Gets the content of a URL.
     *
     * Depending on the extension settings the URL is either handed to
     * sendDirectRequest() or fetched with a raw HTTP/1.0 request over a socket
     * (optionally through the configured proxy server). With "follow30x" enabled,
     * a redirect reported by getRequestUrlFrom302Header() is followed recursively
     * until $recursion is used up.
     *
     * @param string $originalUrl URL to read
     * @param string $crawlerId Crawler ID string (qid + hash to verify)
     * @param integer $timeout Connection timeout handed to fsockopen()
     * @param integer $recursion Recursion limiter for 302 redirects
     * @return array|bool Array with the keys "request", "headers" and "content" (plus
     *                    "parentRequest" after a followed redirect), the result of
     *                    sendDirectRequest() for direct requests, or FALSE on failure
     */
    public function requestUrl($originalUrl, $crawlerId, $timeout = 2, $recursion = 10)
    {
        if (!$recursion) {
            return false;
        }

        // Parse URL, checking for scheme:
        $url = parse_url($originalUrl);

        if ($url === false) {
            if (TYPO3_DLOG) {
                // Log the input string; $url is FALSE at this point.
                GeneralUtility::devLog(sprintf('Could not parse_url() for string "%s"', $originalUrl), 'crawler', 4, ['crawlerId' => $crawlerId]);
            }
            return false;
        }

        if (!in_array($url['scheme'], ['','http','https'])) {
            if (TYPO3_DLOG) {
                // Log the input string; $url is an array and would only print "Array".
                GeneralUtility::devLog(sprintf('Scheme does not match for url "%s"', $originalUrl), 'crawler', 4, ['crawlerId' => $crawlerId]);
            }
            return false;
        }

        // direct request
        if ($this->extensionSettings['makeDirectRequests']) {
            $result = $this->sendDirectRequest($originalUrl, $crawlerId);
            return $result;
        }

        $reqHeaders = $this->buildRequestHeaderArray($url, $crawlerId);

        // thanks to Pierrick Caillon for adding proxy support
        $rurl = $url;

        if ($this->extensionSettings['curlUse'] && $this->extensionSettings['curlProxyServer']) {
            // Connect to the proxy and put the absolute target URL into the request line instead.
            $rurl = parse_url($this->extensionSettings['curlProxyServer']);
            $url['path'] = $url['scheme'] . '://' . $url['host'] . ($url['port'] > 0 ? ':' . $url['port'] : '') . $url['path'];
            $reqHeaders = $this->buildRequestHeaderArray($url, $crawlerId);
        }

        $host = $rurl['host'];

        if ($url['scheme'] == 'https') {
            $host = 'ssl://' . $host;
            $port = ($rurl['port'] > 0) ? $rurl['port'] : 443;
        } else {
            $port = ($rurl['port'] > 0) ? $rurl['port'] : 80;
        }

        $startTime = microtime(true);
        $fp = fsockopen($host, $port, $errno, $errstr, $timeout);

        if (!$fp) {
            if (TYPO3_DLOG) {
                GeneralUtility::devLog(sprintf('Error while opening "%s"', $originalUrl), 'crawler', 4, ['crawlerId' => $crawlerId]);
            }
            return false;
        }

        // Request message:
        $msg = implode("\r\n", $reqHeaders) . "\r\n\r\n";
        fputs($fp, $msg);

        // Read response:
        $d = $this->getHttpResponseFromStream($fp);
        fclose($fp);

        $time = microtime(true) - $startTime;
        $this->log($originalUrl . ' ' . $time);

        // Implode content and headers:
        $result = [
            'request' => $msg,
            'headers' => implode('', $d['headers']),
            'content' => implode('', (array)$d['content']),
        ];

        if (($this->extensionSettings['follow30x']) && ($newUrl = $this->getRequestUrlFrom302Header($d['headers'], $url['user'], $url['pass']))) {
            // Follow the redirect exactly once per level. The redirect target used to be
            // requested twice; the first call also passed the recursion counter as timeout
            // and merged its result without checking for FALSE.
            $newRequestUrl = $this->requestUrl($newUrl, $crawlerId, $timeout, --$recursion);

            if (is_array($newRequestUrl)) {
                $result = array_merge(['parentRequest' => $result], $newRequestUrl);
            } else {
                if (TYPO3_DLOG) {
                    GeneralUtility::devLog(sprintf('Error while opening "%s"', $newUrl), 'crawler', 4, ['crawlerId' => $crawlerId]);
                }
                return false;
            }
        }

        return $result;
    }
1599
1600
    /**
     * Determines the base path of the website frontend.
     * (e.g. for http://mydomain.com/cms/index.php the base path is "/cms/")
     *
     * Sources, in order of precedence: the "frontendBasePath" extension setting,
     * the absRefPrefix of the current TSFE, and (outside CLI mode only) the
     * TYPO3_SITE_PATH environment value. Without any of them "/" is used.
     *
     * @return string Base path of the website frontend
     */
    protected function getFrontendBasePath()
    {
        if (!empty($this->extensionSettings['frontendBasePath'])) {
            // Explicitly configured in the extension settings:
            $basePath = $this->extensionSettings['frontendBasePath'];
        } elseif (!empty($GLOBALS['TSFE']->absRefPrefix)) {
            // Fall back to config.absRefPrefix:
            $basePath = $GLOBALS['TSFE']->absRefPrefix;
        } elseif (defined('TYPO3_REQUESTTYPE_CLI') && TYPO3_REQUESTTYPE_CLI) {
            // In CLI mode there is no $_SERVER environment to derive the path from:
            $basePath = '/';
        } else {
            $basePath = GeneralUtility::getIndpEnv('TYPO3_SITE_PATH');
        }

        // Base path must be '/<pathSegements>/':
        if ($basePath != '/') {
            $basePath = rtrim('/' . ltrim($basePath, '/'), '/') . '/';
        }

        return $basePath;
    }
1630
1631
    /**
     * Runs the given command through the shell and hands back its output.
     *
     * @param string $command Shell command to be executed
     * @return string Outputted result of the command execution
     */
    protected function executeShellCommand($command)
    {
        return shell_exec($command);
    }
1642
1643
    /**
     * Reads HTTP response from the given stream.
     *
     * Lines up to the first blank line are collected (trimmed) as headers, every
     * following line is collected unchanged as content. If $streamPointer is not
     * a resource, both lists stay empty.
     *
     * @param  resource $streamPointer  Pointer to connection stream.
     * @return array                    Associative array with the following items:
     *                                  headers <array> Response headers sent by server.
     *                                  content <array> Content, with each line as an array item.
     */
    protected function getHttpResponseFromStream($streamPointer)
    {
        $response = ['headers' => [], 'content' => []];

        if (!is_resource($streamPointer)) {
            return $response;
        }

        // read headers
        // Compare against FALSE explicitly: a line consisting only of "0" is falsy
        // and would otherwise end the loop as if the stream were exhausted.
        while (($line = fgets($streamPointer, 2048)) !== false) {
            $line = trim($line);
            if ($line === '') {
                // The blank line separates headers from content.
                break;
            }
            $response['headers'][] = $line;
        }

        // read content
        while (($line = fgets($streamPointer, 2048)) !== false) {
            $response['content'][] = $line;
        }

        return $response;
    }
1674
1675
    /**
     * Appends a timestamped line to the log file named in the "logFileName"
     * extension setting. Does nothing when no log file is configured; a failed
     * write is reported through devLog.
     *
     * @param string $message Text to append to the log file
     */
    protected function log($message)
    {
        if (empty($this->extensionSettings['logFileName'])) {
            return;
        }

        $logLine = date('Ymd His') . ' ' . $message . PHP_EOL;
        $fileResult = @file_put_contents($this->extensionSettings['logFileName'], $logLine, FILE_APPEND);

        if (!$fileResult) {
            GeneralUtility::devLog(
                'File "' . $this->extensionSettings['logFileName'] . '" could not be written, please check file permissions.',
                'crawler',
                LogLevel::INFO
            );
        }
    }
1687
1688
    /**
     * Builds HTTP request headers.
     *
     * Produces the request line and header lines of an HTTP/1.0 GET request for
     * the given URL parts. A backend user cookie is added when the query string
     * contains "ADMCMD_previewWS", and a Basic Authorization header when the URL
     * carries a user name.
     *
     * @param array $url URL parts as returned by parse_url(); "path", "query", "user" and "pass" are optional
     * @param string $crawlerId Value sent in the X-T3crawler header
     *
     * @return array List of header lines, starting with the request line
     */
    protected function buildRequestHeaderArray(array $url, $crawlerId)
    {
        // parse_url() omits components that are absent from the URL, so read them defensively.
        // An empty path must be sent as "/" to form a valid request line.
        $path = (isset($url['path']) && $url['path'] !== '') ? $url['path'] : '/';
        $query = isset($url['query']) ? (string)$url['query'] : '';
        $host = isset($url['host']) ? $url['host'] : '';
        $user = isset($url['user']) ? (string)$url['user'] : '';
        $pass = isset($url['pass']) ? (string)$url['pass'] : '';

        $reqHeaders = [];
        // Strict comparison keeps a query string of "0" instead of dropping it as falsy.
        $reqHeaders[] = 'GET ' . $path . ($query !== '' ? '?' . $query : '') . ' HTTP/1.0';
        $reqHeaders[] = 'Host: ' . $host;
        if (stristr($query, 'ADMCMD_previewWS')) {
            $reqHeaders[] = 'Cookie: $Version="1"; be_typo_user="1"; $Path=/';
        }
        $reqHeaders[] = 'Connection: close';
        if ($user !== '') {
            $reqHeaders[] = 'Authorization: Basic ' . base64_encode($user . ':' . $pass);
        }
        $reqHeaders[] = 'X-T3crawler: ' . $crawlerId;
        $reqHeaders[] = 'User-Agent: TYPO3 crawler';

        return $reqHeaders;
    }
1712
1713
    /**
     * Checks whether the given HTTP response headers announce a redirect and, if so,
     * builds the URL that should be crawled next.
     *
     * Only responses whose status line contains "301 Moved", "302 Found" or
     * "302 Moved" are treated as redirects. When a user name is given, the
     * credentials are injected into the redirect target.
     *
     * @param array $headers HTTP Header lines, status line first
     * @param string $user HTTP Auth. User
     * @param string $pass HTTP Auth. Password
     * @return bool|string New URL, or FALSE if there is no usable redirect
     */
    protected function getRequestUrlFrom302Header($headers, $user = '', $pass = '')
    {
        if (!is_array($headers) || !isset($headers[0])) {
            return false;
        }
        if (!(stristr($headers[0], '301 Moved') || stristr($headers[0], '302 Found') || stristr($headers[0], '302 Moved'))) {
            return false;
        }

        // Find the Location header. Splitting on the first colon only keeps values that
        // contain a colon themselves intact; header names are matched case-insensitively.
        $location = null;
        foreach ($headers as $hl) {
            $tmp = explode(':', $hl, 2);
            if (!isset($tmp[1])) {
                // Not a "name: value" line (e.g. the status line).
                continue;
            }
            if (strcasecmp(trim($tmp[0]), 'Location') === 0) {
                $location = trim($tmp[1]);
                break;
            }
        }
        if ($location === null) {
            return false;
        }

        if ($user == '') {
            return $location;
        }

        // Re-inject the credentials into the redirect target.
        $tmp = parse_url($location);
        if (!$tmp || !isset($tmp['scheme'], $tmp['host'])) {
            // Unparsable or relative target: no absolute URL with credentials can be built.
            return false;
        }

        $newUrl = $tmp['scheme'] . '://' . $user . ':' . $pass . '@' . $tmp['host'];
        if (isset($tmp['port'])) {
            // Keep a non-default port of the redirect target.
            $newUrl .= ':' . $tmp['port'];
        }
        if (isset($tmp['path'])) {
            $newUrl .= $tmp['path'];
        }
        if (isset($tmp['query']) && $tmp['query'] != '') {
            $newUrl .= '?' . $tmp['query'];
        }

        return $newUrl;
    }
1755
1756
    /**************************
1757
     *
1758
     * tslib_fe hooks:
1759
     *
1760
     **************************/
1761
1762
    /**
     * Initialization hook (called after database connection)
     * Takes the "HTTP_X_T3CRAWLER" header ("<qid>:<hash>"), looks up the queue record and
     * verifies that the request comes from the system by comparing hashes. On success the
     * crawler state is registered in the applicationData of $params['pObj']; otherwise the
     * request is terminated.
     *
     * @param array $params Parameters from frontend
     * @param object $ref TSFE object (reference under PHP5), not used
     * @return void
     *
     * FIXME: Look like this is not used, in commit 9910d3f40cce15f4e9b7bcd0488bf21f31d53ebc it's added as public,
     * FIXME: I think this can be removed. (TNM)
     */
    public function fe_init(&$params, $ref)
    {
        // Authenticate crawler request:
        if (!isset($_SERVER['HTTP_X_T3CRAWLER'])) {
            return;
        }

        // Pad the parts so a header without ":" yields an empty hash instead of an undefined offset.
        list($queueId, $hash) = array_pad(explode(':', $_SERVER['HTTP_X_T3CRAWLER']), 2, '');

        // exec_SELECTgetSingleRow() already returns the row itself; unpacking it with list()
        // always produced NULL and made every crawler request fail.
        $queueRec = $this->db->exec_SELECTgetSingleRow('*', 'tx_crawler_queue', 'qid=' . intval($queueId));

        // If a crawler record was found and hash was matching, set it up:
        if (is_array($queueRec) && $hash === md5($queueRec['qid'] . '|' . $queueRec['set_id'] . '|' . $GLOBALS['TYPO3_CONF_VARS']['SYS']['encryptionKey'])) {
            $params['pObj']->applicationData['tx_crawler']['running'] = true;
            $params['pObj']->applicationData['tx_crawler']['parameters'] = unserialize($queueRec['parameters']);
            $params['pObj']->applicationData['tx_crawler']['log'] = [];
        } else {
            die('No crawler entry found!');
        }
    }
1790
1791
    /*****************************
1792
     *
1793
     * Compiling URLs to crawl - tools
1794
     *
1795
     *****************************/
1796
1797
    /**
     * Walks the page tree below a root page and renders the crawler URL rows for
     * every page found, including pages reached through mount points.
     *
     * The given options are stored on the controller first, because
     * drawURLs_addRowsForPage() reads them from there.
     *
     * @param integer $id Root page id to start from.
     * @param integer $depth Depth of tree, 0=only id-page, 1= on sublevel, 99 = infinite
     * @param integer $scheduledTime Unix Time when the URL is timed to be visited when put in queue
     * @param integer $reqMinute Number of requests per minute (creates the interleave between requests)
     * @param boolean $submitCrawlUrls If set, submits the URLs to queue in database (real crawling)
     * @param boolean $downloadCrawlUrls If set (and submitcrawlUrls is false) will fill $downloadUrls with entries)
     * @param array $incomingProcInstructions Array of processing instructions
     * @param array $configurationSelection Array of configuration keys
     * @return string HTML rows as produced by drawURLs_addRowsForPage()
     */
    public function getPageTreeAndUrls(
        $id,
        $depth,
        $scheduledTime,
        $reqMinute,
        $submitCrawlUrls,
        $downloadCrawlUrls,
        array $incomingProcInstructions,
        array $configurationSelection
    ) {
        global $BACK_PATH, $LANG;

        // Make sure a language service is available:
        if (!is_object($LANG)) {
            $LANG = GeneralUtility::makeInstance(LanguageService::class);
            $LANG->init(0);
        }

        // Store the options for the row rendering:
        $this->scheduledTime = $scheduledTime;
        $this->reqMinute = $reqMinute;
        $this->submitCrawlUrls = $submitCrawlUrls;
        $this->downloadCrawlUrls = $downloadCrawlUrls;
        $this->incomingProcInstructions = $incomingProcInstructions;
        $this->incomingConfigurationSelection = $configurationSelection;

        // Start with empty tracking lists:
        $this->duplicateTrack = [];
        $this->downloadUrls = [];

        // Set up the page tree, restricted to pages the backend user may see:
        /* @var PageTreeView $pageTree */
        $pageTree = GeneralUtility::makeInstance(PageTreeView::class);
        $perms_clause = $GLOBALS['BE_USER']->getPagePermsClause(1);
        $pageTree->init('AND ' . $perms_clause);

        // The root page itself becomes the first tree entry, if accessible:
        $rootPageInfo = BackendUtility::readPageAccess($id, $perms_clause);
        if (is_array($rootPageInfo)) {
            $pageTree->tree[] = [
                'row' => $rootPageInfo,
                'HTML' => IconUtility::getIconForRecord('pages', $rootPageInfo),
            ];
        }

        // Add the branch beneath the root page:
        if ($depth) {
            $pageTree->getTree($id, $depth, '');
        }

        // Render the rows page by page:
        $html = '';

        foreach ($pageTree->tree as $treeItem) {
            $this->MP = false;

            // doktype 7 = mount point
            if ($treeItem['row']['doktype'] == 7) {
                $mountpage = $this->db->exec_SELECTgetRows('*', 'pages', 'uid = ' . $treeItem['row']['uid']);
                $mountPid = $mountpage[0]['mount_pid'];

                // MP parameter for the mounted pages
                $this->MP = $mountPid . '-' . $treeItem['row']['uid'];

                $mountTree = GeneralUtility::makeInstance(PageTreeView::class);
                $mountTree->init('AND ' . $perms_clause);
                $mountTree->getTree($mountPid, $depth, '');

                foreach ($mountTree->tree as $mountItem) {
                    $mountTitle = $mountItem['HTML'] . BackendUtility::getRecordTitle('pages', $mountItem['row'], true);
                    $html .= $this->drawURLs_addRowsForPage($mountItem['row'], $mountTitle);
                }

                if ($mountpage[0]['mount_pid_ol']) {
                    // mount_pid_ol enabled: the mounted page replaces the mount point page
                    $treeItem['row']['uid'] = $mountPid;
                } else {
                    // if the mount_pid_ol is not set the MP must not be used for the mountpoint page
                    $this->MP = false;
                }
            }

            $pageTitle = $treeItem['HTML'] . BackendUtility::getRecordTitle('pages', $treeItem['row'], true);
            $html .= $this->drawURLs_addRowsForPage($treeItem['row'], $pageTitle);
        }

        return $html;
    }
1895
1896
    /**
     * Expands exclude string
     *
     * The exclude string is a comma separated list of "<pid>" or "<pid>+<depth>"
     * entries. A plain pid excludes only that page, "<pid>+<depth>" additionally
     * excludes the subpages down to the given depth, and "<pid>+" without a depth
     * uses a depth of 99. Results are cached per exclude string for the lifetime
     * of the request.
     *
     * @param string $excludeString Exclude string
     * @return array List of unique page ids
     */
    public function expandExcludeString($excludeString)
    {
        // internal static caches;
        static $expandedExcludeStringCache = [];
        static $treeCache = [];

        // isset() rather than empty(): an empty result is a valid cache entry as well.
        if (isset($expandedExcludeStringCache[$excludeString])) {
            return $expandedExcludeStringCache[$excludeString];
        }

        $pidList = [];

        if (!empty($excludeString)) {
            /** @var PageTreeView $tree */
            $tree = GeneralUtility::makeInstance(PageTreeView::class);
            $tree->init('AND ' . $this->backendUser->getPagePermsClause(1));

            $excludeParts = GeneralUtility::trimExplode(',', $excludeString);

            foreach ($excludeParts as $excludePart) {
                // The "+<depth>" suffix is optional, so the second part may be missing.
                $pidAndDepth = GeneralUtility::trimExplode('+', $excludePart);
                $pid = $pidAndDepth[0];
                $depth = isset($pidAndDepth[1]) ? $pidAndDepth[1] : '';

                // default is "page only" = "depth=0"
                if (empty($depth)) {
                    $depth = (stristr($excludePart, '+')) ? 99 : 0;
                }

                $pidList[] = $pid;

                if ($depth > 0) {
                    // A page without subpages yields an empty tree, which is cached as well.
                    if (!isset($treeCache[$pid][$depth])) {
                        $tree->reset();
                        $tree->getTree($pid, $depth);
                        $treeCache[$pid][$depth] = $tree->tree;
                    }

                    foreach ($treeCache[$pid][$depth] as $data) {
                        $pidList[] = $data['row']['uid'];
                    }
                }
            }
        }

        $expandedExcludeStringCache[$excludeString] = array_unique($pidList);

        return $expandedExcludeStringCache[$excludeString];
    }
1947
1948
    /**
     * Renders the table rows for one page of the page tree.
     *
     * Every crawler configuration found for the page yields one row showing the
     * configuration key, the parsed and expanded GET parameters, the URL list and
     * further options. Configurations that exclude the page get a short notice
     * instead, and a page without any configuration gets a single "No entries" row.
     *
     * @param array $pageRow Page row
     * @param string $pageTitleAndIcon Page icon and title for row
     * @return string HTML <tr> content (one or more)
     */
    public function drawURLs_addRowsForPage(array $pageRow, $pageTitleAndIcon)
    {
        $skipMessage = '';

        // Get list of configurations
        $configurations = $this->getUrlsForPageRow($pageRow, $skipMessage);

        // Drop every configuration that is not part of the current selection:
        if (count($this->incomingConfigurationSelection) > 0) {
            foreach ($configurations as $candidateKey => $unusedConfiguration) {
                if (!in_array($candidateKey, $this->incomingConfigurationSelection)) {
                    unset($configurations[$candidateKey]);
                }
            }
        }

        $content = '';

        if (count($configurations) === 0) {
            // Nothing to show for this page, optionally with the reason:
            $message = empty($skipMessage) ? '' : ' (' . $skipMessage . ')';

            $content .= '
                <tr class="bgColor-20" style="border-bottom: 1px solid black;">
                    <td>' . $pageTitleAndIcon . '</td>
                    <td colspan="6"><em>No entries</em>' . $message . '</td>
                </tr>';
        } else {
            // Labels of the sub configuration options listed in the options column:
            $optionLabels = [
                'userGroups' => 'User Groups: ',
                'baseUrl' => 'Base Url: ',
                'procInstrFilter' => 'ProcInstr: ',
            ];

            $rowIndex = 0;
            foreach ($configurations as $confKey => $confArray) {
                // The title column is only rendered once and spans all rows of the page:
                $titleClm = $rowIndex
                    ? ''
                    : '<td rowspan="' . count($configurations) . '">' . $pageTitleAndIcon . '</td>';
                $rowClass = 'bgColor' . ($rowIndex % 2 ? '-20' : '-10');

                $isExcluded = in_array($pageRow['uid'], $this->expandExcludeString($confArray['subCfg']['exclude']));

                if ($isExcluded) {
                    $content .= '<tr class="' . $rowClass . '">
                            ' . $titleClm . '
                            <td>' . htmlspecialchars($confKey) . '</td>
                            <td colspan="5"><em>No entries</em> (Page is excluded in this configuration)</td>
                        </tr>';
                } else {
                    // URL list:
                    $urlList = $this->urlListFromUrlArray(
                        $confArray,
                        $pageRow,
                        $this->scheduledTime,
                        $this->reqMinute,
                        $this->submitCrawlUrls,
                        $this->downloadCrawlUrls,
                        $this->duplicateTrack,
                        $this->downloadUrls,
                        $this->incomingProcInstructions // if empty the urls won't be filtered by processing instructions
                    );

                    // Expanded parameters, together with the number of resulting combinations:
                    $paramExpanded = '';
                    $calcAccu = [];
                    $calcRes = 1;
                    foreach ($confArray['paramExpanded'] as $gVar => $gVal) {
                        $valueCount = count($gVal);
                        $paramExpanded .= '
                            <tr>
                                <td class="bgColor4-20">' . htmlspecialchars('&' . $gVar . '=') . '<br/>(' . $valueCount . ')</td>
                                <td class="bgColor4" nowrap="nowrap">' . nl2br(htmlspecialchars(implode(chr(10), $gVal))) . '</td>
                            </tr>
                        ';
                        $calcRes *= $valueCount;
                        $calcAccu[] = $valueCount;
                    }
                    $paramExpanded = '<table class="lrPadding c-list param-expanded">' . $paramExpanded . '</table>';
                    $paramExpanded .= 'Comb: ' . implode('*', $calcAccu) . '=' . $calcRes;

                    // Options
                    $optionValues = '';
                    foreach ($optionLabels as $optionKey => $optionLabel) {
                        if ($confArray['subCfg'][$optionKey]) {
                            $optionValues .= $optionLabel . $confArray['subCfg'][$optionKey] . '<br/>';
                        }
                    }

                    // Parsed parameters, one per line:
                    $parsedParameters = GeneralUtility::implodeArrayForUrl('', $confArray['paramParsed']);
                    $parsedParameters = rawurldecode(trim(str_replace('&', chr(10) . '&', $parsedParameters)));

                    // Compile row:
                    $content .= '
                        <tr class="' . $rowClass . '">
                            ' . $titleClm . '
                            <td>' . htmlspecialchars($confKey) . '</td>
                            <td>' . nl2br(htmlspecialchars($parsedParameters)) . '</td>
                            <td>' . $paramExpanded . '</td>
                            <td nowrap="nowrap">' . $urlList . '</td>
                            <td nowrap="nowrap">' . $optionValues . '</td>
                            <td nowrap="nowrap">' . DebugUtility::viewArray($confArray['subCfg']['procInstrParams.']) . '</td>
                        </tr>';
                }

                $rowIndex++;
            }
        }

        return $content;
    }
2065
2066
    /*****************************
2067
     *
2068
     * CLI functions
2069
     *
2070
     *****************************/
2071
2072
    /**
     * Main function for running from Command Line PHP script (cron job)
     * See ext/crawler/cli/crawler_cli.phpsh for details
     *
     * Acquires a process slot, lets CLI_run() crawl one batch of queue entries,
     * removes process records without assigned items and finally releases the
     * own process again.
     *
     * @return int bitmask combined from the self::CLI_STATUS_* constants
     */
    public function CLI_main()
    {
        $this->setAccessMode('cli');
        $result = self::CLI_STATUS_NOTHING_PROCCESSED;
        $cliObj = GeneralUtility::makeInstance(CrawlerCommandLineController::class);

        if (isset($cliObj->cli_args['-h']) || isset($cliObj->cli_args['--help'])) {
            $cliObj->cli_validateArgs();
            $cliObj->cli_help();
            exit;
        }

        if (!$this->getDisabled() && $this->CLI_checkAndAcquireNewProcess($this->CLI_buildProcessId())) {
            // Command line arguments take precedence over the extension settings.
            $countInARun = $cliObj->cli_argValue('--countInARun') ? intval($cliObj->cli_argValue('--countInARun')) : $this->extensionSettings['countInARun'];
            // Seconds
            $sleepAfterFinish = $cliObj->cli_argValue('--sleepAfterFinish') ? intval($cliObj->cli_argValue('--sleepAfterFinish')) : $this->extensionSettings['sleepAfterFinish'];
            // Milliseconds
            $sleepTime = $cliObj->cli_argValue('--sleepTime') ? intval($cliObj->cli_argValue('--sleepTime')) : $this->extensionSettings['sleepTime'];

            try {
                // Run process:
                $result = $this->CLI_run($countInARun, $sleepTime, $sleepAfterFinish);
            } catch (\Exception $e) {
                $this->CLI_debug(get_class($e) . ': ' . $e->getMessage());
                $result = self::CLI_STATUS_ABORTED;
            }

            // Cleanup
            $this->db->exec_DELETEquery('tx_crawler_process', 'assigned_items_count = 0');

            //TODO can't we do that in a clean way?
            $this->CLI_releaseProcesses($this->CLI_buildProcessId());

            // Query the remaining items once so that the debug output and the
            // returned status are derived from the same number.
            $unprocessedItems = $this->queueRepository->countUnprocessedItems();
            $this->CLI_debug("Unprocessed Items remaining:" . $unprocessedItems . " (" . $this->CLI_buildProcessId() . ")");
            $result |= ($unprocessedItems > 0 ? self::CLI_STATUS_REMAIN : self::CLI_STATUS_NOTHING_PROCCESSED);
        } else {
            // Crawler disabled or no free process slot available.
            $result |= self::CLI_STATUS_ABORTED;
        }

        return $result;
    }
2119
2120
    /**
     * Function executed by crawler_im.php cli script.
     *
     * Walks the page tree below the given page id and, depending on the "-o"
     * option, prints the download URLs ("url"), puts the URLs into the queue
     * ("queue"), executes them right away ("exec") or only lists what was found
     * (any other value).
     *
     * @return void
     */
    public function CLI_main_im()
    {
        $this->setAccessMode('cli_im');

        $cliObj = GeneralUtility::makeInstance(QueueCommandLineController::class);

        // Force user to admin state and set workspace to "Live":
        $this->backendUser->user['admin'] = 1;
        $this->backendUser->setWorkspace(0);

        // Print help
        if (!isset($cliObj->cli_args['_DEFAULT'][1])) {
            $cliObj->cli_validateArgs();
            $cliObj->cli_help();
            exit;
        }

        $cliObj->cli_validateArgs();

        // The output mode drives several decisions below, so read it only once.
        $outputMode = $cliObj->cli_argValue('-o');
        $submitToQueue = ($outputMode === 'queue' || $outputMode === 'exec');

        if ($outputMode === 'exec') {
            $this->registerQueueEntriesInternallyOnly = true;
        }

        if (isset($cliObj->cli_args['_DEFAULT'][2])) {
            // Crawler is called over TYPO3 BE
            $pageId = MathUtility::forceIntegerInRange($cliObj->cli_args['_DEFAULT'][2], 0);
        } else {
            // Crawler is called over cli
            $pageId = MathUtility::forceIntegerInRange($cliObj->cli_args['_DEFAULT'][1], 0);
        }

        // getConfigurationKeys() is deprecated since crawler v6.3.0 and will be removed in crawler v7.0.0.
        $configurationKeys = $this->getConfigurationKeys($cliObj);

        if (!is_array($configurationKeys)) {
            $configurations = $this->getUrlsForPageId($pageId);
            if (is_array($configurations)) {
                $configurationKeys = array_keys($configurations);
            } else {
                $configurationKeys = [];
            }
        }

        if ($submitToQueue) {
            $reason = new Reason();
            $reason->setReason(Reason::REASON_GUI_SUBMIT);
            $reason->setDetailText('The cli script of the crawler added to the queue');

            // The event dispatcher is deprecated since crawler v6.4.0, will be removed in crawler v7.0.0.
            // Please use the Signal instead.
            // NOTE(review): the event carries the set id that was current before the
            // new one is generated further down - confirm that this is intended.
            EventDispatcher::getInstance()->post(
                'invokeQueueChange',
                $this->setID,
                ['reason' => $reason]
            );

            $signalPayload = ['reason' => $reason];
            SignalSlotUtility::emitSignal(
                __CLASS__,
                SignalSlotUtility::SIGNAL_INVOKE_QUEUE_CHANGE,
                $signalPayload
            );
        }

        if ($this->extensionSettings['cleanUpOldQueueEntries']) {
            $this->cleanUpOldQueueEntries();
        }

        $this->setID = (int) GeneralUtility::md5int(microtime());
        $this->getPageTreeAndUrls(
            $pageId,
            MathUtility::forceIntegerInRange($cliObj->cli_argValue('-d'), 0, 99),
            $this->getCurrentTime(),
            MathUtility::forceIntegerInRange($cliObj->cli_isArg('-n') ? $cliObj->cli_argValue('-n') : 30, 1, 1000),
            $submitToQueue,
            $outputMode === 'url',
            GeneralUtility::trimExplode(',', $cliObj->cli_argValue('-proc'), true),
            $configurationKeys
        );

        if ($outputMode === 'url') {
            $cliObj->cli_echo(implode(chr(10), $this->downloadUrls) . chr(10), true);
        } elseif ($outputMode === 'exec') {
            $cliObj->cli_echo("Executing " . count($this->urlList) . " requests right away:\n\n");
            $cliObj->cli_echo(implode(chr(10), $this->urlList) . chr(10));
            $cliObj->cli_echo("\nProcessing:\n");

            foreach ($this->queueEntries as $queueRec) {
                // The parameters were serialized by the crawler itself when the entry was registered.
                $parameters = unserialize($queueRec['parameters']);

                // Guard against broken records: implode() on a non-array would abort the whole run.
                $url = (is_array($parameters) && isset($parameters['url'])) ? $parameters['url'] : '';
                $procInstructions = (is_array($parameters) && isset($parameters['procInstructions']) && is_array($parameters['procInstructions']))
                    ? $parameters['procInstructions']
                    : [];
                $cliObj->cli_echo($url . ' (' . implode(',', $procInstructions) . ') => ');

                $result = $this->readUrlFromArray($queueRec);

                $requestResult = unserialize($result['content']);
                if (is_array($requestResult)) {
                    // Print every log line on its own line, indented by two tabs.
                    $resLog = is_array($requestResult['log']) ? chr(10) . chr(9) . chr(9) . implode(chr(10) . chr(9) . chr(9), $requestResult['log']) : '';
                    $cliObj->cli_echo('OK: ' . $resLog . chr(10));
                } else {
                    $cliObj->cli_echo('Error checking Crawler Result: ' . substr(preg_replace('/\s+/', ' ', strip_tags($result['content'])), 0, 30000) . '...' . chr(10));
                }
            }
        } elseif ($outputMode === 'queue') {
            $cliObj->cli_echo("Putting " . count($this->urlList) . " entries in queue:\n\n");
            $cliObj->cli_echo(implode(chr(10), $this->urlList) . chr(10));
        } else {
            $cliObj->cli_echo(count($this->urlList) . " entries found for processing. (Use -o to decide action):\n\n", true);
            $cliObj->cli_echo(implode(chr(10), $this->urlList) . chr(10), true);
        }
    }
2234
2235
    /**
     * Flushes queue/log entries from the command line.
     *
     * The first default argument is the page id; page id 0 requests a full
     * flush. The "-o" option selects which entries are passed on to
     * getLogEntriesForPageId(): "all", "finished" or "pending". Any other
     * value prints the help text.
     *
     * @return bool false if the mode is unknown or getLogEntriesForPageId() returned false
     */
    public function CLI_main_flush()
    {
        $this->setAccessMode('cli_flush');
        $cliObj = GeneralUtility::makeInstance(FlushCommandLineController::class);

        // Force user to admin state and set workspace to "Live":
        $this->backendUser->user['admin'] = 1;
        $this->backendUser->setWorkspace(0);

        // Without a page id argument there is nothing to do but to print the help
        if (!isset($cliObj->cli_args['_DEFAULT'][1])) {
            $cliObj->cli_validateArgs();
            $cliObj->cli_help();
            exit;
        }

        $cliObj->cli_validateArgs();
        $pageId = MathUtility::forceIntegerInRange($cliObj->cli_args['_DEFAULT'][1], 0);
        $flushEverything = ($pageId == 0);

        $flushMode = $cliObj->cli_argValue('-o');

        switch ($flushMode) {
            case 'all':
                // "all" is expressed by an empty filter
                return $this->getLogEntriesForPageId($pageId, '', true, $flushEverything) !== false;
            case 'finished':
            case 'pending':
                return $this->getLogEntriesForPageId($pageId, $flushMode, true, $flushEverything) !== false;
            default:
                $cliObj->cli_validateArgs();
                $cliObj->cli_help();

                return false;
        }
    }
2278
2279
    /**
     * Reads the comma separated "-conf" CLI argument and returns its entries.
     *
     * @param QueueCommandLineController $cliObj
     * @return array the trimmed configuration keys, an empty array if "-conf" is empty
     *
     * @deprecated since crawler v6.3.0, will be removed in crawler v7.0.0.
     */
    protected function getConfigurationKeys(QueueCommandLineController $cliObj)
    {
        $rawKeys = trim($cliObj->cli_argValue('-conf'));

        if ($rawKeys != '') {
            return GeneralUtility::trimExplode(',', $rawKeys);
        }

        return [];
    }
2292
2293
    /**
     * Running the functionality of the CLI (crawling URLs from queue)
     *
     * Runs the registered CLI hooks, optionally purges old executed queue
     * entries, reserves a batch of due queue entries for the current process
     * and reads their URLs one after the other.
     *
     * @param int $countInARun maximum number of queue entries selected for this run
     * @param int $sleepTime value handed to usleep() after each processed entry
     * @param int $sleepAfterFinish value handed to sleep() once the batch is done
     * @return int bitmask combined from the results of readUrl() and the self::CLI_STATUS_* constants
     */
    public function CLI_run($countInARun, $sleepTime, $sleepAfterFinish)
    {
        $result = 0;
        $counter = 0;

        // First, run hooks:
        $this->CLI_runHooks();

        // Clean up the queue
        if (intval($this->extensionSettings['purgeQueueDays']) > 0) {
            $purgeDate = $this->getCurrentTime() - 24 * 60 * 60 * intval($this->extensionSettings['purgeQueueDays']);
            $del = $this->db->exec_DELETEquery(
                'tx_crawler_queue',
                'exec_time!=0 AND exec_time<' . $purgeDate
            );
            if (!$del) {
                GeneralUtility::devLog('Records could not be deleted.', 'crawler', LogLevel::INFO);
            }
        }

        // Select entries:
        //TODO Shouldn't this reside within the transaction?
        $rows = $this->db->exec_SELECTgetRows(
            'qid,scheduled',
            'tx_crawler_queue',
            'exec_time=0
                AND process_scheduled= 0
                AND scheduled<=' . $this->getCurrentTime(),
            '',
            'scheduled, qid',
            intval($countInARun)
        );

        // The select may yield null instead of an array; treat that like an empty queue
        // instead of handing null to count() / foreach.
        if (is_array($rows) && count($rows) > 0) {
            $quidList = [];

            foreach ($rows as $r) {
                // Cast to int as the ids are concatenated into the SQL statement below.
                $quidList[] = intval($r['qid']);
            }

            $processId = $this->CLI_buildProcessId();

            // reserve queue entries for process
            $this->db->sql_query('BEGIN');
            //TODO make sure we're not taking assigned queue-entries
            $this->db->exec_UPDATEquery(
                'tx_crawler_queue',
                'qid IN (' . implode(',', $quidList) . ')',
                [
                    'process_scheduled' => intval($this->getCurrentTime()),
                    'process_id' => $processId,
                ]
            );

            // save the number of assigned queue entries to determine how many have been processed later
            $numberOfAffectedRows = $this->db->sql_affected_rows();
            $this->db->exec_UPDATEquery(
                'tx_crawler_process',
                "process_id = '" . $processId . "'",
                [
                    'assigned_items_count' => intval($numberOfAffectedRows),
                ]
            );

            if ($numberOfAffectedRows == count($quidList)) {
                $this->db->sql_query('COMMIT');
            } else {
                // Not every selected entry could be reserved: give up on this batch.
                $this->db->sql_query('ROLLBACK');
                $this->CLI_debug("Nothing processed due to multi-process collision (" . $this->CLI_buildProcessId() . ")");
                return ($result | self::CLI_STATUS_ABORTED);
            }

            foreach ($rows as $r) {
                $result |= $this->readUrl($r['qid']);

                $counter++;
                usleep(intval($sleepTime)); // Just to relax the system

                // if during the start and the current read url the cli has been disable we need to return from the function
                // mark the process NOT as ended.
                if ($this->getDisabled()) {
                    return ($result | self::CLI_STATUS_ABORTED);
                }

                if (!$this->CLI_checkIfProcessIsActive($this->CLI_buildProcessId())) {
                    $this->CLI_debug("conflict / timeout (" . $this->CLI_buildProcessId() . ")");

                    //TODO might need an additional returncode
                    $result |= self::CLI_STATUS_ABORTED;
                    break; //possible timeout
                }
            }

            sleep(intval($sleepAfterFinish));

            $msg = 'Rows: ' . $counter;
            $this->CLI_debug($msg . " (" . $this->CLI_buildProcessId() . ")");
        } else {
            $this->CLI_debug("Nothing within queue which needs to be processed (" . $this->CLI_buildProcessId() . ")");
        }

        if ($counter > 0) {
            $result |= self::CLI_STATUS_PROCESSED;
        }

        return $result;
    }
2408
2409
    /**
     * Activate hooks
     *
     * Instantiates every object reference registered in
     * $GLOBALS['TYPO3_CONF_VARS']['EXTCONF']['crawler']['cli_hooks'] and calls
     * its crawler_init() method with this controller as argument.
     *
     * @return void
     */
    public function CLI_runHooks()
    {
        // Nothing to do (and no undefined-index notice) when no hooks are registered.
        if (!isset($GLOBALS['TYPO3_CONF_VARS']['EXTCONF']['crawler']['cli_hooks'])
            || !is_array($GLOBALS['TYPO3_CONF_VARS']['EXTCONF']['crawler']['cli_hooks'])
        ) {
            return;
        }

        foreach ($GLOBALS['TYPO3_CONF_VARS']['EXTCONF']['crawler']['cli_hooks'] as $objRef) {
            // Plain assignment: taking a reference to a function's return value raises a PHP notice.
            $hookObj = GeneralUtility::getUserObj($objRef);
            if (is_object($hookObj)) {
                $hookObj->crawler_init($this);
            }
        }
    }
2426
2427
    /**
     * Try to acquire a new process with the given id
     * also performs some auto-cleanup for orphan processes
     * @todo preemption might not be the most elegant way to clean up
     *
     * Active processes whose ttl lies in the past count as orphans; they are
     * released and do not count against the process limit.
     *
     * @param string $id identification string for the process
     * @return boolean true if a process record was inserted, false if no system process id
     *                 is available or the process limit is reached
     */
    public function CLI_checkAndAcquireNewProcess($id)
    {
        $systemProcessId = getmypid();
        if ($systemProcessId < 1) {
            return false;
        }

        $runningProcesses = 0;
        $expiredProcessIds = [];

        $this->db->sql_query('BEGIN');

        $res = $this->db->exec_SELECTquery(
            'process_id,ttl',
            'tx_crawler_process',
            'active=1 AND deleted=0'
        );

        $now = $this->getCurrentTime();

        // Sort the active processes into expired (orphaned) and still running ones
        while ($row = $this->db->sql_fetch_assoc($res)) {
            if ($row['ttl'] < $now) {
                $expiredProcessIds[] = $row['process_id'];
                continue;
            }

            $runningProcesses++;
        }

        // only add a new process while the number of running processes is below the limit
        $acquired = $runningProcesses < intval($this->extensionSettings['processLimit']);

        if ($acquired) {
            $this->CLI_debug("add process " . $this->CLI_buildProcessId() . " (" . ($runningProcesses + 1) . "/" . intval($this->extensionSettings['processLimit']) . ")");

            // create new process record
            $processRecord = [
                'process_id' => $id,
                'active' => '1',
                'ttl' => ($now + intval($this->extensionSettings['processMaxRunTime'])),
                'system_process_id' => $systemProcessId,
            ];
            $this->db->exec_INSERTquery('tx_crawler_process', $processRecord);
        } else {
            $this->CLI_debug("Processlimit reached (" . ($runningProcesses) . "/" . intval($this->extensionSettings['processLimit']) . ")");
        }

        // maybe this should be somehow included into the current lock
        $this->CLI_releaseProcesses($expiredProcessIds, true);
        $this->CLI_deleteProcessesMarkedDeleted();

        $this->db->sql_query('COMMIT');

        return $acquired;
    }
2491
2492
    /**
     * Release a process and the required resources
     *
     * Besides deactivating the requested processes this also cleans up after
     * processes that were deactivated earlier: their queue entries are handed
     * back and, once no waiting entries are left, they are flagged as deleted.
     *
     * @param  mixed    $releaseIds   string with a single process-id or array with multiple process-ids
     * @param  boolean  $withinLock   show whether the DB-actions are included within an existing lock
     * @return boolean  false if there was nothing to release, true otherwise
     */
    public function CLI_releaseProcesses($releaseIds, $withinLock = false)
    {
        if (!is_array($releaseIds)) {
            $releaseIds = [$releaseIds];
        }

        if (count($releaseIds) === 0) {
            return false;   //nothing to release
        }

        // Quoted, comma separated id list shared by the statements below. Single quotes are used
        // throughout as double-quoted literals are not portable SQL (e.g. MySQL in ANSI_QUOTES mode).
        $releaseIdList = '\'' . implode('\',\'', $releaseIds) . '\'';

        if (!$withinLock) {
            $this->db->sql_query('BEGIN');
        }

        // some kind of 2nd chance algo - this way you need at least 2 processes to have a real cleanup
        // this ensures that a single process can't mess up the entire process table

        // hand back the queue entries of all processes which are not active (and not deleted yet)
        $this->db->exec_UPDATEquery(
            'tx_crawler_queue',
            'process_id IN (SELECT process_id FROM tx_crawler_process WHERE active=0 AND deleted=0)',
            [
                'process_scheduled' => 0,
                'process_id' => '',
            ]
        );
        // mark all processes as deleted which have no "waiting" queue-entries and which are not active
        $this->db->exec_UPDATEquery(
            'tx_crawler_process',
            'active=0 AND deleted=0
            AND NOT EXISTS (
                SELECT * FROM tx_crawler_queue
                WHERE tx_crawler_queue.process_id = tx_crawler_process.process_id
                AND tx_crawler_queue.exec_time = 0
            )',
            [
                'deleted' => '1',
                'system_process_id' => 0,
            ]
        );
        // mark all requested processes as non-active
        $this->db->exec_UPDATEquery(
            'tx_crawler_process',
            'process_id IN (' . $releaseIdList . ') AND deleted=0',
            [
                'active' => '0',
            ]
        );
        // hand back the not yet executed queue entries of the requested processes
        $this->db->exec_UPDATEquery(
            'tx_crawler_queue',
            'exec_time=0 AND process_id IN (' . $releaseIdList . ')',
            [
                'process_scheduled' => 0,
                'process_id' => '',
            ]
        );

        if (!$withinLock) {
            $this->db->sql_query('COMMIT');
        }

        return true;
    }
2561
2562
    /**
     * Removes all records flagged with deleted = 1 from tx_crawler_process.
     *
     * @return void
     */
    public function CLI_deleteProcessesMarkedDeleted()
    {
        $processTable = 'tx_crawler_process';
        $onlyFlaggedRecords = 'deleted = 1';

        $this->db->exec_DELETEquery($processTable, $onlyFlaggedRecords);
    }
2571
2572
    /**
     * Check if there are still resources left for the process with the given id
     * Used to determine timeouts and to ensure a proper cleanup if there's a timeout
     *
     * Only a single SELECT is issued, therefore no transaction is opened.
     *
     * @param  string  $pid identification string for the process
     * @return boolean true if a non-deleted process record with this id exists and is flagged active
     */
    public function CLI_checkIfProcessIsActive($pid)
    {
        $res = $this->db->exec_SELECTquery(
            'process_id,active,ttl',
            'tx_crawler_process',
            'process_id = \'' . $pid . '\'  AND deleted=0',
            '',
            'ttl',
            '0,1'
        );

        // At most one row is requested; no row means the process is unknown or already deleted.
        $row = $this->db->sql_fetch_assoc($res);
        if (!$row) {
            return false;
        }

        return intval($row['active']) === 1;
    }
2600
2601
    /**
     * Returns the id of the current process; it is generated on first use
     * from the current microtime and kept in $this->processID afterwards.
     *
     * @return string  the ID
     */
    public function CLI_buildProcessId()
    {
        if ($this->processID) {
            return $this->processID;
        }

        $this->processID = GeneralUtility::shortMD5($this->microtime(true));

        return $this->processID;
    }
2613
2614
    /**
     * Thin wrapper around PHP's native microtime().
     * NOTE(review): presumably exists so that the time source can be replaced in tests - confirm.
     *
     * @param bool $get_as_float passed on unchanged to microtime()
     *
     * @return mixed whatever microtime() returns for the given flag
     */
    protected function microtime($get_as_float = false)
    {
        $currentMicrotime = microtime($get_as_float);

        return $currentMicrotime;
    }
2623
2624
    /**
     * Echoes a message followed by a line break and flushes the output;
     * does nothing unless the extension setting "processDebug" is enabled.
     *
     * @param  string $msg  the message
     */
    public function CLI_debug($msg)
    {
        if (!intval($this->extensionSettings['processDebug'])) {
            return;
        }

        echo $msg . "\n";
        flush();
    }
2636
2637
    /**
     * Get URL content by making direct request to TYPO3.
     *
     * The request is performed by running the extension's cli/bootstrap.php
     * with the configured PHP binary as a shell command; the time it took is
     * written to the log.
     *
     * @param  string $url          Page URL
     * @param  int    $crawlerId    Crawler-ID
     * @return array  empty if the URL cannot be parsed, otherwise the keys "request", "headers" and "content"
     */
    protected function sendDirectRequest($url, $crawlerId)
    {
        $urlParts = parse_url($url);
        if (!is_array($urlParts)) {
            return [];
        }

        $headers = $this->buildRequestHeaderArray($urlParts, $crawlerId);

        // Binary, bootstrap script and its three arguments, each escaped for the shell
        $command = implode(' ', [
            escapeshellcmd($this->extensionSettings['phpPath']),
            escapeshellarg(ExtensionManagementUtility::extPath('crawler') . 'cli/bootstrap.php'),
            escapeshellarg($this->getFrontendBasePath()),
            escapeshellarg($url),
            escapeshellarg(base64_encode(serialize($headers))),
        ]);

        $startedAt = microtime(true);
        $content = $this->executeShellCommand($command);
        $this->log($url . ' ' . (microtime(true) - $startedAt));

        return [
            'request' => implode("\r\n", $headers) . "\r\n\r\n",
            'headers' => '',
            'content' => $content,
        ];
    }
2675
2676
    /**
     * Cleans up entries that stayed for too long in the queue. These are:
     * - processed entries older than the "cleanUpProcessedAge" setting (in days)
     * - entries scheduled longer ago than the "cleanUpScheduledAge" setting (in days)
     *
     * NOTE(review): an earlier version of this comment named 1.5 and 7 days -
     * presumably the default values of the two settings; confirm.
     *
     * @return void
     *
     * TODO: Should be switched back to protected - TNM 2018-11-16
     */
    public function cleanUpOldQueueEntries()
    {
        $secondsPerDay = 86400; // 24*60*60 Seconds in 24 hours

        $currentTimestamp = time();
        $processedBefore = $currentTimestamp - $this->extensionSettings['cleanUpProcessedAge'] * $secondsPerDay;
        $scheduledBefore = $currentTimestamp - $this->extensionSettings['cleanUpScheduledAge'] * $secondsPerDay;

        $this->flushQueue(
            '(exec_time<>0 AND exec_time<' . $processedBefore . ') OR scheduled<=' . $scheduledBefore
        );
    }
2694
2695
    /**
     * Initializes a TypoScript Frontend necessary for using TypoScript and TypoLink functions
     *
     * The created frontend controller is stored in $GLOBALS['TSFE'].
     *
     * @param int $id
     * @param int $typeNum
     *
     * @throws \TYPO3\CMS\Core\Error\Http\ServiceUnavailableException
     *
     * @return void
     */
    protected function initTSFE($id = 1, $typeNum = 0)
    {
        EidUtility::initTCA();

        $runsOnVersion7 = VersionNumberUtility::convertVersionNumberToInteger(TYPO3_version) < 8000000;
        if (!$runsOnVersion7 || is_object($GLOBALS['TT'])) {
            GeneralUtility::makeInstance(TimeTracker::class)->start();
        } else {
            // NullTimeTracker and its start() method are deprecated since TYPO3 v8 (removal announced for v9);
            // they are only used on version 7 when no time tracker object exists yet.
            /** @var NullTimeTracker $GLOBALS['TT'] */
            $GLOBALS['TT'] = new NullTimeTracker();
            $GLOBALS['TT']->start();
        }

        // $frontend and $GLOBALS['TSFE'] refer to the same object; the global is set
        // before any of the initialisation steps below is run.
        $frontend = GeneralUtility::makeInstance(TypoScriptFrontendController::class, $GLOBALS['TYPO3_CONF_VARS'], $id, $typeNum);
        $GLOBALS['TSFE'] = $frontend;

        $frontend->sys_page = GeneralUtility::makeInstance(PageRepository::class);
        $frontend->sys_page->init(true);
        $frontend->connectToDB();
        $frontend->initFEuser();
        $frontend->determineId();
        $frontend->initTemplate();
        $frontend->rootLine = $frontend->sys_page->getRootLine($id, '');
        $frontend->getConfigArray();

        // PageGenerator::pagegenInit() is deprecated since TYPO3 v8 (removal announced for v9).
        PageGenerator::pagegenInit();
    }
2730
2731
    /**
     * Returns a md5 hash generated from a serialized configuration array.
     * The entries "paramExpanded" and "URLs" are left out of the hash.
     *
     * @param array $configuration
     *
     * @return string
     */
    protected function getConfigurationHash(array $configuration)
    {
        // $configuration is a by-value copy, so the caller's array stays untouched
        unset($configuration['paramExpanded'], $configuration['URLs']);

        return md5(serialize($configuration));
    }
2743
2744
    /**
     * Check whether the Crawling Protocol should be http or https
     *
     * A crawler configuration of -1 yields false, 1 yields true and 0 hands
     * the decision over to the page configuration. Any other value yields false.
     *
     * @param $crawlerConfiguration
     * @param $pageConfiguration
     *
     * @return bool
     */
    protected function isCrawlingProtocolHttps($crawlerConfiguration, $pageConfiguration)
    {
        // Loose comparisons in this exact order, so that the values are matched
        // the same way a switch statement would match them.
        if ($crawlerConfiguration == -1) {
            return false;
        }

        if ($crawlerConfiguration == 0) {
            return $pageConfiguration;
        }

        return $crawlerConfiguration == 1;
    }
2764
}
2765