Completed
Push — master ( 6a3171...e461ad )
by Tomas Norre
07:14
created

CrawlerController::getUrlsForPageId()   F

Complexity

Conditions 26
Paths 60

Size

Total Lines 133

Duplication

Lines 0
Ratio 0 %

Code Coverage

Tests 59
CRAP Score 26.322

Importance

Changes 0
Metric Value
cc 26
nc 60
nop 2
dl 0
loc 133
ccs 59
cts 64
cp 0.9219
crap 26.322
rs 3.3333
c 0
b 0
f 0

How to fix   Long Method    Complexity   

Long Method

Small methods make your code easier to understand, in particular if combined with a good name. Besides, if your method is small, finding a good name is usually much easier.

For example, if you find yourself adding comments to a method's body, this is usually a good sign to extract the commented part to a new method, and use the comment as a starting point when coming up with a good name for this new method.

Commonly applied refactorings include:

1
<?php
2
namespace AOE\Crawler\Controller;
3
4
/***************************************************************
5
 *  Copyright notice
6
 *
7
 *  (c) 2017 AOE GmbH <[email protected]>
8
 *
9
 *  All rights reserved
10
 *
11
 *  This script is part of the TYPO3 project. The TYPO3 project is
12
 *  free software; you can redistribute it and/or modify
13
 *  it under the terms of the GNU General Public License as published by
14
 *  the Free Software Foundation; either version 3 of the License, or
15
 *  (at your option) any later version.
16
 *
17
 *  The GNU General Public License can be found at
18
 *  http://www.gnu.org/copyleft/gpl.html.
19
 *
20
 *  This script is distributed in the hope that it will be useful,
21
 *  but WITHOUT ANY WARRANTY; without even the implied warranty of
22
 *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
23
 *  GNU General Public License for more details.
24
 *
25
 *  This copyright notice MUST APPEAR in all copies of the script!
26
 ***************************************************************/
27
28
use AOE\Crawler\Command\CrawlerCommandLineController;
29
use AOE\Crawler\Command\FlushCommandLineController;
30
use AOE\Crawler\Command\QueueCommandLineController;
31
use AOE\Crawler\Domain\Model\Configuration;
32
use AOE\Crawler\Domain\Model\Reason;
33
use AOE\Crawler\Domain\Repository\ConfigurationRepository;
34
use AOE\Crawler\Domain\Repository\ProcessRepository;
35
use AOE\Crawler\Domain\Repository\QueueRepository;
36
use AOE\Crawler\Event\EventDispatcher;
37
use AOE\Crawler\Utility\IconUtility;
38
use AOE\Crawler\Utility\SignalSlotUtility;
39
use TYPO3\CMS\Backend\Utility\BackendUtility;
40
use TYPO3\CMS\Backend\Tree\View\PageTreeView;
41
use TYPO3\CMS\Core\Authentication\BackendUserAuthentication;
42
use TYPO3\CMS\Core\Database\DatabaseConnection;
43
use TYPO3\CMS\Core\Log\LogLevel;
44
use TYPO3\CMS\Core\TimeTracker\NullTimeTracker;
45
use TYPO3\CMS\Core\TimeTracker\TimeTracker;
46
use TYPO3\CMS\Core\Utility\DebugUtility;
47
use TYPO3\CMS\Core\Utility\ExtensionManagementUtility;
48
use TYPO3\CMS\Core\Utility\GeneralUtility;
49
use TYPO3\CMS\Core\Utility\MathUtility;
50
use TYPO3\CMS\Core\Utility\VersionNumberUtility;
51
use TYPO3\CMS\Extbase\Object\ObjectManager;
52
use TYPO3\CMS\Frontend\Controller\TypoScriptFrontendController;
53
use TYPO3\CMS\Frontend\Page\PageGenerator;
54
use TYPO3\CMS\Frontend\Page\PageRepository;
55
use TYPO3\CMS\Frontend\Utility\EidUtility;
56
use TYPO3\CMS\Lang\LanguageService;
57
58
/**
59
 * Class CrawlerController
60
 *
61
 * @package AOE\Crawler\Controller
62
 */
63
class CrawlerController
64
{
65
    const CLI_STATUS_NOTHING_PROCCESSED = 0;
66
    const CLI_STATUS_REMAIN = 1; //queue not empty
67
    const CLI_STATUS_PROCESSED = 2; //(some) queue items where processed
68
    const CLI_STATUS_ABORTED = 4; //instance didn't finish
69
    const CLI_STATUS_POLLABLE_PROCESSED = 8;
70
71
    /**
72
     * @var integer
73
     */
74
    public $setID = 0;
75
76
    /**
77
     * @var string
78
     */
79
    public $processID = '';
80
81
    /**
82
     * One hour is max stalled time for the CLI
83
     * If the process had the status "start" for 3600 seconds, it will be regarded stalled and a new process is started
84
     *
85
     * @var integer
86
     */
87
    public $max_CLI_exec_time = 3600;
88
89
    /**
90
     * @var array
91
     */
92
    public $duplicateTrack = [];
93
94
    /**
95
     * @var array
96
     */
97
    public $downloadUrls = [];
98
99
    /**
100
     * @var array
101
     */
102
    public $incomingProcInstructions = [];
103
104
    /**
105
     * @var array
106
     */
107
    public $incomingConfigurationSelection = [];
108
109
    /**
110
     * @var bool
111
     */
112
    public $registerQueueEntriesInternallyOnly = false;
113
114
    /**
115
     * @var array
116
     */
117
    public $queueEntries = [];
118
119
    /**
120
     * @var array
121
     */
122
    public $urlList = [];
123
124
    /**
125
     * @var boolean
126
     */
127
    public $debugMode = false;
128
129
    /**
130
     * @var array
131
     */
132
    public $extensionSettings = [];
133
134
    /**
135
     * Mount Point
136
     *
137
     * @var boolean
138
     */
139
    public $MP = false;
140
141
    /**
142
     * @var string
143
     */
144
    protected $processFilename;
145
146
    /**
147
     * Holds the internal access mode can be 'gui','cli' or 'cli_im'
148
     *
149
     * @var string
150
     */
151
    protected $accessMode;
152
153
    /**
154
     * @var DatabaseConnection
155
     */
156
    private $db;
157
158
    /**
159
     * @var BackendUserAuthentication
160
     */
161
    private $backendUser;
162
163
    /**
164
     * @var integer
165
     */
166
    private $scheduledTime = 0;
167
168
    /**
169
     * @var integer
170
     */
171
    private $reqMinute = 0;
172
173
    /**
174
     * @var bool
175
     */
176
    private $submitCrawlUrls = false;
177
178
    /**
179
     * @var bool
180
     */
181
    private $downloadCrawlUrls = false;
182
183
    /**
184
     * @var QueueRepository
185
     */
186
    protected  $queueRepository;
187
188
    /**
189
     * @var ProcessRepository
190
     */
191
    protected $processRepository;
192
193
    /**
194
     * @var ConfigurationRepository
195
     */
196
    protected $configurationRepository;
197
198
    /**
199
     * Returns the accessMode, which can be gui, cli or cli_im
200
     *
201
     * @return string
202
     */
203 1
    public function getAccessMode()
    {
        // Plain accessor for the value stored by setAccessMode();
        // the property has no default, so it is null until it has been set.
        return $this->accessMode;
    }
207
208
    /**
209
     * @param string $accessMode
210
     */
211 1
    public function setAccessMode($accessMode)
    {
        // Stored as given; no validation against 'gui', 'cli' or 'cli_im' happens here.
        $this->accessMode = $accessMode;
    }
215
216
    /**
217
     * Set disabled status to prevent processes from being processed
218
     *
219
     * @param  bool $disabled (optional, defaults to true)
220
     * @return void
221
     */
222 3
    public function setDisabled($disabled = true)
    {
        // The disabled state is represented by the mere existence of the process file.
        if ($disabled) {
            GeneralUtility::writeFile($this->processFilename, '');
            return;
        }

        // Enabling again: remove the marker file if it is there.
        if (is_file($this->processFilename)) {
            unlink($this->processFilename);
        }
    }
232
233
    /**
234
     * Get disable status
235
     *
236
     * @return bool true if disabled
237
     */
238 3
    public function getDisabled()
    {
        // Disabled exactly when the marker file written by setDisabled() exists.
        return is_file($this->processFilename);
    }
246
247
    /**
248
     * @param string $filenameWithPath
249
     *
250
     * @return void
251
     */
252 4
    public function setProcessFilename($filenameWithPath)
    {
        // Overrides the marker file location that the constructor pre-sets.
        $this->processFilename = $filenameWithPath;
    }
256
257
    /**
258
     * @return string
259
     */
260 1
    public function getProcessFilename()
    {
        // Path of the marker file consulted by getDisabled() / setDisabled().
        return $this->processFilename;
    }
264
265
    /************************************
266
     *
267
     * Getting URLs based on Page TSconfig
268
     *
269
     ************************************/
270
271 43
    /**
     * Fetches the repositories via the object manager, picks up the global
     * database and backend user handles, and loads the extension settings
     * (with defaults for "countInARun" and "processLimit").
     */
    public function __construct()
    {
        $repositoryFactory = GeneralUtility::makeInstance(ObjectManager::class);
        $this->queueRepository = $repositoryFactory->get(QueueRepository::class);
        $this->configurationRepository = $repositoryFactory->get(ConfigurationRepository::class);
        $this->processRepository = $repositoryFactory->get(ProcessRepository::class);

        $this->db = $GLOBALS['TYPO3_DB'];
        $this->backendUser = $GLOBALS['BE_USER'];
        $this->processFilename = PATH_site . 'typo3temp/tx_crawler.proc';

        // read ext_em_conf_template settings and set; anything that does not
        // unserialize to an array is treated as "no settings"
        $storedSettings = unserialize($GLOBALS['TYPO3_CONF_VARS']['EXT']['extConf']['crawler']);
        if (!is_array($storedSettings)) {
            $storedSettings = [];
        }
        $this->setExtensionSettings($storedSettings);

        // set defaults:
        $configuredCount = MathUtility::convertToPositiveInteger($this->extensionSettings['countInARun']);
        if ($configuredCount == 0) {
            $this->extensionSettings['countInARun'] = 100;
        }

        // processLimit is forced into 1..99, falling back to 1
        $this->extensionSettings['processLimit'] = MathUtility::forceIntegerInRange(
            $this->extensionSettings['processLimit'],
            1,
            99,
            1
        );
    }
295
296
    /**
297
     * Sets the extensions settings (unserialized pendant of $TYPO3_CONF_VARS['EXT']['extConf']['crawler']).
298
     *
299
     * @param array $extensionSettings
300
     * @return void
301
     */
302 52
    public function setExtensionSettings(array $extensionSettings)
    {
        // Replaces the complete settings array; nothing is merged with earlier values.
        $this->extensionSettings = $extensionSettings;
    }
306
307
    /**
308
     * Check if the given page should be crawled
309
     *
310
     * @param array $pageRow
311
     * @return false|string false if the page should be crawled (not excluded), otherwise the skip message explaining why it is skipped
312
     */
313 10
    public function checkIfPageShouldBeSkipped(array $pageRow)
    {
        // The checks run in a fixed order and the first one that matches
        // decides the skip message; later checks are not evaluated.

        // if page is hidden
        if (!$this->extensionSettings['crawlHiddenPages'] && $pageRow['hidden']) {
            return 'Because page is hidden';
        }

        // doktypes 3 and 4 as well as everything from 199 upwards are never crawled
        if (GeneralUtility::inList('3,4', $pageRow['doktype']) || $pageRow['doktype'] >= 199) {
            return 'Because doktype is not allowed';
        }

        // doktype lists registered by other extensions
        if (is_array($GLOBALS['TYPO3_CONF_VARS']['EXTCONF']['crawler']['excludeDoktype'])) {
            foreach ($GLOBALS['TYPO3_CONF_VARS']['EXTCONF']['crawler']['excludeDoktype'] as $key => $doktypeList) {
                if (GeneralUtility::inList($doktypeList, $pageRow['doktype'])) {
                    return 'Doktype was excluded by "' . $key . '"';
                }
            }
        }

        // veto hook
        if (is_array($GLOBALS['TYPO3_CONF_VARS']['EXTCONF']['crawler']['pageVeto'])) {
            foreach ($GLOBALS['TYPO3_CONF_VARS']['EXTCONF']['crawler']['pageVeto'] as $key => $func) {
                $params = [
                    'pageRow' => $pageRow,
                ];
                // expects "false" if page is ok and "true" or a skipMessage if this page should _not_ be crawled
                $veto = GeneralUtility::callUserFunction($func, $params, $this);
                if ($veto === false) {
                    continue;
                }

                // no need to execute other hooks if a previous one return a veto
                if (is_string($veto)) {
                    return $veto;
                }

                return 'Veto from hook "' . htmlspecialchars($key) . '"';
            }
        }

        return false;
    }
370
371
    /**
372
     * Wrapper method for getUrlsForPageId()
373
     * It returns an array of configurations and no urls!
374
     *
375
     * @param array $pageRow Page record with at least dok-type and uid columns.
376
     * @param string $skipMessage
377
     * @return array
378
     * @see getUrlsForPageId()
379
     */
380 6
    public function getUrlsForPageRow(array $pageRow, &$skipMessage = '')
    {
        $message = $this->checkIfPageShouldBeSkipped($pageRow);

        if ($message !== false) {
            // Page is excluded: report the reason and return no configurations.
            $skipMessage = $message;

            return [];
        }

        // Compare the numeric value instead of requiring a native integer:
        // page rows read from the database may carry "url_scheme" as the
        // string '2', which a strict "=== 2" would never match. A missing
        // key simply means "do not force SSL".
        $forceSsl = isset($pageRow['url_scheme']) && (int)$pageRow['url_scheme'] === 2;

        $res = $this->getUrlsForPageId($pageRow['uid'], $forceSsl);
        $skipMessage = '';

        return $res;
    }
395
396
    /**
397
     * This method is used to count if there are ANY unprocessed queue entries
398
     * of a given page_id and the configuration which matches a given hash.
399
     * If there are none, we can skip an inner detail check
400
     *
401
     * @param  int $uid
402
     * @param  string $configurationHash
403
     * @return boolean
404
     */
405 7
    protected function noUnprocessedQueueEntriesForPageWithConfigurationHashExist($uid, $configurationHash)
    {
        // The hash is quoted by the database layer and the uid is cast to int
        // before both are placed into the WHERE clause.
        $quotedHash = $this->db->fullQuoteStr($configurationHash, 'tx_crawler_queue');
        $whereClause = 'page_id=' . (int)$uid . ' AND configuration_hash=' . $quotedHash . ' AND exec_time=0';

        $result = $this->db->exec_SELECTquery('count(*) as anz', 'tx_crawler_queue', $whereClause);
        $countRow = $this->db->sql_fetch_assoc($result);

        // exec_time=0 marks entries that have not been executed yet
        return $countRow['anz'] == 0;
    }
413
414
    /**
415
     * Creates a list of URLs from input array (and submits them to queue if asked for)
416
     * See Web > Info module script + "indexed_search"'s crawler hook-client using this!
417
     *
418
     * @param    array        Information about URLs from pageRow to crawl.
419
     * @param    array        Page row
420
     * @param    integer        Unix time to schedule indexing to, typically time()
421
     * @param    integer        Number of requests per minute (creates the interleave between requests)
422
     * @param    boolean        If set, submits the URLs to queue
423
     * @param    boolean        If set (and submitcrawlUrls is false) will fill $downloadUrls with entries)
424
     * @param    array        Array which is passed by reference and contains an id per url to ensure we will not crawl duplicates
425
     * @param    array        Array which will be filled with URLS for download if flag is set.
426
     * @param    array        Array of processing instructions
427
     * @return    string        List of URLs (meant for display in backend module)
428
     *
429
     */
430 4
    public function urlListFromUrlArray(
        array $vv,
        array $pageRow,
        $scheduledTime,
        $reqMinute,
        $submitCrawlUrls,
        $downloadCrawlUrls,
        array &$duplicateTrack,
        array &$downloadUrls,
        array $incomingProcInstructions
    ) {
        $urlList = '';

        // realurl support (thanks to Ingo Renner)
        // The condition is evaluated once and reused inside the loop, so
        // $urlObj is guaranteed to be set whenever the encoding step runs.
        $useRealUrl = ExtensionManagementUtility::isLoaded('realurl') && $vv['subCfg']['realurl'];
        $urlObj = null;

        if ($useRealUrl) {
            /** @var tx_realurl $urlObj */
            $urlObj = GeneralUtility::makeInstance('tx_realurl');

            if (!empty($vv['subCfg']['baseUrl'])) {
                $urlParts = parse_url($vv['subCfg']['baseUrl']);
                $host = strtolower($urlParts['host']);
                $urlObj->host = $host;

                // First pass, finding configuration OR pointer string:
                $urlObj->extConf = isset($GLOBALS['TYPO3_CONF_VARS']['EXTCONF']['realurl'][$urlObj->host]) ? $GLOBALS['TYPO3_CONF_VARS']['EXTCONF']['realurl'][$urlObj->host] : $GLOBALS['TYPO3_CONF_VARS']['EXTCONF']['realurl']['_DEFAULT'];

                // If it turned out to be a string pointer, then look up the real config:
                if (is_string($urlObj->extConf)) {
                    $urlObj->extConf = is_array($GLOBALS['TYPO3_CONF_VARS']['EXTCONF']['realurl'][$urlObj->extConf]) ? $GLOBALS['TYPO3_CONF_VARS']['EXTCONF']['realurl'][$urlObj->extConf] : $GLOBALS['TYPO3_CONF_VARS']['EXTCONF']['realurl']['_DEFAULT'];
                }
            }

            if (!$GLOBALS['TSFE']->sys_page) {
                $GLOBALS['TSFE']->sys_page = GeneralUtility::makeInstance('TYPO3\CMS\Frontend\Page\PageRepository');
            }

            if (!$GLOBALS['TSFE']->tmpl->rootLine[0]['uid']) {
                $GLOBALS['TSFE']->tmpl->rootLine[0]['uid'] = $urlObj->extConf['pagePath']['rootpage_id'];
            }
        }

        if (!is_array($vv['URLs'])) {
            return 'ERROR - no URL generated';
        }

        $configurationHash = $this->getConfigurationHash($vv);
        $skipInnerCheck = $this->noUnprocessedQueueEntriesForPageWithConfigurationHashExist($pageRow['uid'], $configurationHash);

        // Seconds between two scheduled requests. A non-positive rate would
        // otherwise divide by zero, so it is treated as "no interleave".
        $secondsPerRequest = ($reqMinute > 0) ? 60 / $reqMinute : 0;

        foreach ($vv['URLs'] as $urlQuery) {
            if (!$this->drawURLs_PIfilter($vv['subCfg']['procInstrFilter'], $incomingProcInstructions)) {
                continue;
            }

            // Calculate cHash:
            if ($vv['subCfg']['cHash']) {
                /* @var $cacheHash \TYPO3\CMS\Frontend\Page\CacheHashCalculator */
                $cacheHash = GeneralUtility::makeInstance('TYPO3\CMS\Frontend\Page\CacheHashCalculator');
                $urlQuery .= '&cHash=' . $cacheHash->generateForParameters($urlQuery);
            }

            // Create key by which to determine unique-ness:
            $uKey = $urlQuery . '|' . $vv['subCfg']['userGroups'] . '|' . $vv['subCfg']['baseUrl'] . '|' . $vv['subCfg']['procInstrFilter'];

            // realurl support (thanks to Ingo Renner)
            $urlQuery = 'index.php' . $urlQuery;
            if ($useRealUrl) {
                $params = [
                    'LD' => [
                        'totalURL' => $urlQuery,
                    ],
                    'TCEmainHook' => true,
                ];
                $urlObj->encodeSpURL($params);
                $urlQuery = $params['LD']['totalURL'];
            }

            // Scheduled time: offset by the number of URLs tracked so far,
            // then rounded down to the full minute.
            $schTime = $scheduledTime + round(count($duplicateTrack) * $secondsPerRequest);
            $schTime = floor($schTime / 60) * 60;

            // NOTE(review): $urlList is assigned (not appended) in both
            // branches below, so the return value only describes the last
            // processed URL. Kept as is; confirm whether that is intended.
            if (isset($duplicateTrack[$uKey])) {
                // if the url key is registered, just display it and do not resubmit it
                $urlList = '<em><span class="typo3-dimmed">' . htmlspecialchars($urlQuery) . '</span></em><br/>';
            } else {
                $urlList = '[' . date('d.m.y H:i', $schTime) . '] ' . htmlspecialchars($urlQuery);
                $this->urlList[] = '[' . date('d.m.y H:i', $schTime) . '] ' . $urlQuery;

                $theUrl = ($vv['subCfg']['baseUrl'] ? $vv['subCfg']['baseUrl'] : GeneralUtility::getIndpEnv('TYPO3_SITE_URL')) . $urlQuery;

                // Submit for crawling!
                if ($submitCrawlUrls) {
                    $added = $this->addUrl(
                        $pageRow['uid'],
                        $theUrl,
                        $vv['subCfg'],
                        $scheduledTime,
                        $configurationHash,
                        $skipInnerCheck
                    );
                    if ($added === false) {
                        $urlList .= ' (Url already existed)';
                    }
                } elseif ($downloadCrawlUrls) {
                    $downloadUrls[$theUrl] = $theUrl;
                }

                $urlList .= '<br />';
            }
            $duplicateTrack[$uKey] = true;
        }

        return $urlList;
    }
543
544
    /**
545
     * Returns true if input processing instruction is among registered ones.
546
     *
547
     * @param string $piString PI to test
548
     * @param array $incomingProcInstructions Processing instructions
549
     * @return boolean
550
     */
551 5
    public function drawURLs_PIfilter($piString, array $incomingProcInstructions)
    {
        // Without incoming processing instructions nothing is filtered out.
        if (empty($incomingProcInstructions)) {
            return true;
        }

        // Accept as soon as one incoming instruction is part of the comma list.
        foreach ($incomingProcInstructions as $pi) {
            if (GeneralUtility::inList($piString, $pi)) {
                return true;
            }
        }

        // Return an explicit boolean as documented, instead of falling off
        // the end of the method and yielding null.
        return false;
    }
563
564 5
    /**
     * Loads the page TSconfig used for crawling the given page.
     *
     * When a mount point value ($this->MP) is set, the TSconfig is read from
     * the page id found after the "-" of that value instead of from $id.
     * Registered "getPageTSconfigForId" hooks receive the result by reference
     * and may alter it.
     *
     * @param integer $id Page ID
     * @return mixed Result of BackendUtility::getPagesTSconfig() (callers check for an array), possibly altered by hooks
     */
    public function getPageTSconfigForId($id)
    {
        $tsConfigPageId = $id;
        if ($this->MP) {
            list(, $tsConfigPageId) = explode('-', $this->MP);
        }
        $pageTSconfig = BackendUtility::getPagesTSconfig($tsConfigPageId);

        // Call a hook to alter configuration
        $hooks = $GLOBALS['TYPO3_CONF_VARS']['EXTCONF']['crawler']['getPageTSconfigForId'];
        if (is_array($hooks)) {
            // Hooks always get the originally requested page id, not the mount point id.
            $params = [
                'pageId' => $id,
                'pageTSConfig' => &$pageTSconfig,
            ];
            foreach ($hooks as $userFunc) {
                GeneralUtility::callUserFunction($userFunc, $params, $this);
            }
        }

        return $pageTSconfig;
    }
586
587
    /**
588
     * This method returns an array of configurations.
589
     * And no urls!
590
     *
591
     * @param integer $id Page ID
592
     * @param bool $forceSsl Use https
593
     * @return array
594
     *
595
     * TODO: Should be switched back to protected - TNM 2018-11-16
596
     */
597 4
    public function getUrlsForPageId($id, $forceSsl = false)
    {
        // Page TSconfig paramSets are collected first; configuration records
        // found along the rootline never overwrite an already defined key.
        $res = $this->collectConfigurationsFromPageTsConfig($id);
        $res = $this->collectConfigurationsFromRecords($id, $forceSsl, $res);

        // Registered hooks may post-process the result, which is handed over by reference.
        if (is_array($GLOBALS['TYPO3_CONF_VARS']['EXTCONF']['crawler']['processUrls'])) {
            foreach ($GLOBALS['TYPO3_CONF_VARS']['EXTCONF']['crawler']['processUrls'] as $func) {
                $params = [
                    'res' => &$res,
                ];
                GeneralUtility::callUserFunction($func, $params, $this);
            }
        }

        return $res;
    }

    /**
     * Builds the configurations defined in page TSconfig (tx_crawler.crawlerCfg.paramSets).
     *
     * @param integer $id Page ID
     * @return array Configurations keyed by paramSet name, each with origin "pagets"
     */
    private function collectConfigurationsFromPageTsConfig($id)
    {
        $res = [];

        // Get page TSconfig for page ID:
        $pageTSconfig = $this->getPageTSconfigForId($id);

        if (!is_array($pageTSconfig) || !is_array($pageTSconfig['tx_crawler.']['crawlerCfg.'])) {
            return $res;
        }

        $crawlerCfg = $pageTSconfig['tx_crawler.']['crawlerCfg.'];
        if (!is_array($crawlerCfg['paramSets.'])) {
            return $res;
        }

        foreach ($crawlerCfg['paramSets.'] as $key => $values) {
            // Only the "name." sub arrays trigger processing; the plain
            // "name" entry holds the parameter string that is parsed below.
            if (!is_array($values)) {
                continue;
            }

            $key = str_replace('.', '', $key);
            // Sub configuration for a single configuration string:
            $subCfg = (array)$crawlerCfg['paramSets.'][$key . '.'];
            $subCfg['key'] = $key;

            // Normalise the comma list (trim whitespace around items)
            if (strcmp($subCfg['procInstrFilter'], '')) {
                $subCfg['procInstrFilter'] = implode(',', GeneralUtility::trimExplode(',', $subCfg['procInstrFilter']));
            }
            $pidOnlyList = implode(',', GeneralUtility::trimExplode(',', $subCfg['pidsOnly'], true));

            // process configuration if it is not page-specific or if the specific page is the current page:
            if (strcmp($subCfg['pidsOnly'], '') && !GeneralUtility::inList($pidOnlyList, $id)) {
                continue;
            }

            // add trailing slash if not present
            if (!empty($subCfg['baseUrl']) && substr($subCfg['baseUrl'], -1) != '/') {
                $subCfg['baseUrl'] .= '/';
            }

            // Explode, process etc.:
            $res[$key] = [];
            $res[$key]['subCfg'] = $subCfg;
            $res[$key]['paramParsed'] = $this->parseParams($crawlerCfg['paramSets.'][$key]);
            $res[$key]['paramExpanded'] = $this->expandParameters($res[$key]['paramParsed'], $id);
            $res[$key]['origin'] = 'pagets';

            // recognize MP value
            if (!$this->MP) {
                $res[$key]['URLs'] = $this->compileUrls($res[$key]['paramExpanded'], ['?id=' . $id]);
            } else {
                $res[$key]['URLs'] = $this->compileUrls($res[$key]['paramExpanded'], ['?id=' . $id . '&MP=' . $this->MP]);
            }
        }

        return $res;
    }

    /**
     * Adds the configurations stored as tx_crawler_configuration records along the rootline of the page.
     *
     * Keys already present in $res are left untouched.
     *
     * @param integer $id Page ID
     * @param bool $forceSsl Use https
     * @param array $res Configurations collected so far
     * @return array $res extended by the record based configurations
     */
    private function collectConfigurationsFromRecords($id, $forceSsl, array $res)
    {
        // get records along the rootline
        $rootLine = BackendUtility::BEgetRootLine($id);

        foreach ($rootLine as $page) {
            $configurationRecordsForCurrentPage = $this->configurationRepository->getConfigurationRecordsPageUid($page['uid'])->toArray();

            if (!is_array($configurationRecordsForCurrentPage)) {
                continue;
            }

            /** @var Configuration $configurationRecord */
            foreach ($configurationRecordsForCurrentPage as $configurationRecord) {
                // check access to the configuration record
                $hasAccess = empty($configurationRecord->getBeGroups())
                    || $GLOBALS['BE_USER']->isAdmin()
                    || $this->hasGroupAccess($GLOBALS['BE_USER']->user['usergroup_cached_list'], $configurationRecord->getBeGroups());
                if (!$hasAccess) {
                    continue;
                }

                $pidOnlyList = implode(',', GeneralUtility::trimExplode(',', $configurationRecord->getPidsOnly(), true));

                // process configuration if it is not page-specific or if the specific page is the current page:
                if (strcmp($configurationRecord->getPidsOnly(), '') && !GeneralUtility::inList($pidOnlyList, $id)) {
                    continue;
                }

                $key = $configurationRecord->getName();

                // don't overwrite previously defined paramSets
                if (isset($res[$key])) {
                    continue;
                }

                $subCfg = $this->buildSubConfigurationFromRecord($configurationRecord, $key, $forceSsl);

                // Pages listed in the record's exclude string are left out.
                if (in_array($id, $this->expandExcludeString($subCfg['exclude']))) {
                    continue;
                }

                $res[$key] = [];
                $res[$key]['subCfg'] = $subCfg;
                $res[$key]['paramParsed'] = $this->parseParams($configurationRecord->getConfiguration());
                $res[$key]['paramExpanded'] = $this->expandParameters($res[$key]['paramParsed'], $id);
                $res[$key]['URLs'] = $this->compileUrls($res[$key]['paramExpanded'], ['?id=' . $id]);
                $res[$key]['origin'] = 'tx_crawler_configuration_' . $configurationRecord->getUid();
            }
        }

        return $res;
    }

    /**
     * Translates one configuration record into the "subCfg" array structure.
     *
     * @param Configuration $configurationRecord Record to translate
     * @param string $key Configuration name used as result key
     * @param bool $forceSsl Use https
     * @return array Sub configuration; a non-empty baseUrl always ends with a slash
     */
    private function buildSubConfigurationFromRecord($configurationRecord, $key, $forceSsl)
    {
        /* @var $TSparserObject \TYPO3\CMS\Core\TypoScript\Parser\TypoScriptParser */
        $TSparserObject = GeneralUtility::makeInstance('TYPO3\CMS\Core\TypoScript\Parser\TypoScriptParser');
        // Todo: Check where the field processing_instructions_parameters_ts comes from.
        $TSparserObject->parse($configurationRecord->getProcessingInstructionFilter()); //['processing_instruction_parameters_ts']);

        $isCrawlingProtocolHttps = $this->isCrawlingProtocolHttps($configurationRecord->isForceSsl(), $forceSsl);

        $subCfg = [
            'procInstrFilter' => $configurationRecord->getProcessingInstructionFilter(),
            'procInstrParams.' => $TSparserObject->setup,
            'baseUrl' => $this->getBaseUrlForConfigurationRecord(
                $configurationRecord->getBaseUrl(),
                $configurationRecord->getSysDomainBaseUrl(),
                $isCrawlingProtocolHttps
            ),
            'realurl' => $configurationRecord->getRealUrl(),
            'cHash' => $configurationRecord->getCHash(),
            'userGroups' => $configurationRecord->getFeGroups(),
            'exclude' => $configurationRecord->getExcludeText(),
            'rootTemplatePid' => (int) $configurationRecord->getRootTemplatePid(),
            'key' => $key,
        ];

        // add trailing slash if not present
        if (!empty($subCfg['baseUrl']) && substr($subCfg['baseUrl'], -1) != '/') {
            $subCfg['baseUrl'] .= '/';
        }

        return $subCfg;
    }
730
731
    /**
     * Determines the base URL to use for a crawler configuration record.
     *
     * If $sysDomainUid is greater than 0 and a sys_domain record with that uid is found (restricted by
     * BackendUtility::BEenableFields() and BackendUtility::deleteClause()) whose "domainName" is not empty,
     * the URL is built as "<scheme>://<domainName>". In every other case $baseUrl is returned unchanged.
     *
     * @param string $baseUrl Fallback base URL, returned as-is when no domain record is used
     * @param integer $sysDomainUid Uid of the sys_domain record to look up; values <= 0 skip the lookup
     * @param bool $ssl Boolean FALSE selects "http", any other value selects "https"
     * @return string
     */
    protected function getBaseUrlForConfigurationRecord($baseUrl, $sysDomainUid, $ssl = false)
    {
        $sysDomainUid = intval($sysDomainUid);
        if ($sysDomainUid <= 0) {
            return $baseUrl;
        }

        $res = $this->db->exec_SELECTquery(
            '*',
            'sys_domain',
            'uid = ' . $sysDomainUid .
            BackendUtility::BEenableFields('sys_domain') .
            BackendUtility::deleteClause('sys_domain')
        );
        if ($res === false) {
            // The query failed, so there is nothing to fetch or free.
            return $baseUrl;
        }

        $row = $this->db->sql_fetch_assoc($res);
        // Release the result set right away; only the first row is needed.
        $this->db->sql_free_result($res);

        // $row is not an array when no matching record exists.
        if (is_array($row) && $row['domainName'] != '') {
            $urlScheme = ($ssl === false) ? 'http' : 'https';
            return $urlScheme . '://' . $row['domainName'];
        }

        return $baseUrl;
    }
759
760 1
    /**
     * Collects the names of the crawler configurations that apply to a page branch.
     *
     * Names are taken from two sources:
     * - the keys of the "tx_crawler.crawlerCfg.paramSets." page TSconfig of $rootid (only keys holding an
     *   array; a trailing dot is removed from the key),
     * - the "name" field of tx_crawler_configuration records stored on a page of the rootline of $rootid
     *   or on a page of the tree below $rootid that is readable for the current backend user.
     *
     * @param integer $rootid Uid of the page the branch starts at
     * @param integer $depth Depth passed on to PageTreeView::getTree()
     * @return array List of configuration names; the two sources are not de-duplicated
     */
    public function getConfigurationsForBranch($rootid, $depth)
    {
        $configurationsForBranch = [];

        // Configurations defined in page TSconfig:
        $pageTSconfig = $this->getPageTSconfigForId($rootid);
        if (isset($pageTSconfig['tx_crawler.']['crawlerCfg.']['paramSets.'])
            && is_array($pageTSconfig['tx_crawler.']['crawlerCfg.']['paramSets.'])
        ) {
            foreach ($pageTSconfig['tx_crawler.']['crawlerCfg.']['paramSets.'] as $key => $value) {
                if (!is_array($value)) {
                    continue;
                }
                $configurationsForBranch[] = substr($key, -1) == '.' ? substr($key, 0, -1) : $key;
            }
        }

        // Page uids of the rootline ...
        $pids = [];
        foreach (BackendUtility::BEgetRootLine($rootid) as $node) {
            $pids[] = (int)$node['uid'];
        }

        // ... and of the page tree below the root page:
        /* @var PageTreeView $tree */
        $tree = GeneralUtility::makeInstance(PageTreeView::class);
        $perms_clause = $GLOBALS['BE_USER']->getPagePermsClause(1);
        $tree->init('AND ' . $perms_clause);
        $tree->getTree($rootid, $depth, '');
        foreach ($tree->tree as $node) {
            $pids[] = (int)$node['row']['uid'];
        }

        $pids = array_unique($pids);
        if (empty($pids)) {
            // "pid IN ()" would be invalid SQL; without pages there are no records to add.
            return $configurationsForBranch;
        }

        // Configuration records stored on any of those pages:
        $res = $this->db->exec_SELECTquery(
            '*',
            'tx_crawler_configuration',
            'pid IN (' . implode(',', $pids) . ') ' .
            BackendUtility::BEenableFields('tx_crawler_configuration') .
            BackendUtility::deleteClause('tx_crawler_configuration') . ' ' .
            BackendUtility::versioningPlaceholderClause('tx_crawler_configuration') . ' '
        );

        while ($row = $this->db->sql_fetch_assoc($res)) {
            $configurationsForBranch[] = $row['name'];
        }
        $this->db->sql_free_result($res);

        return $configurationsForBranch;
    }
805
806
    /**
     * Tells whether a user may access an item, based on group lists
     * (the user's group list is e.g. available in $GLOBALS['TSFE']->gr_list).
     *
     * @see \TYPO3\CMS\Frontend\Page\PageRepository::getMultipleGroupsWhereClause()
     * @param  string $groupList    Comma-separated list of (fe_)group UIDs of the user
     * @param  string $accessList   Comma-separated list of (fe_)group UIDs that grant access to the item
     * @return bool                 TRUE if the access list is empty or contains at least one of the user's group UIDs
     */
    public function hasGroupAccess($groupList, $accessList)
    {
        // An empty access list does not restrict anything.
        $accessGranted = empty($accessList);

        if (!$accessGranted) {
            $userGroupUids = GeneralUtility::intExplode(',', $groupList);
            foreach ($userGroupUids as $userGroupUid) {
                if (GeneralUtility::inList($accessList, $userGroupUid)) {
                    $accessGranted = true;
                    break;
                }
            }
        }

        return $accessGranted;
    }
827
828
    /**
     * Parses the GET vars of a query string into an array of key => value pairs.
     *
     * The string is split at "&", each part at its first "=". Names and values are passed through
     * rawurldecode(). Parts with an empty name are dropped; a part without "=" gets an empty string as value.
     * If a name occurs more than once, the last occurrence wins.
     *
     * @param string $inputQuery Input query string
     * @return array
     */
    public function parseParams($inputQuery)
    {
        $paramKeyValues = [];

        foreach (explode('&', (string)$inputQuery) as $paramAndValue) {
            // Pad to two elements so a part without "=" does not read an undefined offset.
            list($name, $value) = array_pad(explode('=', $paramAndValue, 2), 2, '');
            if ($name !== '') {
                $paramKeyValues[rawurldecode($name)] = rawurldecode($value);
            }
        }

        return $paramKeyValues;
    }
849
850
    /**
     * Expands the parameter configuration into the individual values of each GET parameter.
     *
     * Syntax of a parameter value:
     * - A value wrapped in [...] is expanded according to the syntax below, any other value is taken literally
     * - The bracket content is split by "|"; the parts are processed one by one and their values are added together
     * - For each part:
     *         - "[int]-[int]" = Integer range, expanded to all values in between, bounds included, from low to high
     *           (at most 1000 values). Example "1-34" or "-40--30"
     *         - "_TABLE:[TCA table name];[_PID:[optional page id, default is current page]];[_ENABLELANG:1]" = Look up
     *           of table records from PID, filtering out deleted records. Example "_TABLE:tt_content; _PID:123"
     *           _ENABLELANG:1 picks only original records without their language overlays.
     *           Further sub-parameters read here: _PIDFIELD (default "pid"), _FIELD (default "uid"; must be "uid"
     *           or a column configured in TCA), _WHERE and _ADDTABLE (both appended verbatim to the query).
     *         - Default: Literal value
     * - After each part, the functions registered in
     *   $GLOBALS['TYPO3_CONF_VARS']['SC_OPTIONS']['crawler/class.tx_crawler_lib.php']['expandParameters'] are called.
     *
     * @param array $paramArray Array with key (GET var name) and values (value of GET var which is configuration for expansion)
     * @param integer $pid Current page ID
     * @return array Array with key (GET var name) and the list of expanded values
     */
    public function expandParameters($paramArray, $pid)
    {
        // Traverse parameter names:
        foreach ($paramArray as $p => $v) {
            $v = trim($v);

            // Without enclosing square brackets the value is literal and becomes the only value of the set:
            if (substr($v, 0, 1) !== '[' || substr($v, -1) !== ']') {
                $paramArray[$p] = [$v];
                continue;
            }

            // Take the content between the brackets and reset the paramArray value as an array.
            $paramArray[$p] = [];

            // Explode parts and traverse them:
            foreach (explode('|', substr($v, 1, -1)) as $pV) {
                if (preg_match('/^(-?[0-9]+)\s*-\s*(-?[0-9]+)$/', trim($pV), $reg)) {
                    // Integer range (e.g. 1-34 or -40--30, which reads minus 40 to minus 30).
                    $rangeStart = $reg[1];
                    $rangeEnd = $reg[2];

                    // Swap if first is larger than last:
                    if ($rangeStart > $rangeEnd) {
                        $rangeStart = $reg[2];
                        $rangeEnd = $reg[1];
                    }

                    // Traverse range, add values:
                    $runAwayBrake = 1000; // Limit to size of range!
                    for ($a = $rangeStart; $a <= $rangeEnd; $a++) {
                        $paramArray[$p][] = $a;
                        $runAwayBrake--;
                        if ($runAwayBrake <= 0) {
                            break;
                        }
                    }
                } elseif (substr(trim($pV), 0, 7) == '_TABLE:') {
                    // Parse the ";"-separated "key:value" sub-parameters:
                    $subpartParams = [];
                    foreach (GeneralUtility::trimExplode(';', $pV) as $spV) {
                        // Pad so that a sub-part without ":" yields a NULL value instead of an undefined offset.
                        list($pKey, $pVal) = array_pad(GeneralUtility::trimExplode(':', $spV), 2, null);
                        $subpartParams[$pKey] = $pVal;
                    }
                    $table = $subpartParams['_TABLE'];

                    // Table exists:
                    if (isset($GLOBALS['TCA'][$table])) {
                        $lookUpPid = isset($subpartParams['_PID']) ? intval($subpartParams['_PID']) : $pid;
                        $pidField = isset($subpartParams['_PIDFIELD']) ? trim($subpartParams['_PIDFIELD']) : 'pid';
                        $where = isset($subpartParams['_WHERE']) ? $subpartParams['_WHERE'] : '';
                        $addTable = isset($subpartParams['_ADDTABLE']) ? $subpartParams['_ADDTABLE'] : '';
                        $fieldName = !empty($subpartParams['_FIELD']) ? $subpartParams['_FIELD'] : 'uid';

                        // Only "uid" or a column known to TCA may be selected:
                        if ($fieldName === 'uid' || !empty($GLOBALS['TCA'][$table]['columns'][$fieldName])) {
                            $andWhereLanguage = '';
                            $transOrigPointerField = isset($GLOBALS['TCA'][$table]['ctrl']['transOrigPointerField'])
                                ? $GLOBALS['TCA'][$table]['ctrl']['transOrigPointerField']
                                : '';

                            if (!empty($subpartParams['_ENABLELANG']) && $transOrigPointerField) {
                                $andWhereLanguage = ' AND ' . $this->db->quoteStr($transOrigPointerField, $table) . ' <= 0 ';
                            }

                            $where = $this->db->quoteStr($pidField, $table) . '=' . intval($lookUpPid) . ' ' .
                                $andWhereLanguage . $where;

                            // The last argument makes the field value the key of each returned row.
                            $rows = $this->db->exec_SELECTgetRows(
                                $fieldName,
                                $table . $addTable,
                                $where . BackendUtility::deleteClause($table),
                                '',
                                '',
                                '',
                                $fieldName
                            );

                            if (is_array($rows)) {
                                $paramArray[$p] = array_merge($paramArray[$p], array_keys($rows));
                            }
                        }
                    }
                } else {
                    // Just add value:
                    $paramArray[$p][] = $pV;
                }

                // Hook for processing own expandParameters place holder
                $hooks = isset($GLOBALS['TYPO3_CONF_VARS']['SC_OPTIONS']['crawler/class.tx_crawler_lib.php']['expandParameters'])
                    ? $GLOBALS['TYPO3_CONF_VARS']['SC_OPTIONS']['crawler/class.tx_crawler_lib.php']['expandParameters']
                    : null;
                if (is_array($hooks)) {
                    $_params = [
                        'pObj' => &$this,
                        'paramArray' => &$paramArray,
                        'currentKey' => $p,
                        'currentValue' => $pV,
                        'pid' => $pid,
                    ];
                    foreach ($hooks as $_funcRef) {
                        GeneralUtility::callUserFunction($_funcRef, $_params, $this);
                    }
                }
            }

            // Make unique set of values and sort array by key:
            $paramArray[$p] = array_unique($paramArray[$p]);
            ksort($paramArray);
        }

        return $paramArray;
    }
975
976
    /**
     * Compiles URLs from a parameter array (output of expandParameters()).
     *
     * Every URL in $urls is combined with every value of the first parameter; the method then calls itself
     * with the remaining parameters. The number of URLs is therefore the product of the number of values of
     * each parameter, limited per level to the extension setting "maxCompileUrls" (forced into the range
     * 1 - 1000000000, default 10000). A value that is an empty string leaves the URL unchanged.
     *
     * @param array $paramArray Output of expandParameters(): Array with keys (GET var names) and for each an array of values
     * @param array $urls URLs accumulated in this array (for recursion)
     * @return array
     */
    public function compileUrls($paramArray, $urls = [])
    {
        if (empty($paramArray) || !is_array($urls)) {
            return $urls;
        }

        // Shift first parameter off the stack:
        reset($paramArray);
        $varName = key($paramArray);
        $valueSet = array_shift($paramArray);

        // The limit does not change while compiling, so resolve it once.
        $maxUrls = MathUtility::forceIntegerInRange($this->extensionSettings['maxCompileUrls'], 1, 1000000000, 10000);

        // Traverse value set:
        $newUrls = [];
        foreach ($urls as $url) {
            foreach ($valueSet as $val) {
                if (count($newUrls) >= $maxUrls) {
                    // Leave both loops; leaving only the inner one would let every further URL add one more entry.
                    break 2;
                }
                $newUrls[] = $url . ((string)$val !== '' ? '&' . rawurlencode($varName) . '=' . rawurlencode($val) : '');
            }
        }

        return $this->compileUrls($paramArray, $newUrls);
    }
1009
1010
    /************************************
1011
     *
1012
     * Crawler log
1013
     *
1014
     ************************************/
1015
1016
    /**
     * Returns the records of the crawler queue for a page ID, or flushes them.
     *
     * @param integer $id Page ID for which to look up log entries.
     * @param string $filter Filter: "pending" => entries that are not yet run, "finished" => completed entries, anything else => all entries
     * @param boolean $doFlush If TRUE, the matching entries are DELETED(!) instead of selected and an empty array is returned
     * @param boolean $doFullFlush Together with $doFlush: flush the entries of all pages (still restricted by $filter)
     * @param integer $itemsPerPage Maximum number of entries to return, default is 10; values <= 0 disable the limit
     * @return array
     */
    public function getLogEntriesForPageId($id, $filter = '', $doFlush = false, $doFullFlush = false, $itemsPerPage = 10)
    {
        $pageId = intval($id);

        // Translate the filter name into an additional WHERE condition:
        $addWhere = '';
        if ($filter == 'pending') {
            $addWhere = ' AND exec_time=0';
        } elseif ($filter == 'finished') {
            $addWhere = ' AND exec_time>0';
        }

        if (!$doFlush) {
            $limit = intval($itemsPerPage);

            return $this->db->exec_SELECTgetRows(
                '*',
                'tx_crawler_queue',
                'page_id=' . $pageId . $addWhere,
                '',
                'scheduled DESC',
                ($limit > 0 ? $limit : '')
            );
        }

        // FIXME: Write unit test that ensures that the right records are deleted.
        $flushScope = $doFullFlush ? '1=1' : ('page_id=' . $pageId);
        $this->flushQueue($flushScope . $addWhere);

        return [];
    }
1055
1056
    /**
     * Returns the records of the crawler queue for a set ID, or flushes them.
     *
     * @param integer $set_id Set ID for which to look up log entries.
     * @param string $filter Filter: "pending" => entries that are not yet run, "finished" => completed entries, anything else => all entries
     * @param boolean $doFlush If TRUE, the matching entries are DELETED(!) instead of selected and an empty array is returned
     * @param boolean $doFullFlush Together with $doFlush: call flushQueue() with an empty condition instead of the set ID/filter condition
     * @param integer $itemsPerPage Maximum number of entries to return, default is 10; values <= 0 disable the limit
     * @return array
     */
    public function getLogEntriesForSetId($set_id, $filter = '', $doFlush = false, $doFullFlush = false, $itemsPerPage = 10)
    {
        $setId = intval($set_id);

        // FIXME: Write Unit tests for Filters
        $addWhere = '';
        if ($filter == 'pending') {
            $addWhere = ' AND exec_time=0';
        } elseif ($filter == 'finished') {
            $addWhere = ' AND exec_time>0';
        }

        if (!$doFlush) {
            $limit = intval($itemsPerPage);

            return $this->db->exec_SELECTgetRows(
                '*',
                'tx_crawler_queue',
                'set_id=' . $setId . $addWhere,
                '',
                'scheduled DESC',
                ($limit > 0 ? $limit : '')
            );
        }

        // FIXME: Write unit test that ensures that the right records are deleted.
        $flushWhere = '';
        if (!$doFullFlush) {
            $flushWhere = 'set_id=' . $setId . $addWhere;
        }
        $this->flushQueue($flushWhere);

        return [];
    }
1094
1095
    /**
     * Removes queue entries.
     *
     * Before the entries are deleted, the "queueEntryFlush" event and the SIGNAL_QUEUE_ENTRY_FLUSH signal are
     * sent once per affected set ID, each with the entries of that set which are about to be removed. This is
     * only done when an observer or a signal is registered.
     *
     * @param string $where SQL related filter for the entries which should be removed; an empty string removes all entries
     * @return void
     */
    protected function flushQueue($where = '')
    {
        $realWhere = strlen($where) > 0 ? $where : '1=1';

        $hasObserver = EventDispatcher::getInstance()->hasObserver('queueEntryFlush');
        $hasSignal = SignalSlotUtility::hasSignal(__CLASS__, SignalSlotUtility::SIGNAL_QUEUE_ENTRY_FLUSH);

        if ($hasObserver || $hasSignal) {
            $groups = $this->db->exec_SELECTgetRows('DISTINCT set_id', 'tx_crawler_queue', $realWhere);
            if (is_array($groups)) {
                foreach ($groups as $group) {
                    // Fetch the affected entries once and hand the same list to both notification mechanisms.
                    // NOTE(review): other queries in this class address queue rows by "qid"; confirm that
                    // tx_crawler_queue really has a "uid" column, otherwise this query cannot succeed.
                    $entries = $this->db->exec_SELECTgetRows(
                        'uid, set_id',
                        'tx_crawler_queue',
                        $realWhere . ' AND set_id=' . intval($group['set_id'])
                    );
                    if (!is_array($entries)) {
                        // exec_SELECTgetRows() did not deliver rows (e.g. the query failed);
                        // the receivers only accept an array.
                        continue;
                    }

                    // The event dispatcher is deprecated since crawler v6.4.0, will be removed in crawler v7.0.0.
                    // Please use the Signal instead.
                    if ($hasObserver) {
                        EventDispatcher::getInstance()->post('queueEntryFlush', $group['set_id'], $entries);
                    }

                    if ($hasSignal) {
                        SignalSlotUtility::emitSignal(
                            __CLASS__,
                            SignalSlotUtility::SIGNAL_QUEUE_ENTRY_FLUSH,
                            $entries
                        );
                    }
                }
            }
        }

        $GLOBALS['TYPO3_DB']->exec_DELETEquery('tx_crawler_queue', $realWhere);
    }
1134
1135
    /**
     * Adds a call back entry to the queue (typically called from hooks, see indexed search class "class.crawler.php").
     *
     * @param integer $setId Set ID
     * @param array $params Parameters to pass to the call back function; anything but an array is replaced by an empty array
     * @param string $callBack Call back object reference, eg. 'EXT:indexed_search/class.crawler.php:&tx_indexedsearch_crawler'
     * @param integer $page_id Page ID to attach it to
     * @param integer $schedule Time at which to activate; 0 means the current time
     * @return void
     */
    public function addQueueEntry_callBack($setId, $params, $callBack, $page_id = 0, $schedule = 0)
    {
        $callBackParams = is_array($params) ? $params : [];
        $callBackParams['_CALLBACKOBJ'] = $callBack;

        // Fall back to "now" when no schedule time is given:
        $scheduledTime = intval($schedule);
        if (!$scheduledTime) {
            $scheduledTime = $this->getCurrentTime();
        }

        $this->db->exec_INSERTquery(
            'tx_crawler_queue',
            [
                'page_id' => intval($page_id),
                'parameters' => serialize($callBackParams),
                'scheduled' => $scheduledTime,
                'exec_time' => 0,
                'set_id' => intval($setId),
                'result_data' => '',
            ]
        );
    }
1164
1165
    /************************************
1166
     *
1167
     * URL setting
1168
     *
1169
     ************************************/
1170
1171
    /**
     * Puts a URL into the crawler queue.
     *
     * Depending on $this->registerQueueEntriesInternallyOnly the entry is either only collected in
     * $this->queueEntries or written to tx_crawler_queue. Before writing, an equal entry is looked up
     * (unless $skipInnerDuplicationCheck is set); if one exists, nothing is inserted and the
     * "duplicateUrlInQueue" event/signal is sent, otherwise the "urlAddedToQueue" event/signal is sent.
     *
     * @param integer $id Page ID
     * @param string $url Complete URL
     * @param array $subCfg Sub configuration array (from TS config)
     * @param integer $tstamp Scheduled-time
     * @param string $configurationHash (optional) configuration hash
     * @param bool $skipInnerDuplicationCheck (optional) skip inner duplication check
     * @return bool TRUE only if a new record was inserted into tx_crawler_queue
     */
    public function addUrl(
        $id,
        $url,
        array $subCfg,
        $tstamp,
        $configurationHash = '',
        $skipInnerDuplicationCheck = false
    ) {
        // Creating parameters (they are serialized and hashed below, so the key order is kept stable):
        $parameters = ['url' => $url];

        // fe user group simulation:
        $feGroupList = implode(',', array_unique(GeneralUtility::intExplode(',', $subCfg['userGroups'], true)));
        if ($feGroupList) {
            $parameters['feUserGroupList'] = $feGroupList;
        }

        // Setting processing instructions
        $parameters['procInstructions'] = GeneralUtility::trimExplode(',', $subCfg['procInstrFilter']);
        if (is_array($subCfg['procInstrParams.'])) {
            $parameters['procInstrParams'] = $subCfg['procInstrParams.'];
        }

        // Possible TypoScript Template Parents
        $parameters['rootTemplatePid'] = $subCfg['rootTemplatePid'];

        // Compile value array:
        $serializedParameters = serialize($parameters);
        $fieldArray = [
            'page_id' => intval($id),
            'parameters' => $serializedParameters,
            'parameters_hash' => GeneralUtility::shortMD5($serializedParameters),
            'configuration_hash' => $configurationHash,
            'scheduled' => $tstamp,
            'exec_time' => 0,
            'set_id' => intval($this->setID),
            'result_data' => '',
            'configuration' => $subCfg['key'],
        ];

        if ($this->registerQueueEntriesInternallyOnly) {
            // The entry is only registered, not stored to the database.
            $this->queueEntries[] = $fieldArray;

            return false;
        }

        // Check if there is already an equal entry:
        $duplicates = $skipInnerDuplicationCheck ? [] : $this->getDuplicateRowsIfExist($tstamp, $fieldArray);

        if (count($duplicates) != 0) {
            $signalPayload = ['rows' => $duplicates, 'fieldArray' => $fieldArray];

            // The event dispatcher is deprecated since crawler v6.4.0, will be removed in crawler v7.0.0.
            // Please use the Signal instead.
            EventDispatcher::getInstance()->post('duplicateUrlInQueue', $this->setID, $signalPayload);

            SignalSlotUtility::emitSignal(
                __CLASS__,
                SignalSlotUtility::SIGNAL_DUPLICATE_URL_IN_QUEUE,
                $signalPayload
            );

            return false;
        }

        $this->db->exec_INSERTquery('tx_crawler_queue', $fieldArray);
        $uid = $this->db->sql_insert_id();
        $signalPayload = ['uid' => $uid, 'fieldArray' => $fieldArray];

        // The event dispatcher is deprecated since crawler v6.4.0, will be removed in crawler v7.0.0.
        // Please use the Signal instead.
        EventDispatcher::getInstance()->post('urlAddedToQueue', $this->setID, $signalPayload);

        SignalSlotUtility::emitSignal(
            __CLASS__,
            SignalSlotUtility::SIGNAL_URL_ADDED_TO_QUEUE,
            $signalPayload
        );

        return true;
    }
1269
1270
    /**
     * Determines the duplicates of a queue entry, i.e. queue entries for the same page with the same
     * parameters hash which are neither executed nor assigned to a process.
     *
     * If the timestamp is not in the future, every such entry scheduled up to now counts as a duplicate.
     * If the timestamp is in the future, only entries scheduled for exactly that timestamp count.
     *
     * @param int $tstamp Scheduled time of the entry to check
     * @param array $fieldArray Field array of the entry; "page_id" and "parameters_hash" are used
     *
     * @return array List of the qid values of the duplicates
     */
    protected function getDuplicateRowsIfExist($tstamp, $fieldArray)
    {
        // The timestamp ends up in the SQL condition, so make sure it is an integer.
        $tstamp = intval($tstamp);
        $currentTime = $this->getCurrentTime();

        if ($tstamp > $currentTime) {
            // An entry with a timestamp in the future needs to have the same schedule time.
            $where = 'scheduled = ' . $tstamp;
        } elseif (!empty($this->extensionSettings['enableTimeslot'])) {
            // The entry is scheduled with "now"; with time slots enabled, entries up to 100 seconds ahead match as well.
            $timeBegin = $currentTime - 100;
            $timeEnd = $currentTime + 100;
            $where = ' ((scheduled BETWEEN ' . $timeBegin . ' AND ' . $timeEnd . ' ) OR scheduled <= ' . $currentTime . ') ';
        } else {
            $where = 'scheduled <= ' . $currentTime;
        }

        $result = $this->db->exec_SELECTgetRows(
            'qid',
            'tx_crawler_queue',
            $where .
            ' AND NOT exec_time' .
            ' AND NOT process_id ' .
            ' AND page_id=' . intval($fieldArray['page_id']) .
            ' AND parameters_hash = ' . $this->db->fullQuoteStr($fieldArray['parameters_hash'], 'tx_crawler_queue')
        );

        $rows = [];
        if (is_array($result)) {
            foreach ($result as $value) {
                $rows[] = $value['qid'];
            }
        }

        return $rows;
    }
1320
1321
    /**
     * Provides the current system time as returned by time().
     *
     * @return int
     */
    public function getCurrentTime()
    {
        $now = time();

        return $now;
    }
1330
1331
    /************************************
1332
     *
1333
     * URL reading
1334
     *
1335
     ************************************/
1336
1337
    /**
     * Reads the URL of a single queue entry and stores the result in the queue record.
     *
     * The entry is locked by setting its exec_time, then processed through readUrl_exec(); the serialized
     * result is written to "result_data". The QUEUEITEM_PREPROCESS and QUEUEITEM_POSTPROCESS signals are
     * emitted before locking and before storing the result.
     *
     * @param integer $queueId
     * @param boolean $force If set, will process even if exec_time has been set!
     * @return integer|null 0, or self::CLI_STATUS_POLLABLE_PROCESSED if a registered pollable processing
     *                      instruction reported success; NULL if no matching queue entry was found
     */
    public function readUrl($queueId, $force = false)
    {
        $ret = 0;
        if ($this->debugMode) {
            GeneralUtility::devlog('crawler-readurl start ' . microtime(true), __FUNCTION__);
        }

        // Get entry:
        $queueRows = $this->db->exec_SELECTgetRows(
            '*',
            'tx_crawler_queue',
            'qid=' . intval($queueId) . ($force ? '' : ' AND exec_time=0 AND process_scheduled > 0')
        );
        // The query may deliver no array or an empty one, so take the first row without assuming it exists.
        $queueRec = is_array($queueRows) ? reset($queueRows) : null;

        if (!is_array($queueRec)) {
            return null;
        }

        $parameters = unserialize($queueRec['parameters']);
        if (is_array($parameters) && !empty($parameters['rootTemplatePid'])) {
            $this->initTSFE((int)$parameters['rootTemplatePid']);
        } else {
            GeneralUtility::sysLog(
                'Page with (' . $queueRec['page_id'] . ') could not be crawled, please check your crawler configuration. Perhaps no Root Template Pid is set',
                'crawler',
                GeneralUtility::SYSLOG_SEVERITY_WARNING
            );
        }

        $signalPayload = [$queueId, $queueRec];
        SignalSlotUtility::emitSignal(
            __CLASS__,
            SignalSlotUtility::SIGNAL_QUEUEITEM_PREPROCESS,
            $signalPayload
        );

        // Set exec_time to lock record:
        $field_array = ['exec_time' => $this->getCurrentTime()];

        if (isset($this->processID)) {
            // If multiprocessing is used we need to store the id of the process which has handled this entry.
            $field_array['process_id_completed'] = $this->processID;
        }
        $this->db->exec_UPDATEquery('tx_crawler_queue', 'qid=' . intval($queueId), $field_array);

        $result = $this->readUrl_exec($queueRec);

        // NOTE(review): unserialize() is applied to the content delivered by readUrl_exec(). If that content can be
        // influenced by the crawled response, it should be treated as untrusted input — confirm and restrict.
        $resultData = (is_array($result) && isset($result['content'])) ? unserialize($result['content']) : false;

        // atm there's no need to point to specific pollable extensions
        $pollables = isset($GLOBALS['TYPO3_CONF_VARS']['EXTCONF']['crawler']['pollSuccess'])
            ? $GLOBALS['TYPO3_CONF_VARS']['EXTCONF']['crawler']['pollSuccess']
            : null;
        if (is_array($pollables)) {
            $procInstructions = (is_array($resultData) && isset($resultData['parameters']['procInstructions']))
                ? $resultData['parameters']['procInstructions']
                : null;

            foreach ($pollables as $pollable) {
                // Only check the success value if the instruction is running.
                // It is important to name the pollSuccess key same as the procInstructions key.
                if (is_array($procInstructions)
                    && in_array($pollable, $procInstructions)
                    && !empty($resultData['success'][$pollable])
                ) {
                    $ret |= self::CLI_STATUS_POLLABLE_PROCESSED;
                }
            }
        }

        // Set result in log which also denotes the end of the processing of this entry.
        $field_array = ['result_data' => serialize($result)];

        $signalPayload = [$queueId, $field_array];
        SignalSlotUtility::emitSignal(
            __CLASS__,
            SignalSlotUtility::SIGNAL_QUEUEITEM_POSTPROCESS,
            $signalPayload
        );

        $this->db->exec_UPDATEquery('tx_crawler_queue', 'qid=' . intval($queueId), $field_array);

        if ($this->debugMode) {
            GeneralUtility::devlog('crawler-readurl stop ' . microtime(true), __FUNCTION__);
        }

        return $ret;
    }
1426
1427
    /**
     * Stores a not-yet-inserted queue entry, crawls it immediately and logs the outcome.
     *
     * The entry is inserted with exec_time set to the current time, handed to
     * readUrl_exec(), and its serialized result is written to the result_data
     * column after the post-process signal has been emitted.
     *
     * @param array $field_array Queue field array
     *
     * @return mixed The value returned by readUrl_exec() for this entry
     */
    public function readUrlFromArray($field_array)
    {
        // Stamp exec_time before inserting so the record is locked from the start:
        $field_array['exec_time'] = $this->getCurrentTime();
        $this->db->exec_INSERTquery('tx_crawler_queue', $field_array);

        $insertedQid = $this->db->sql_insert_id();
        $field_array['qid'] = $insertedQid;

        $crawlResult = $this->readUrl_exec($field_array);

        // Writing result_data marks the end of the processing of this entry.
        $logFields = ['result_data' => serialize($crawlResult)];

        $postProcessPayload = [$insertedQid, $logFields];
        SignalSlotUtility::emitSignal(
            __CLASS__,
            SignalSlotUtility::SIGNAL_QUEUEITEM_POSTPROCESS,
            $postProcessPayload
        );

        $this->db->exec_UPDATEquery('tx_crawler_queue', 'qid=' . intval($insertedQid), $logFields);

        return $crawlResult;
    }
1458
1459
    /**
     * Read URL for a queue record
     *
     * Depending on the decoded parameters this either delegates to a callback
     * object (key "_CALLBACKOBJ") or performs a regular frontend request.
     *
     * @param array $queueRec Queue record
     * @return array|string|boolean Result array; the string "ERROR" if the parameters could not be
     *                              decoded into an array; or whatever requestUrl() returned (may be false)
     */
    public function readUrl_exec($queueRec)
    {
        // Decode parameters:
        $parameters = unserialize($queueRec['parameters']);
        $result = 'ERROR';
        if (is_array($parameters)) {
            if (!empty($parameters['_CALLBACKOBJ'])) { // Calling object:
                $objRef = $parameters['_CALLBACKOBJ'];
                // Plain assignment: binding a function's return value by reference raises
                // "Only variables should be assigned by reference".
                $callBackObj = GeneralUtility::getUserObj($objRef);
                if (is_object($callBackObj)) {
                    unset($parameters['_CALLBACKOBJ']);
                    $result = ['content' => serialize($callBackObj->crawler_execute($parameters, $this))];
                } else {
                    $result = ['content' => 'No object: ' . $objRef];
                }
            } else { // Regular FE request:

                // Prepare: the crawler id is "<qid>:<hash>", which fe_init() recomputes to authenticate the request.
                $crawlerId = $queueRec['qid'] . ':' . md5($queueRec['qid'] . '|' . $queueRec['set_id'] . '|' . $GLOBALS['TYPO3_CONF_VARS']['SYS']['encryptionKey']);

                // Get result:
                $result = $this->requestUrl($parameters['url'], $crawlerId);

                // The event dispatcher is deprecated since crawler v6.4.0, will be removed in crawler v7.0.0.
                // Please use the Signal instead.
                EventDispatcher::getInstance()->post('urlCrawled', $queueRec['set_id'], ['url' => $parameters['url'], 'result' => $result]);

                $signalPayload = ['url' => $parameters['url'], 'result' => $result];
                SignalSlotUtility::emitSignal(
                    __CLASS__,
                    SignalSlotUtility::SIGNAL_URL_CRAWLED,
                    $signalPayload
                );
            }
        }

        return $result;
    }
1503
1504
    /**
     * Gets the content of a URL.
     *
     * Unless direct requests are enabled in the extension settings, the request is sent
     * as a raw HTTP/1.0 GET over a socket (optionally through the configured proxy).
     * With the "follow30x" setting, redirects are followed recursively.
     *
     * @param string $originalUrl URL to read
     * @param string $crawlerId Crawler ID string (qid + hash to verify)
     * @param integer $timeout Connection timeout passed to fsockopen()
     * @param integer $recursion Recursion limiter for 302 redirects
     * @return array|boolean Result array (keys "request", "headers", "content" and, after a redirect,
     *                       "parentRequest") or false on failure
     */
    public function requestUrl($originalUrl, $crawlerId, $timeout = 2, $recursion = 10)
    {
        if (!$recursion) {
            return false;
        }

        // Parse URL, checking for scheme:
        $url = parse_url($originalUrl);

        if ($url === false) {
            if (TYPO3_DLOG) {
                // Log the raw input; $url is false at this point.
                GeneralUtility::devLog(sprintf('Could not parse_url() for string "%s"', $originalUrl), 'crawler', 4, ['crawlerId' => $crawlerId]);
            }
            return false;
        }

        $scheme = isset($url['scheme']) ? $url['scheme'] : '';
        if (!in_array($scheme, ['', 'http', 'https'], true)) {
            if (TYPO3_DLOG) {
                GeneralUtility::devLog(sprintf('Scheme does not match for url "%s"', $originalUrl), 'crawler', 4, ['crawlerId' => $crawlerId]);
            }
            return false;
        }

        // direct request
        if ($this->extensionSettings['makeDirectRequests']) {
            return $this->sendDirectRequest($originalUrl, $crawlerId);
        }

        $reqHeaders = $this->buildRequestHeaderArray($url, $crawlerId);

        // thanks to Pierrick Caillon for adding proxy support
        $rurl = $url;

        if ($this->extensionSettings['curlUse'] && $this->extensionSettings['curlProxyServer']) {
            $rurl = parse_url($this->extensionSettings['curlProxyServer']);
            // A proxy expects the absolute URL in the request line.
            $url['path'] = $scheme . '://' . $url['host']
                . (!empty($url['port']) ? ':' . $url['port'] : '')
                . (isset($url['path']) ? $url['path'] : '');
            $reqHeaders = $this->buildRequestHeaderArray($url, $crawlerId);
        }

        $host = $rurl['host'];

        if ($scheme == 'https') {
            $host = 'ssl://' . $host;
            $port = !empty($rurl['port']) ? $rurl['port'] : 443;
        } else {
            $port = !empty($rurl['port']) ? $rurl['port'] : 80;
        }

        $startTime = microtime(true);
        $fp = fsockopen($host, $port, $errno, $errstr, $timeout);

        if (!$fp) {
            if (TYPO3_DLOG) {
                GeneralUtility::devLog(sprintf('Error while opening "%s"', $originalUrl), 'crawler', 4, ['crawlerId' => $crawlerId]);
            }
            return false;
        }

        // Request message:
        $msg = implode("\r\n", $reqHeaders) . "\r\n\r\n";
        fputs($fp, $msg);

        // Read response:
        $d = $this->getHttpResponseFromStream($fp);
        fclose($fp);

        $time = microtime(true) - $startTime;
        $this->log($originalUrl . ' ' . $time);

        // Implode content and headers:
        $result = [
            'request' => $msg,
            'headers' => implode('', $d['headers']),
            'content' => implode('', (array)$d['content']),
        ];

        if ($this->extensionSettings['follow30x']) {
            $newUrl = $this->getRequestUrlFrom302Header(
                $d['headers'],
                isset($url['user']) ? $url['user'] : '',
                isset($url['pass']) ? $url['pass'] : ''
            );

            if ($newUrl) {
                // Follow the redirect exactly once, forwarding the timeout and a decremented
                // recursion budget so that redirect loops terminate.
                $newRequestUrl = $this->requestUrl($newUrl, $crawlerId, $timeout, $recursion - 1);

                if (!is_array($newRequestUrl)) {
                    if (TYPO3_DLOG) {
                        GeneralUtility::devLog(sprintf('Error while opening "%s"', $newUrl), 'crawler', 4, ['crawlerId' => $crawlerId]);
                    }
                    return false;
                }

                $result = array_merge(['parentRequest' => $result], $newRequestUrl);
            }
        }

        return $result;
    }
1606
1607
    /**
     * Determines the base path of the website frontend, e.g. "/cms/" when the
     * site is reachable at http://mydomain.com/cms/index.php.
     *
     * Sources are tried in this order: the "frontendBasePath" extension setting,
     * the absRefPrefix of $GLOBALS['TSFE'], and (outside CLI mode) TYPO3_SITE_PATH.
     * If none applies, "/" is used.
     *
     * @return string Base path of the website frontend
     */
    protected function getFrontendBasePath()
    {
        if (!empty($this->extensionSettings['frontendBasePath'])) {
            // Explicitly configured in the extension settings
            $basePath = $this->extensionSettings['frontendBasePath'];
        } elseif (!empty($GLOBALS['TSFE']->absRefPrefix)) {
            // Fall back to config.absRefPrefix
            $basePath = $GLOBALS['TSFE']->absRefPrefix;
        } elseif (!(defined('TYPO3_REQUESTTYPE_CLI') && TYPO3_REQUESTTYPE_CLI)) {
            // Outside CLI mode the path can be taken from the request environment
            $basePath = GeneralUtility::getIndpEnv('TYPO3_SITE_PATH');
        } else {
            $basePath = '/';
        }

        if ($basePath == '/') {
            return $basePath;
        }

        // Normalize to '/<pathSegments>/':
        $withLeadingSlash = '/' . ltrim($basePath, '/');

        return rtrim($withLeadingSlash, '/') . '/';
    }
1637
1638
    /**
     * Runs the given command through shell_exec() and hands back its output.
     *
     * @param string $command Shell command to be executed
     * @return string Outputted result of the command execution
     */
    protected function executeShellCommand($command)
    {
        return shell_exec($command);
    }
1649
1650
    /**
     * Reads HTTP response from the given stream.
     *
     * Header lines are read (and trimmed) up to the first empty line; everything
     * after that is collected unmodified as content. A non-resource argument
     * yields empty headers and content.
     *
     * @param  resource $streamPointer  Pointer to connection stream.
     * @return array                    Associative array with the following items:
     *                                  headers <array> Response headers sent by server.
     *                                  content <array> Content, with each line as an array item.
     */
    protected function getHttpResponseFromStream($streamPointer)
    {
        $response = ['headers' => [], 'content' => []];

        if (!is_resource($streamPointer)) {
            return $response;
        }

        // Read headers. fgets() signals the end of the stream with false; comparing strictly
        // keeps falsy-but-valid chunks such as "0" from ending the loop early.
        while (($line = fgets($streamPointer, 2048)) !== false) {
            $line = trim($line);
            if ($line === '') {
                // The blank line separates headers from content
                break;
            }
            $response['headers'][] = $line;
        }

        // Read content
        while (($line = fgets($streamPointer, 2048)) !== false) {
            $response['content'][] = $line;
        }

        return $response;
    }
1681
1682
    /**
     * Appends a timestamped line to the log file named by the "logFileName"
     * extension setting; does nothing when that setting is empty.
     *
     * If the file cannot be written, a devLog entry is created instead.
     *
     * @param string $message Text to append to the log file
     */
    protected function log($message)
    {
        if (empty($this->extensionSettings['logFileName'])) {
            return;
        }

        $logFile = $this->extensionSettings['logFileName'];
        $entry = date('Ymd His') . ' ' . $message . PHP_EOL;

        // Write errors are suppressed here and reported through devLog below
        if (!@file_put_contents($logFile, $entry, FILE_APPEND)) {
            GeneralUtility::devLog('File "' . $logFile . '" could not be written, please check file permissions.', 'crawler', LogLevel::INFO);
        }
    }
1694
1695
    /**
     * Builds HTTP request headers.
     *
     * Produces the request line and headers of an HTTP/1.0 GET request. Missing
     * URL components are tolerated: an absent path becomes "/", an absent query
     * is omitted and credentials are only sent when a user is present.
     *
     * @param array $url URL components as returned by parse_url()
     * @param string $crawlerId Value for the X-T3crawler header
     *
     * @return array List of header lines (without line terminators)
     */
    protected function buildRequestHeaderArray(array $url, $crawlerId)
    {
        // parse_url() omits keys for absent components, so default each one explicitly.
        $path = (isset($url['path']) && $url['path'] !== '') ? $url['path'] : '/';
        $query = isset($url['query']) ? (string)$url['query'] : '';
        $host = isset($url['host']) ? $url['host'] : '';
        $user = isset($url['user']) ? (string)$url['user'] : '';
        $pass = isset($url['pass']) ? $url['pass'] : '';

        $reqHeaders = [];
        // Compare against '' rather than testing truthiness so a query of "0" is kept.
        $reqHeaders[] = 'GET ' . $path . ($query !== '' ? '?' . $query : '') . ' HTTP/1.0';
        $reqHeaders[] = 'Host: ' . $host;
        if (stripos($query, 'ADMCMD_previewWS') !== false) {
            $reqHeaders[] = 'Cookie: $Version="1"; be_typo_user="1"; $Path=/';
        }
        $reqHeaders[] = 'Connection: close';
        if ($user !== '') {
            $reqHeaders[] = 'Authorization: Basic ' . base64_encode($user . ':' . $pass);
        }
        $reqHeaders[] = 'X-T3crawler: ' . $crawlerId;
        $reqHeaders[] = 'User-Agent: TYPO3 crawler';
        return $reqHeaders;
    }
1719
1720
    /**
     * Check if the submitted HTTP-Header contains a redirect location and built new crawler-url
     *
     * Only responses whose status line contains "301 Moved", "302 Found" or
     * "302 Moved" are treated as redirects. The Location header is matched
     * case-insensitively. If a user is given, the credentials are re-inserted
     * into the (absolute) target URL.
     *
     * @param array $headers HTTP Header
     * @param string $user HTTP Auth. User
     * @param string $pass HTTP Auth. Password
     * @return bool|string New URL, or false if the headers do not describe a usable redirect
     */
    protected function getRequestUrlFrom302Header($headers, $user = '', $pass = '')
    {
        if (!is_array($headers) || !isset($headers[0])) {
            return false;
        }
        $statusLine = $headers[0];
        if (!(stristr($statusLine, '301 Moved') || stristr($statusLine, '302 Found') || stristr($statusLine, '302 Moved'))) {
            return false;
        }

        // Find the first Location header. Splitting on the first colon only keeps values
        // containing ":" intact and copes with lines that have no colon (the status line).
        $location = null;
        foreach ($headers as $headerLine) {
            $parts = explode(':', $headerLine, 2);
            if (count($parts) === 2 && strcasecmp(trim($parts[0]), 'Location') === 0) {
                $location = trim($parts[1]);
                break;
            }
        }
        if ($location === null || $location === '') {
            return false;
        }

        if ($user == '') {
            return $location;
        }

        // Credentials can only be attached to an absolute URL.
        $target = parse_url($location);
        if (!$target || !isset($target['scheme'], $target['host'])) {
            return false;
        }

        $newUrl = $target['scheme'] . '://' . $user . ':' . $pass . '@' . $target['host'];
        if (!empty($target['port'])) {
            // Keep a non-default port of the redirect target
            $newUrl .= ':' . $target['port'];
        }
        $newUrl .= isset($target['path']) ? $target['path'] : '';
        if (isset($target['query']) && $target['query'] != '') {
            $newUrl .= '?' . $target['query'];
        }

        return $newUrl;
    }
1762
1763
    /**************************
1764
     *
1765
     * tslib_fe hooks:
1766
     *
1767
     **************************/
1768
1769
    /**
     * Initialization hook (called after database connection)
     * Takes the "HTTP_X_T3CRAWLER" header and looks up queue record and verifies if the session comes from the system (by comparing hashes)
     *
     * The header has the form "<qid>:<hash>". On a match the crawler state is stored in
     * $params['pObj']->applicationData['tx_crawler']; otherwise the request is terminated.
     *
     * @param array $params Parameters from frontend
     * @param object $ref TSFE object (reference under PHP5); not used in this method
     * @return void
     *
     * FIXME: Look like this is not used, in commit 9910d3f40cce15f4e9b7bcd0488bf21f31d53ebc it's added as public,
     * FIXME: I think this can be removed. (TNM)
     */
    public function fe_init(&$params, $ref)
    {
        // Only crawler requests carry the header; everything else passes through untouched.
        if (!isset($_SERVER['HTTP_X_T3CRAWLER'])) {
            return;
        }

        // Pad so that a header without ":" yields an empty hash instead of an undefined offset.
        list($queueId, $hash) = array_pad(explode(':', $_SERVER['HTTP_X_T3CRAWLER'], 2), 2, '');

        // exec_SELECTgetSingleRow() already returns the single row itself, so it must not be
        // destructured with list() (that would always leave $queueRec empty).
        $queueRec = $this->db->exec_SELECTgetSingleRow('*', 'tx_crawler_queue', 'qid=' . intval($queueId));

        // NOTE(review): the hash is compared with ===; consider a timing-safe comparison
        // if the supported PHP versions allow it.
        $isAuthentic = is_array($queueRec)
            && $hash === md5($queueRec['qid'] . '|' . $queueRec['set_id'] . '|' . $GLOBALS['TYPO3_CONF_VARS']['SYS']['encryptionKey']);

        if (!$isAuthentic) {
            die('No crawler entry found!');
        }

        // A crawler record was found and the hash matches, set it up:
        $params['pObj']->applicationData['tx_crawler']['running'] = true;
        $params['pObj']->applicationData['tx_crawler']['parameters'] = unserialize($queueRec['parameters']);
        $params['pObj']->applicationData['tx_crawler']['log'] = [];
    }
1797
1798
    /*****************************
1799
     *
1800
     * Compiling URLs to crawl - tools
1801
     *
1802
     *****************************/
1803
1804
    /**
     * Walks the page tree below a root page and compiles the URL rows for every page in it.
     *
     * The given options are stored on the controller and are read by
     * drawURLs_addRowsForPage() for each page. Pages of type mount point
     * (doktype 7) additionally contribute the pages of their mounted tree.
     *
     * @param integer $id Root page id to start from.
     * @param integer $depth Depth of tree, 0=only id-page, 1= on sublevel, 99 = infinite
     * @param integer $scheduledTime Unix Time when the URL is timed to be visited when put in queue
     * @param integer $reqMinute Number of requests per minute (creates the interleave between requests)
     * @param boolean $submitCrawlUrls If set, submits the URLs to queue in database (real crawling)
     * @param boolean $downloadCrawlUrls If set (and submitcrawlUrls is false) will fill $downloadUrls with entries)
     * @param array $incomingProcInstructions Array of processing instructions
     * @param array $configurationSelection Array of configuration keys
     * @return string Concatenated HTML rows produced by drawURLs_addRowsForPage()
     */
    public function getPageTreeAndUrls(
        $id,
        $depth,
        $scheduledTime,
        $reqMinute,
        $submitCrawlUrls,
        $downloadCrawlUrls,
        array $incomingProcInstructions,
        array $configurationSelection
    ) {
        global $LANG;
        if (!is_object($LANG)) {
            $LANG = GeneralUtility::makeInstance(LanguageService::class);
            $LANG->init(0);
        }
        $this->scheduledTime = $scheduledTime;
        $this->reqMinute = $reqMinute;
        $this->submitCrawlUrls = $submitCrawlUrls;
        $this->downloadCrawlUrls = $downloadCrawlUrls;
        $this->incomingProcInstructions = $incomingProcInstructions;
        $this->incomingConfigurationSelection = $configurationSelection;

        $this->duplicateTrack = [];
        $this->downloadUrls = [];

        // Drawing tree:
        /* @var PageTreeView $tree */
        $tree = GeneralUtility::makeInstance(PageTreeView::class);
        $perms_clause = $GLOBALS['BE_USER']->getPagePermsClause(1);
        $tree->init('AND ' . $perms_clause);

        $pageInfo = BackendUtility::readPageAccess($id, $perms_clause);
        if (is_array($pageInfo)) {
            // Set root row:
            $tree->tree[] = [
                'row' => $pageInfo,
                'HTML' => IconUtility::getIconForRecord('pages', $pageInfo),
            ];
        }

        // Get branch beneath:
        if ($depth) {
            $tree->getTree($id, $depth, '');
        }

        // Traverse page tree:
        $code = '';

        foreach ($tree->tree as $data) {
            $this->MP = false;

            // recognize mount points
            if ($data['row']['doktype'] == 7) {
                // Cast the uid so only an integer is ever concatenated into the WHERE clause
                $mountpage = $this->db->exec_SELECTgetRows('*', 'pages', 'uid = ' . (int)$data['row']['uid']);

                // fetch mounted pages
                $this->MP = $mountpage[0]['mount_pid'] . '-' . $data['row']['uid'];

                $mountTree = GeneralUtility::makeInstance(PageTreeView::class);
                $mountTree->init('AND ' . $perms_clause);
                $mountTree->getTree($mountpage[0]['mount_pid'], $depth, '');

                foreach ($mountTree->tree as $mountData) {
                    $code .= $this->drawURLs_addRowsForPage(
                        $mountData['row'],
                        $mountData['HTML'] . BackendUtility::getRecordTitle('pages', $mountData['row'], true)
                    );
                }

                // replace page when mount_pid_ol is enabled
                if ($mountpage[0]['mount_pid_ol']) {
                    $data['row']['uid'] = $mountpage[0]['mount_pid'];
                } else {
                    // if the mount_pid_ol is not set the MP must not be used for the mountpoint page
                    $this->MP = false;
                }
            }

            $code .= $this->drawURLs_addRowsForPage(
                $data['row'],
                $data['HTML'] . BackendUtility::getRecordTitle('pages', $data['row'], true)
            );
        }

        return $code;
    }
1902
1903
    /**
     * Expands exclude string
     *
     * The string is a comma separated list of "<pid>" or "<pid>+<depth>" entries.
     * "<pid>" excludes only that page, "<pid>+" excludes it with a depth of 99 and
     * "<pid>+<depth>" with the given depth. Results are cached per exclude string
     * in static variables for the lifetime of the request.
     *
     * @param string $excludeString Exclude string
     * @return array List of page ids to exclude
     */
    public function expandExcludeString($excludeString)
    {
        // internal static caches;
        static $expandedExcludeStringCache;
        static $treeCache;

        if (empty($expandedExcludeStringCache[$excludeString])) {
            $pidList = [];

            if (!empty($excludeString)) {
                /** @var PageTreeView $tree */
                $tree = GeneralUtility::makeInstance(PageTreeView::class);
                $tree->init('AND ' . $this->backendUser->getPagePermsClause(1));

                $excludeParts = GeneralUtility::trimExplode(',', $excludeString);

                foreach ($excludeParts as $excludePart) {
                    // Pad to two elements so an entry without "+<depth>" does not raise an undefined offset
                    list($pid, $depth) = array_pad(GeneralUtility::trimExplode('+', $excludePart), 2, '');

                    // default is "page only" = "depth=0"
                    if (empty($depth)) {
                        $depth = (strpos($excludePart, '+') !== false) ? 99 : 0;
                    }

                    $pidList[] = $pid;

                    if ($depth > 0) {
                        if (empty($treeCache[$pid][$depth])) {
                            $tree->reset();
                            $tree->getTree($pid, $depth);
                            $treeCache[$pid][$depth] = $tree->tree;
                        }

                        foreach ($treeCache[$pid][$depth] as $data) {
                            $pidList[] = $data['row']['uid'];
                        }
                    }
                }
            }

            $expandedExcludeStringCache[$excludeString] = array_unique($pidList);
        }

        return $expandedExcludeStringCache[$excludeString];
    }
1954
1955
    /**
     * Builds the table rows shown for one page of the page tree.
     * Every crawler configuration that applies to the page yields one row
     * listing its GET variable configuration; a page without any configuration
     * yields a single "No entries" row.
     *
     * @param    array        Page row
     * @param    string        Page icon and title for row
     * @return    string        HTML <tr> content (one or more)
     */
    public function drawURLs_addRowsForPage(array $pageRow, $pageTitleAndIcon)
    {
        $skipMessage = '';

        // Configurations applying to this page ($skipMessage is handed over as a variable)
        $configurations = $this->getUrlsForPageRow($pageRow, $skipMessage);

        // Drop every configuration that is not part of the current selection
        if (count($this->incomingConfigurationSelection) > 0) {
            foreach ($configurations as $candidateKey => $unusedConfiguration) {
                if (!in_array($candidateKey, $this->incomingConfigurationSelection)) {
                    unset($configurations[$candidateKey]);
                }
            }
        }

        $configurationCount = count($configurations);

        if (!$configurationCount) {
            $message = !empty($skipMessage) ? ' (' . $skipMessage . ')' : '';

            // Single row telling that nothing is configured for this page
            return '
                <tr class="bgColor-20" style="border-bottom: 1px solid black;">
                    <td>' . $pageTitleAndIcon . '</td>
                    <td colspan="6"><em>No entries</em>' . $message . '</td>
                </tr>';
        }

        // One row per configuration; the title cell spans all rows and is only emitted once
        $content = '';
        $rowIndex = 0;
        $optionLabels = [
            'userGroups' => 'User Groups: ',
            'baseUrl' => 'Base Url: ',
            'procInstrFilter' => 'ProcInstr: ',
        ];

        foreach ($configurations as $confKey => $confArray) {
            $rowSuffix = $rowIndex % 2 ? '-20' : '-10';
            $titleClm = $rowIndex ? '' : '<td rowspan="' . $configurationCount . '">' . $pageTitleAndIcon . '</td>';
            $rowIndex++;

            if (in_array($pageRow['uid'], $this->expandExcludeString($confArray['subCfg']['exclude']))) {
                $content .= '<tr class="bgColor' . $rowSuffix . '">
                            ' . $titleClm . '
                            <td>' . htmlspecialchars($confKey) . '</td>
                            <td colspan="5"><em>No entries</em> (Page is excluded in this configuration)</td>
                        </tr>';
                continue;
            }

            // URL list:
            $urlList = $this->urlListFromUrlArray(
                $confArray,
                $pageRow,
                $this->scheduledTime,
                $this->reqMinute,
                $this->submitCrawlUrls,
                $this->downloadCrawlUrls,
                $this->duplicateTrack,
                $this->downloadUrls,
                $this->incomingProcInstructions // if empty the urls won't be filtered by processing instructions
            );

            // Expanded parameters: one inner row per GET variable plus the number of combinations
            $valueCounts = [];
            $paramRows = '';
            foreach ($confArray['paramExpanded'] as $gVar => $gVal) {
                $valueCount = count($gVal);
                $paramRows .= '
                            <tr>
                                <td class="bgColor4-20">' . htmlspecialchars('&' . $gVar . '=') . '<br/>' .
                                                '(' . $valueCount . ')' .
                                                '</td>
                                <td class="bgColor4" nowrap="nowrap">' . nl2br(htmlspecialchars(implode(chr(10), $gVal))) . '</td>
                            </tr>
                        ';
                $valueCounts[] = $valueCount;
            }
            $paramExpanded = '<table class="lrPadding c-list param-expanded">' . $paramRows . '</table>'
                . 'Comb: ' . implode('*', $valueCounts) . '=' . array_product($valueCounts);

            // Options
            $optionValues = '';
            foreach ($optionLabels as $subCfgKey => $label) {
                if ($confArray['subCfg'][$subCfgKey]) {
                    $optionValues .= $label . $confArray['subCfg'][$subCfgKey] . '<br/>';
                }
            }

            // Compile row:
            $content .= '
                        <tr class="bgColor' . $rowSuffix . '">
                            ' . $titleClm . '
                            <td>' . htmlspecialchars($confKey) . '</td>
                            <td>' . nl2br(htmlspecialchars(rawurldecode(trim(str_replace('&', chr(10) . '&', GeneralUtility::implodeArrayForUrl('', $confArray['paramParsed'])))))) . '</td>
                            <td>' . $paramExpanded . '</td>
                            <td nowrap="nowrap">' . $urlList . '</td>
                            <td nowrap="nowrap">' . $optionValues . '</td>
                            <td nowrap="nowrap">' . DebugUtility::viewArray($confArray['subCfg']['procInstrParams.']) . '</td>
                        </tr>';
        }

        return $content;
    }
2072
2073
    /*****************************
2074
     *
2075
     * CLI functions
2076
     *
2077
     *****************************/
2078
2079
    /**
     * Main function for running from Command Line PHP script (cron job)
     * See ext/crawler/cli/crawler_cli.phpsh for details
     *
     * Prints the help text and exits when -h/--help is passed. Otherwise it tries to
     * acquire a process slot, runs one batch through CLI_run() and releases the
     * slot again.
     *
     * @return int bitmask composed of the self::CLI_STATUS_* constants
     */
    public function CLI_main()
    {
        $this->setAccessMode('cli');
        $result = self::CLI_STATUS_NOTHING_PROCCESSED;
        $cliObj = GeneralUtility::makeInstance(CrawlerCommandLineController::class);

        if (isset($cliObj->cli_args['-h']) || isset($cliObj->cli_args['--help'])) {
            $cliObj->cli_validateArgs();
            $cliObj->cli_help();
            exit;
        }

        // Crawler disabled or no free process slot: report the run as aborted.
        if ($this->getDisabled() || !$this->CLI_checkAndAcquireNewProcess($this->CLI_buildProcessId())) {
            return $result | self::CLI_STATUS_ABORTED;
        }

        // Command line values take precedence over the extension settings.
        $countInARun = $cliObj->cli_argValue('--countInARun')
            ? (int) $cliObj->cli_argValue('--countInARun')
            : $this->extensionSettings['countInARun'];
        // Seconds
        $sleepAfterFinish = $cliObj->cli_argValue('--sleepAfterFinish')
            ? (int) $cliObj->cli_argValue('--sleepAfterFinish')
            : $this->extensionSettings['sleepAfterFinish'];
        // Originally commented as "Milliseconds".
        // NOTE(review): CLI_run() hands this value to usleep(), which expects microseconds - confirm the unit.
        $sleepTime = $cliObj->cli_argValue('--sleepTime')
            ? (int) $cliObj->cli_argValue('--sleepTime')
            : $this->extensionSettings['sleepTime'];

        try {
            // Run process:
            $result = $this->CLI_run($countInARun, $sleepTime, $sleepAfterFinish);
        } catch (\Exception $e) {
            $this->CLI_debug(get_class($e) . ': ' . $e->getMessage());
            $result = self::CLI_STATUS_ABORTED;
        }

        // Cleanup
        $this->db->exec_DELETEquery('tx_crawler_process', 'assigned_items_count = 0');

        //TODO can't we do that in a clean way?
        // CLI_releaseProcesses() is deprecated since crawler v6.5.1; its return value is not needed here.
        $this->CLI_releaseProcesses($this->CLI_buildProcessId());

        // Query the repository once so the debug output and the returned status agree.
        $remainingItems = $this->queueRepository->countUnprocessedItems();
        $this->CLI_debug("Unprocessed Items remaining:" . $remainingItems . " (" . $this->CLI_buildProcessId() . ")");
        $result |= ($remainingItems > 0 ? self::CLI_STATUS_REMAIN : self::CLI_STATUS_NOTHING_PROCCESSED);

        return $result;
    }
2126
2127
    /**
     * Function executed by crawler_im.php cli script.
     *
     * Builds the URL list for a page tree and, depending on the -o argument,
     * prints the URLs ("url"), puts them into the queue ("queue"), executes them
     * right away ("exec") or only lists what was found (anything else).
     *
     * @return void
     */
    public function CLI_main_im()
    {
        $this->setAccessMode('cli_im');

        $cliObj = GeneralUtility::makeInstance(QueueCommandLineController::class);

        // Force user to admin state and set workspace to "Live":
        $this->backendUser->user['admin'] = 1;
        $this->backendUser->setWorkspace(0);

        // Print help
        if (!isset($cliObj->cli_args['_DEFAULT'][1])) {
            $cliObj->cli_validateArgs();
            $cliObj->cli_help();
            exit;
        }

        $cliObj->cli_validateArgs();

        // The output mode is looked up once and reused for every decision below.
        $outputMode = $cliObj->cli_argValue('-o');
        $addToQueue = ($outputMode === 'queue' || $outputMode === 'exec');

        if ($outputMode === 'exec') {
            $this->registerQueueEntriesInternallyOnly = true;
        }

        // A third default argument means the crawler is called over TYPO3 BE,
        // otherwise it is called over cli and the page id is the second one.
        $pageIdArgument = isset($cliObj->cli_args['_DEFAULT'][2])
            ? $cliObj->cli_args['_DEFAULT'][2]
            : $cliObj->cli_args['_DEFAULT'][1];
        $pageId = MathUtility::forceIntegerInRange($pageIdArgument, 0);

        // getConfigurationKeys() is deprecated since crawler v6.3.0.
        $configurationKeys = $this->getConfigurationKeys($cliObj);

        if (!is_array($configurationKeys)) {
            $configurations = $this->getUrlsForPageId($pageId);
            $configurationKeys = is_array($configurations) ? array_keys($configurations) : [];
        }

        if ($addToQueue) {
            $reason = new Reason();
            $reason->setReason(Reason::REASON_GUI_SUBMIT);
            $reason->setDetailText('The cli script of the crawler added to the queue');

            // The event dispatcher is deprecated since crawler v6.4.0, will be removed in crawler v7.0.0.
            // Please use the Signal instead.
            // NOTE(review): $this->setID is only regenerated further down, so the value posted
            // here is whatever was set before - confirm this is intended.
            EventDispatcher::getInstance()->post(
                'invokeQueueChange',
                $this->setID,
                ['reason' => $reason]
            );

            $signalPayload = ['reason' => $reason];
            SignalSlotUtility::emitSignal(
                __CLASS__,
                SignalSlotUtility::SIGNAL_INVOKE_QUEUE_CHANGE,
                $signalPayload
            );
        }

        if ($this->extensionSettings['cleanUpOldQueueEntries']) {
            $this->cleanUpOldQueueEntries();
        }

        $this->setID = (int) GeneralUtility::md5int(microtime());
        $this->getPageTreeAndUrls(
            $pageId,
            MathUtility::forceIntegerInRange($cliObj->cli_argValue('-d'), 0, 99),
            $this->getCurrentTime(),
            MathUtility::forceIntegerInRange($cliObj->cli_isArg('-n') ? $cliObj->cli_argValue('-n') : 30, 1, 1000),
            $addToQueue,
            $outputMode === 'url',
            GeneralUtility::trimExplode(',', $cliObj->cli_argValue('-proc'), true),
            $configurationKeys
        );

        if ($outputMode === 'url') {
            $cliObj->cli_echo(implode(chr(10), $this->downloadUrls) . chr(10), true);
            return;
        }

        if ($outputMode === 'exec') {
            $cliObj->cli_echo("Executing " . count($this->urlList) . " requests right away:\n\n");
            $cliObj->cli_echo(implode(chr(10), $this->urlList) . chr(10));
            $cliObj->cli_echo("\nProcessing:\n");

            foreach ($this->queueEntries as $queueRec) {
                $parameters = unserialize($queueRec['parameters']);
                $cliObj->cli_echo($parameters['url'] . ' (' . implode(',', $parameters['procInstructions']) . ') => ');

                $result = $this->readUrlFromArray($queueRec);

                $requestResult = unserialize($result['content']);
                if (is_array($requestResult)) {
                    // Log lines are printed one per line, indented by two tabs.
                    $logIndent = chr(10) . chr(9) . chr(9);
                    $resLog = is_array($requestResult['log']) ? $logIndent . implode($logIndent, $requestResult['log']) : '';
                    $cliObj->cli_echo('OK: ' . $resLog . chr(10));
                } else {
                    $cliObj->cli_echo('Error checking Crawler Result: ' . substr(preg_replace('/\s+/', ' ', strip_tags($result['content'])), 0, 30000) . '...' . chr(10));
                }
            }
            return;
        }

        if ($outputMode === 'queue') {
            $cliObj->cli_echo("Putting " . count($this->urlList) . " entries in queue:\n\n");
            $cliObj->cli_echo(implode(chr(10), $this->urlList) . chr(10));
            return;
        }

        $cliObj->cli_echo(count($this->urlList) . " entries found for processing. (Use -o to decide action):\n\n", true);
        $cliObj->cli_echo(implode(chr(10), $this->urlList) . chr(10), true);
    }
2241
2242
    /**
     * CLI entry point that uses the FlushCommandLineController for argument handling.
     *
     * The -o argument selects the mode: "all", "finished" or "pending". Any other
     * value prints the help text. A page id of 0 requests a full flush.
     *
     * @return bool false when the mode is unknown or getLogEntriesForPageId() returned false
     */
    public function CLI_main_flush()
    {
        $this->setAccessMode('cli_flush');
        $cliObj = GeneralUtility::makeInstance(FlushCommandLineController::class);

        // Force user to admin state and set workspace to "Live":
        $this->backendUser->user['admin'] = 1;
        $this->backendUser->setWorkspace(0);

        // Print help
        if (!isset($cliObj->cli_args['_DEFAULT'][1])) {
            $cliObj->cli_validateArgs();
            $cliObj->cli_help();
            exit;
        }

        $cliObj->cli_validateArgs();
        $pageId = MathUtility::forceIntegerInRange($cliObj->cli_args['_DEFAULT'][1], 0);
        $fullFlush = ($pageId == 0);

        $mode = $cliObj->cli_argValue('-o');

        switch ($mode) {
            case 'all':
                // "all" is passed on as an empty filter.
                return $this->getLogEntriesForPageId($pageId, '', true, $fullFlush) !== false;
            case 'finished':
            case 'pending':
                return $this->getLogEntriesForPageId($pageId, $mode, true, $fullFlush) !== false;
        }

        // Unknown mode: show the usage information and report failure.
        $cliObj->cli_validateArgs();
        $cliObj->cli_help();

        return false;
    }
2285
2286
    /**
     * Obtains configuration keys from the CLI arguments
     *
     * Reads the comma separated "-conf" argument.
     *
     * @param QueueCommandLineController $cliObj
     * @return array the trimmed keys, or an empty array when "-conf" is empty
     *
     * @deprecated since crawler v6.3.0, will be removed in crawler v7.0.0.
     */
    protected function getConfigurationKeys(QueueCommandLineController $cliObj)
    {
        $confArgument = trim($cliObj->cli_argValue('-conf'));
        if ($confArgument === '') {
            return [];
        }

        return GeneralUtility::trimExplode(',', $confArgument);
    }
2299
2300
    /**
     * Running the functionality of the CLI (crawling URLs from queue)
     *
     * Runs the CLI hooks, optionally purges old executed queue rows, reserves a
     * batch of due queue entries for this process and reads their URLs one by one.
     *
     * @param int $countInARun maximum number of queue rows selected for this run
     * @param int $sleepTime pause handed to usleep() after every processed entry
     * @param int $sleepAfterFinish pause handed to sleep() once the batch is done
     * @return int bitmask: the readUrl() results OR-ed with self::CLI_STATUS_* flags
     */
    public function CLI_run($countInARun, $sleepTime, $sleepAfterFinish)
    {
        $result = 0;
        $counter = 0;

        // First, run hooks:
        $this->CLI_runHooks();

        // Clean up the queue
        $purgeQueueDays = (int) $this->extensionSettings['purgeQueueDays'];
        if ($purgeQueueDays > 0) {
            $purgeDate = $this->getCurrentTime() - 24 * 60 * 60 * $purgeQueueDays;
            $del = $this->db->exec_DELETEquery(
                'tx_crawler_queue',
                'exec_time!=0 AND exec_time<' . $purgeDate
            );
            if (false == $del) {
                GeneralUtility::devLog('Records could not be deleted.', 'crawler', LogLevel::INFO);
            }
        }

        // Select entries:
        //TODO Shouldn't this reside within the transaction?
        $rows = $this->db->exec_SELECTgetRows(
            'qid,scheduled',
            'tx_crawler_queue',
            'exec_time=0
                AND process_scheduled= 0
                AND scheduled<=' . $this->getCurrentTime(),
            '',
            'scheduled, qid',
            (int) $countInARun
        );

        // The select result is typed null|array, so guard before counting or iterating.
        if (!is_array($rows) || count($rows) === 0) {
            $this->CLI_debug("Nothing within queue which needs to be processed (" . $this->CLI_buildProcessId() . ")");
            return $result;
        }

        // The ids end up unquoted in an IN() list, so force them to integers.
        $quidList = [];
        foreach ($rows as $r) {
            $quidList[] = (int) $r['qid'];
        }

        $processId = $this->CLI_buildProcessId();

        // Reserve queue entries for this process
        $this->db->sql_query('BEGIN');
        //TODO make sure we're not taking assigned queue-entries
        $this->db->exec_UPDATEquery(
            'tx_crawler_queue',
            'qid IN (' . implode(',', $quidList) . ')',
            [
                'process_scheduled' => (int) $this->getCurrentTime(),
                'process_id' => $processId,
            ]
        );

        // Save the number of assigned queue entries to determine how many have been processed later
        $numberOfAffectedRows = $this->db->sql_affected_rows();
        $this->db->exec_UPDATEquery(
            'tx_crawler_process',
            "process_id = '" . $processId . "'",
            [
                'assigned_items_count' => (int) $numberOfAffectedRows,
            ]
        );

        // Fewer updated rows than selected ids is treated as a collision with another process.
        if ($numberOfAffectedRows != count($quidList)) {
            $this->db->sql_query('ROLLBACK');
            $this->CLI_debug("Nothing processed due to multi-process collision (" . $this->CLI_buildProcessId() . ")");
            return ($result | self::CLI_STATUS_ABORTED);
        }
        $this->db->sql_query('COMMIT');

        foreach ($rows as $r) {
            $result |= $this->readUrl($r['qid']);

            $counter++;
            // Just to relax the system.
            // NOTE(review): usleep() takes microseconds - confirm the unit of the sleepTime setting.
            usleep((int) $sleepTime);

            // If the cli has been disabled between the start and the current read url we need to
            // return from the function and mark the process NOT as ended.
            if ($this->getDisabled()) {
                return ($result | self::CLI_STATUS_ABORTED);
            }

            // NOTE(review): static analysis reports findByUid() as missing on ProcessRepository - verify.
            $process = $this->processRepository->findByUid($this->CLI_buildProcessId());
            if (!$process->isActive()) {
                $this->CLI_debug("conflict / timeout (" . $this->CLI_buildProcessId() . ")");

                //TODO might need an additional returncode
                $result |= self::CLI_STATUS_ABORTED;
                break; //possible timeout
            }
        }

        sleep((int) $sleepAfterFinish);

        $msg = 'Rows: ' . $counter;
        $this->CLI_debug($msg . " (" . $this->CLI_buildProcessId() . ")");

        if ($counter > 0) {
            $result |= self::CLI_STATUS_PROCESSED;
        }

        return $result;
    }
2416
2417
    /**
     * Activate hooks
     *
     * Instantiates every object reference registered under
     * $GLOBALS['TYPO3_CONF_VARS']['EXTCONF']['crawler']['cli_hooks'] and calls its
     * crawler_init() method with this controller as argument.
     *
     * @return void
     */
    public function CLI_runHooks()
    {
        // Nothing registered (or not a list): avoid undefined index notices and leave.
        if (!isset($GLOBALS['TYPO3_CONF_VARS']['EXTCONF']['crawler']['cli_hooks'])
            || !is_array($GLOBALS['TYPO3_CONF_VARS']['EXTCONF']['crawler']['cli_hooks'])
        ) {
            return;
        }

        foreach ($GLOBALS['TYPO3_CONF_VARS']['EXTCONF']['crawler']['cli_hooks'] as $objRef) {
            // Plain assignment: binding a function result by reference only raises a notice.
            $hookObj = GeneralUtility::getUserObj($objRef);
            if (is_object($hookObj)) {
                $hookObj->crawler_init($this);
            }
        }
    }
2434
2435
    /**
     * Try to acquire a new process with the given id
     * also performs some auto-cleanup for orphan processes
     * @todo preemption might not be the most elegant way to clean up
     *
     * Active processes whose ttl lies in the past count as orphans and are handed
     * to CLI_releaseProcesses(); all others count against the processLimit setting.
     *
     * @param string $id identification string for the process
     * @return boolean true when a process record was inserted, false otherwise
     */
    public function CLI_checkAndAcquireNewProcess($id)
    {
        $systemProcessId = getmypid();
        if ($systemProcessId < 1) {
            return false;
        }

        $this->db->sql_query('BEGIN');

        $activeProcesses = $this->db->exec_SELECTquery(
            'process_id,ttl',
            'tx_crawler_process',
            'active=1 AND deleted=0'
        );

        $now = $this->getCurrentTime();

        // Split the active processes into expired ones (orphans) and running ones.
        $runningProcesses = 0;
        $orphanProcessIds = [];
        while ($processRow = $this->db->sql_fetch_assoc($activeProcesses)) {
            if ($processRow['ttl'] < $now) {
                $orphanProcessIds[] = $processRow['process_id'];
                continue;
            }
            $runningProcesses++;
        }

        $processLimit = intval($this->extensionSettings['processLimit']);

        // If there are less than allowed active processes then add a new one
        $acquired = $runningProcesses < $processLimit;
        if ($acquired) {
            $this->CLI_debug("add process " . $this->CLI_buildProcessId() . " (" . ($runningProcesses + 1) . "/" . $processLimit . ")");

            // Create new process record
            $this->db->exec_INSERTquery(
                'tx_crawler_process',
                [
                    'process_id' => $id,
                    'active' => '1',
                    'ttl' => ($now + intval($this->extensionSettings['processMaxRunTime'])),
                    'system_process_id' => $systemProcessId,
                ]
            );
        } else {
            $this->CLI_debug("Processlimit reached (" . ($runningProcesses) . "/" . $processLimit . ")");
        }

        // Both helpers are deprecated since crawler v6.5.1.
        // Maybe the release should be somehow included into the current lock.
        $this->CLI_releaseProcesses($orphanProcessIds, true);
        $this->CLI_deleteProcessesMarkedDeleted();

        $this->db->sql_query('COMMIT');

        return $acquired;
    }
2499
2500
    /**
     * Release a process and the required resources
     *
     * @param  mixed    $releaseIds   string with a single process-id or array with multiple process-ids
     * @param  boolean  $withinLock   show whether the DB-actions are included within an existing lock
     * @return boolean  false when there was nothing to release, true otherwise
     *
     * @deprecated since crawler v6.5.1, will be removed in crawler v9.0.0.
     */
    public function CLI_releaseProcesses($releaseIds, $withinLock = false)
    {
        if (!is_array($releaseIds)) {
            $releaseIds = [$releaseIds];
        }

        // Nothing to release
        if (count($releaseIds) === 0) {
            return false;
        }

        // Single-quoted SQL string literals for both queries below; double quotes
        // are treated as identifier quotes under ANSI_QUOTES.
        $quotedIdList = '\'' . implode('\',\'', $releaseIds) . '\'';

        if (!$withinLock) {
            $this->db->sql_query('BEGIN');
        }

        // some kind of 2nd chance algo - this way you need at least 2 processes to have a real cleanup
        // this ensures that a single process can't mess up the entire process table

        // Detach the queue entries of all processes which are not active and not yet deleted
        $this->db->exec_UPDATEquery(
            'tx_crawler_queue',
            'process_id IN (SELECT process_id FROM tx_crawler_process WHERE active=0 AND deleted=0)',
            [
                'process_scheduled' => 0,
                'process_id' => '',
            ]
        );
        // Mark all processes as deleted which have no "waiting" queue-entries and which are not active
        $this->db->exec_UPDATEquery(
            'tx_crawler_process',
            'active=0 AND deleted=0
            AND NOT EXISTS (
                SELECT * FROM tx_crawler_queue
                WHERE tx_crawler_queue.process_id = tx_crawler_process.process_id
                AND tx_crawler_queue.exec_time = 0
            )',
            [
                'deleted' => '1',
                'system_process_id' => 0,
            ]
        );
        // Mark all requested processes as non-active
        $this->db->exec_UPDATEquery(
            'tx_crawler_process',
            'process_id IN (' . $quotedIdList . ') AND deleted=0',
            [
                'active' => '0',
            ]
        );
        // Detach the not yet executed queue entries of the requested processes
        $this->db->exec_UPDATEquery(
            'tx_crawler_queue',
            'exec_time=0 AND process_id IN (' . $quotedIdList . ')',
            [
                'process_scheduled' => 0,
                'process_id' => '',
            ]
        );

        if (!$withinLock) {
            $this->db->sql_query('COMMIT');
        }

        return true;
    }
2571
2572
    /**
     * Delete processes marked as deleted
     *
     * Removes every row from tx_crawler_process whose "deleted" flag is 1.
     *
     * @return void
     *
     * @deprecated since crawler v6.5.1, will be removed in crawler v9.0.0.
     */
    public function CLI_deleteProcessesMarkedDeleted()
    {
        $processTable = 'tx_crawler_process';
        $flaggedAsDeleted = 'deleted = 1';

        $this->db->exec_DELETEquery($processTable, $flaggedAsDeleted);
    }
2583
2584
    /**
     * Check if there are still resources left for the process with the given id
     * Used to determine timeouts and to ensure a proper cleanup if there's a timeout
     *
     * Only a single SELECT is issued, so no transaction is opened here; that also
     * keeps this method from ending a transaction started by the caller.
     *
     * @param  string  $pid identification string for the process
     * @return boolean determines if the process is still active / has resources
     *
     * @deprecated since crawler v6.5.1, will be removed in crawler v9.0.0.
     */
    public function CLI_checkIfProcessIsActive($pid)
    {
        $res = $this->db->exec_SELECTquery(
            'process_id,active,ttl',
            'tx_crawler_process',
            'process_id = \'' . $pid . '\'  AND deleted=0',
            '',
            'ttl',
            '0,1'
        );

        // No matching, non-deleted process record means "not active".
        $row = $this->db->sql_fetch_assoc($res);
        if (!$row) {
            return false;
        }

        return (int) $row['active'] === 1;
    }
2614
2615
    /**
     * Create a unique Id for the current process
     *
     * The id is generated once from the current microtime and then reused for
     * every later call on this instance.
     *
     * @return string  the ID
     */
    public function CLI_buildProcessId()
    {
        if ($this->processID) {
            return $this->processID;
        }

        $this->processID = GeneralUtility::shortMD5($this->microtime(true));

        return $this->processID;
    }
2627
2628
    /**
     * Thin wrapper around PHP's microtime().
     *
     * @param bool $get_as_float passed through unchanged to microtime()
     *
     * @return mixed whatever microtime() returns for the given flag
     */
    protected function microtime($get_as_float = false)
    {
        $currentMicrotime = microtime($get_as_float);

        return $currentMicrotime;
    }
2637
2638
    /**
     * Prints a message to the stdout (only if debug-mode is enabled)
     *
     * Debug-mode is the "processDebug" extension setting; the message is followed
     * by a newline and the output is flushed.
     *
     * @param  string $msg  the message
     */
    public function CLI_debug($msg)
    {
        if (!intval($this->extensionSettings['processDebug'])) {
            return;
        }

        echo $msg . "\n";
        flush();
    }
2650
2651
    /**
     * Get URL content by making direct request to TYPO3.
     *
     * Runs the crawler's cli/bootstrap.php through the configured PHP binary and
     * passes the frontend base path, the URL and the serialized, base64 encoded
     * request headers as shell arguments.
     *
     * @param  string $url          Page URL
     * @param  int    $crawlerId    Crawler-ID
     * @return array  keys "request", "headers" and "content"; empty when the URL cannot be parsed
     */
    protected function sendDirectRequest($url, $crawlerId)
    {
        $urlParts = parse_url($url);
        if (!is_array($urlParts)) {
            return [];
        }

        $requestHeaders = $this->buildRequestHeaderArray($urlParts, $crawlerId);

        // Every argument is escaped separately, then joined with single spaces.
        $command = implode(' ', [
            escapeshellcmd($this->extensionSettings['phpPath']),
            escapeshellarg(ExtensionManagementUtility::extPath('crawler') . 'cli/bootstrap.php'),
            escapeshellarg($this->getFrontendBasePath()),
            escapeshellarg($url),
            escapeshellarg(base64_encode(serialize($requestHeaders))),
        ]);

        // Log the URL together with the time the command took.
        $startedAt = microtime(true);
        $content = $this->executeShellCommand($command);
        $this->log($url . ' ' . (microtime(true) - $startedAt));

        return [
            'request' => implode("\r\n", $requestHeaders) . "\r\n\r\n",
            'headers' => '',
            'content' => $content,
        ];
    }
2689
2690
    /**
     * Cleans up entries that stayed for too long in the queue. These are:
     * - processed entries older than the "cleanUpProcessedAge" setting (in days)
     * - scheduled entries older than the "cleanUpScheduledAge" setting (in days)
     *
     * @return void
     *
     * TODO: Should be switched back to protected - TNM 2018-11-16
     */
    public function cleanUpOldQueueEntries()
    {
        // 24*60*60 Seconds in 24 hours
        $secondsPerDay = 86400;
        $processedAgeInSeconds = $this->extensionSettings['cleanUpProcessedAge'] * $secondsPerDay;
        $scheduledAgeInSeconds = $this->extensionSettings['cleanUpScheduledAge'] * $secondsPerDay;

        $now = time();
        $processedBefore = $now - $processedAgeInSeconds;
        $scheduledBefore = $now - $scheduledAgeInSeconds;

        $this->flushQueue(
            '(exec_time<>0 AND exec_time<' . $processedBefore . ') OR scheduled<=' . $scheduledBefore
        );
    }
2708
2709
    /**
     * Initializes a TypoScript Frontend necessary for using TypoScript and TypoLink functions
     *
     * Starts a time tracker, creates the frontend controller in $GLOBALS['TSFE']
     * and runs its initialisation steps for the given page.
     *
     * @param int $id
     * @param int $typeNum
     *
     * @throws \TYPO3\CMS\Core\Error\Http\ServiceUnavailableException
     *
     * @return void
     */
    protected function initTSFE($id = 1, $typeNum = 0)
    {
        EidUtility::initTCA();

        $runsBelowVersion8 = VersionNumberUtility::convertVersionNumberToInteger(TYPO3_version) < 8000000;
        if ($runsBelowVersion8 && !is_object($GLOBALS['TT'])) {
            // NullTimeTracker is deprecated since TYPO3 v8 and will be removed in v9.
            /** @var NullTimeTracker $GLOBALS['TT'] */
            $GLOBALS['TT'] = new NullTimeTracker();
            $GLOBALS['TT']->start();
        } else {
            GeneralUtility::makeInstance(TimeTracker::class)->start();
        }

        $tsfe = GeneralUtility::makeInstance(
            TypoScriptFrontendController::class,
            $GLOBALS['TYPO3_CONF_VARS'],
            $id,
            $typeNum
        );
        // Publish the controller globally before any of its methods run.
        $GLOBALS['TSFE'] = $tsfe;

        $tsfe->sys_page = GeneralUtility::makeInstance(PageRepository::class);
        $tsfe->sys_page->init(true);
        $tsfe->connectToDB();
        $tsfe->initFEuser();
        $tsfe->determineId();
        $tsfe->initTemplate();
        $tsfe->rootLine = $tsfe->sys_page->getRootLine($id, '');
        $tsfe->getConfigArray();

        // PageGenerator::pagegenInit() is deprecated since TYPO3 v8 and will be removed in TYPO3 v9.
        PageGenerator::pagegenInit();
    }
2744
2745
    /**
2746
     * Returns an MD5 hash of the serialized configuration array, ignoring its 'paramExpanded' and 'URLs' keys.
2747
     *
2748
     * @param array $configuration
2749
     *
2750
     * @return string
2751
     */
2752 10
    protected function getConfigurationHash(array $configuration)
    {
        // 'paramExpanded' and 'URLs' are dropped first, so they never influence the hash.
        // $configuration is passed by value, so the caller's array is left untouched.
        unset($configuration['paramExpanded'], $configuration['URLs']);

        $serializedConfiguration = serialize($configuration);

        return md5($serializedConfiguration);
    }
2757
2758
    /**
2759
     * Checks whether crawling should use https (true) or http (false).
2760
     *
2761
     * @param int $crawlerConfiguration -1 forces http, 1 forces https, 0 defers to $pageConfiguration; any other value yields false
2762
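     * @param bool $pageConfiguration Page-level https flag that decides the result when $crawlerConfiguration is 0 (presumably a bool; verify against callers)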
2763
     *
2764
     * @return bool
2765
     */
2766 10
    protected function isCrawlingProtocolHttps($crawlerConfiguration, $pageConfiguration)
    {
        // switch compares loosely (==), so numeric strings such as '-1', '0' or '1'
        // match the integer cases below. The case order is kept on purpose, because
        // with loose comparison the first matching case wins.
        switch ($crawlerConfiguration) {
            case -1:
                // Crawler configuration forces http.
                return false;
            case 0:
                // Defer to the page-level setting; cast so the documented bool return
                // type holds even when a non-bool (e.g. 1 or '0') is passed in.
                return (bool)$pageConfiguration;
            case 1:
                // Crawler configuration forces https.
                return true;
            default:
                // Unknown values fall back to http.
                return false;
        }
    }
2778
}
2779