Completed
Push — issue/92 ( 1ce8c6...5b8d76 )
by Tomas Norre
05:44
created

CrawlerController::getUrlsForPageId()   D

Complexity

Conditions 26
Paths 60

Size

Total Lines 136
Code Lines 71

Duplication

Lines 0
Ratio 0 %

Code Coverage

Tests 63
CRAP Score 26.2684

Importance

Changes 0
Metric Value
cc 26
eloc 71
nc 60
nop 2
dl 0
loc 136
ccs 63
cts 68
cp 0.9265
crap 26.2684
rs 4.5382
c 0
b 0
f 0

How to fix   Long Method    Complexity   

Long Method

Small methods make your code easier to understand, in particular if combined with a good name. Besides, if your method is small, finding a good name is usually much easier.

For example, if you find yourself adding comments to a method's body, this is usually a good sign to extract the commented part to a new method, and use the comment as a starting point when coming up with a good name for this new method.

Commonly applied refactorings include:

1
<?php
2
namespace AOE\Crawler\Controller;
3
4
/***************************************************************
5
 *  Copyright notice
6
 *
7
 *  (c) 2017 AOE GmbH <[email protected]>
8
 *
9
 *  All rights reserved
10
 *
11
 *  This script is part of the TYPO3 project. The TYPO3 project is
12
 *  free software; you can redistribute it and/or modify
13
 *  it under the terms of the GNU General Public License as published by
14
 *  the Free Software Foundation; either version 3 of the License, or
15
 *  (at your option) any later version.
16
 *
17
 *  The GNU General Public License can be found at
18
 *  http://www.gnu.org/copyleft/gpl.html.
19
 *
20
 *  This script is distributed in the hope that it will be useful,
21
 *  but WITHOUT ANY WARRANTY; without even the implied warranty of
22
 *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
23
 *  GNU General Public License for more details.
24
 *
25
 *  This copyright notice MUST APPEAR in all copies of the script!
26
 ***************************************************************/
27
28
use AOE\Crawler\Command\CrawlerCommandLineController;
29
use AOE\Crawler\Command\FlushCommandLineController;
30
use AOE\Crawler\Command\QueueCommandLineController;
31
use AOE\Crawler\Domain\Model\Reason;
32
use AOE\Crawler\Event\EventDispatcher;
33
use AOE\Crawler\Utility\IconUtility;
34
use AOE\Crawler\Utility\SignalSlotUtility;
35
use TYPO3\CMS\Backend\Utility\BackendUtility;
36
use TYPO3\CMS\Backend\Tree\View\PageTreeView;
37
use TYPO3\CMS\Core\Authentication\BackendUserAuthentication;
38
use TYPO3\CMS\Core\Database\DatabaseConnection;
39
use TYPO3\CMS\Core\Log\LogLevel;
40
use TYPO3\CMS\Core\TimeTracker\NullTimeTracker;
41
use TYPO3\CMS\Core\Utility\DebugUtility;
42
use TYPO3\CMS\Core\Utility\ExtensionManagementUtility;
43
use TYPO3\CMS\Core\Utility\GeneralUtility;
44
use TYPO3\CMS\Core\Utility\MathUtility;
45
use TYPO3\CMS\Frontend\Controller\TypoScriptFrontendController;
46
use TYPO3\CMS\Frontend\Page\PageGenerator;
47
use TYPO3\CMS\Frontend\Page\PageRepository;
48
use TYPO3\CMS\Frontend\Utility\EidUtility;
49
50
/**
51
 * Class CrawlerController
52
 *
53
 * @package AOE\Crawler\Controller
54
 */
55
class CrawlerController
56
{
57
    const CLI_STATUS_NOTHING_PROCCESSED = 0;
58
    const CLI_STATUS_REMAIN = 1; //queue not empty
59
    const CLI_STATUS_PROCESSED = 2; //(some) queue items where processed
60
    const CLI_STATUS_ABORTED = 4; //instance didn't finish
61
    const CLI_STATUS_POLLABLE_PROCESSED = 8;
62
63
    /**
64
     * @var integer
65
     */
66
    public $setID = 0;
67
68
    /**
69
     * @var string
70
     */
71
    public $processID = '';
72
73
    /**
74
     * One hour is max stalled time for the CLI
75
     * If the process had the status "start" for 3600 seconds, it will be regarded stalled and a new process is started
76
     *
77
     * @var integer
78
     */
79
    public $max_CLI_exec_time = 3600;
80
81
    /**
82
     * @var array
83
     */
84
    public $duplicateTrack = [];
85
86
    /**
87
     * @var array
88
     */
89
    public $downloadUrls = [];
90
91
    /**
92
     * @var array
93
     */
94
    public $incomingProcInstructions = [];
95
96
    /**
97
     * @var array
98
     */
99
    public $incomingConfigurationSelection = [];
100
101
    /**
102
     * @var bool
103
     */
104
    public $registerQueueEntriesInternallyOnly = false;
105
106
    /**
107
     * @var array
108
     */
109
    public $queueEntries = [];
110
111
    /**
112
     * @var array
113
     */
114
    public $urlList = [];
115
116
    /**
117
     * @var boolean
118
     */
119
    public $debugMode = false;
120
121
    /**
122
     * @var array
123
     */
124
    public $extensionSettings = [];
125
126
    /**
127
     * Mount Point
128
     *
129
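     * @var bool|string false when no mount point is set, otherwise the MP string (split on '-' and appended as '&MP=' elsewhere in this class)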
130
     */
131
    public $MP = false;
132
133
    /**
134
     * @var string
135
     */
136
    protected $processFilename;
137
138
    /**
139
     * Holds the internal access mode can be 'gui','cli' or 'cli_im'
140
     *
141
     * @var string
142
     */
143
    protected $accessMode;
144
145
    /**
146
     * @var DatabaseConnection
147
     */
148
    private $db;
149
150
    /**
151
     * @var BackendUserAuthentication
152
     */
153
    private $backendUser;
154
155
    /**
156
     * @var integer
157
     */
158
    private $scheduledTime = 0;
159
160
    /**
161
     * @var integer
162
     */
163
    private $reqMinute = 0;
164
165
    /**
166
     * @var bool
167
     */
168
    private $submitCrawlUrls = false;
169
170
    /**
171
     * @var bool
172
     */
173
    private $downloadCrawlUrls = false;
174
175
    /**
176
     * Method to set the accessMode can be gui, cli or cli_im
177
     *
178
     * @return string
179
     */
180 1
    public function getAccessMode()
181
    {
182 1
        return $this->accessMode;
183
    }
184
185
    /**
     * Stores the internal access mode.
     *
     * @param string $accessMode One of 'gui', 'cli' or 'cli_im'
     */
    public function setAccessMode($accessMode)
    {
        $this->accessMode = $accessMode;
    }
192
193
    /**
     * Switches the "disabled" marker file on or off.
     *
     * Disabling writes an empty file to $this->processFilename; enabling
     * removes that file again if it is present.
     *
     * @param  bool $disabled (optional, defaults to true)
     * @return void
     */
    public function setDisabled($disabled = true)
    {
        if ($disabled) {
            GeneralUtility::writeFile($this->processFilename, '');
            return;
        }

        // Enabling: only remove the marker when it actually exists.
        if (is_file($this->processFilename)) {
            unlink($this->processFilename);
        }
    }
209
210
    /**
     * Tells whether processing is currently disabled.
     *
     * The state is derived from the existence of the file at $this->processFilename.
     *
     * @return bool true if disabled
     */
    public function getDisabled()
    {
        return is_file($this->processFilename);
    }
223
224
    /**
     * Sets the path of the marker file that setDisabled()/getDisabled() work on.
     *
     * @param string $filenameWithPath
     *
     * @return void
     */
    public function setProcessFilename($filenameWithPath)
    {
        $this->processFilename = $filenameWithPath;
    }
233
234
    /**
     * Returns the path of the marker file that setDisabled()/getDisabled() work on.
     *
     * @return string
     */
    public function getProcessFilename()
    {
        return $this->processFilename;
    }
241
242
    /************************************
243
     *
244
     * Getting URLs based on Page TSconfig
245
     *
246
     ************************************/
247
248 60
    /**
     * Wires up the global database connection and backend user, sets the
     * default process file and loads the extension settings.
     *
     * Missing or invalid settings fall back to an empty array; 'countInARun'
     * then defaults to 100 and 'processLimit' is forced into the range 1..99
     * (default 1).
     */
    public function __construct()
    {
        $this->db = $GLOBALS['TYPO3_DB'];
        $this->backendUser = $GLOBALS['BE_USER'];
        $this->processFilename = PATH_site . 'typo3temp/tx_crawler.proc';

        // Only hand a non-empty string to unserialize(); anything else means "no settings".
        $serializedSettings = isset($GLOBALS['TYPO3_CONF_VARS']['EXT']['extConf']['crawler'])
            ? $GLOBALS['TYPO3_CONF_VARS']['EXT']['extConf']['crawler']
            : '';
        $settings = (is_string($serializedSettings) && $serializedSettings !== '')
            ? unserialize($serializedSettings)
            : [];
        $settings = is_array($settings) ? $settings : [];

        // read ext_em_conf_template settings and set
        $this->setExtensionSettings($settings);

        // set defaults (read through isset() so empty settings do not raise notices):
        $countInARun = isset($this->extensionSettings['countInARun'])
            ? $this->extensionSettings['countInARun']
            : 0;
        if (MathUtility::convertToPositiveInteger($countInARun) == 0) {
            $this->extensionSettings['countInARun'] = 100;
        }

        $processLimit = isset($this->extensionSettings['processLimit'])
            ? $this->extensionSettings['processLimit']
            : 0;
        $this->extensionSettings['processLimit'] = MathUtility::forceIntegerInRange($processLimit, 1, 99, 1);
    }
267
268
    /**
     * Replaces the extension settings (the unserialized counterpart of
     * $TYPO3_CONF_VARS['EXT']['extConf']['crawler']).
     *
     * @param array $extensionSettings
     * @return void
     */
    public function setExtensionSettings(array $extensionSettings)
    {
        $this->extensionSettings = $extensionSettings;
    }
278
279
    /**
     * Decides whether the given page must be left out of crawling.
     *
     * The checks run in this order and the first hit wins: hidden page,
     * built-in doktype restriction, configured 'excludeDoktype' lists,
     * registered 'pageVeto' hooks.
     *
     * @param array $pageRow Page record; the keys 'hidden' and 'doktype' are read
     * @return false|string false if the page should be crawled, otherwise the message explaining why it is skipped
     */
    public function checkIfPageShouldBeSkipped(array $pageRow)
    {
        // Hidden pages are skipped unless the 'crawlHiddenPages' setting is enabled.
        if (!$this->extensionSettings['crawlHiddenPages'] && $pageRow['hidden']) {
            return 'Because page is hidden';
        }

        // Doktypes 3 and 4 as well as everything from 199 upwards are never crawled.
        if (GeneralUtility::inList('3,4', $pageRow['doktype']) || $pageRow['doktype'] >= 199) {
            return 'Because doktype is not allowed';
        }

        // Additional doktype lists registered by other extensions.
        $excludedDoktypes = $GLOBALS['TYPO3_CONF_VARS']['EXTCONF']['crawler']['excludeDoktype'];
        if (is_array($excludedDoktypes)) {
            foreach ($excludedDoktypes as $registrationKey => $doktypeList) {
                if (GeneralUtility::inList($doktypeList, $pageRow['doktype'])) {
                    return 'Doktype was excluded by "' . $registrationKey . '"';
                }
            }
        }

        // veto hook
        $vetoHooks = $GLOBALS['TYPO3_CONF_VARS']['EXTCONF']['crawler']['pageVeto'];
        if (is_array($vetoHooks)) {
            foreach ($vetoHooks as $hookKey => $hookFunction) {
                $hookParameters = [
                    'pageRow' => $pageRow
                ];
                // expects "false" if page is ok and "true" or a skipMessage if this page should _not_ be crawled
                $veto = GeneralUtility::callUserFunction($hookFunction, $hookParameters, $this);
                if ($veto === false) {
                    continue;
                }

                // the first veto ends the check; later hooks are not executed
                return is_string($veto) ? $veto : 'Veto from hook "' . htmlspecialchars($hookKey) . '"';
            }
        }

        return false;
    }
342
343
    /**
     * Wrapper method for getUrlsForPageId()
     * It returns an array of configurations and no urls!
     *
     * The page is first run through checkIfPageShouldBeSkipped(); a skipped
     * page yields an empty array and the reason in $skipMessage.
     *
     * @param array $pageRow Page record with at least dok-type and uid columns.
     * @param string $skipMessage Passed by reference; set to the skip reason, or to '' when the page is crawled
     * @return array
     * @see getUrlsForPageId()
     */
    public function getUrlsForPageRow(array $pageRow, &$skipMessage = '')
    {
        $message = $this->checkIfPageShouldBeSkipped($pageRow);

        if ($message !== false) {
            $skipMessage = $message;

            return [];
        }

        // Compare numerically so that both integer 2 and the string '2' force SSL.
        // NOTE(review): rows read from the database presumably carry strings, which
        // the former strict "=== 2" comparison never matched - confirm against callers.
        $forceSsl = isset($pageRow['url_scheme']) && (int)$pageRow['url_scheme'] === 2;

        $res = $this->getUrlsForPageId($pageRow['uid'], $forceSsl);
        $skipMessage = '';

        return $res;
    }
367
368
    /**
     * Tells whether tx_crawler_queue holds NO entry with exec_time=0 for the
     * given page id and configuration hash.
     * If there is none, callers can skip an inner detail check.
     *
     * @param  int $uid Page id (cast to integer before it enters the query)
     * @param  string $configurationHash Quoted via the database connection before use
     * @return boolean true if the count of matching rows is zero
     */
    protected function noUnprocessedQueueEntriesForPageWithConfigurationHashExist($uid, $configurationHash)
    {
        $quotedHash = $this->db->fullQuoteStr($configurationHash, 'tx_crawler_queue');
        $whereClause = 'page_id=' . intval($uid)
            . ' AND configuration_hash=' . $quotedHash
            . ' AND exec_time=0';

        $queryResult = $this->db->exec_SELECTquery('count(*) as anz', 'tx_crawler_queue', $whereClause);
        $countRow = $this->db->sql_fetch_assoc($queryResult);

        return $countRow['anz'] == 0;
    }
385
386
    /**
     * Creates display markup for the URLs of one configuration (and submits them to queue if asked for)
     * See Web > Info module script + "indexed_search"'s crawler hook-client using this!
     *
     * @param array $vv Information about URLs from pageRow to crawl; the keys 'URLs' and 'subCfg' are read
     * @param array $pageRow Page row
     * @param integer $scheduledTime Unix time to schedule indexing to, typically time()
     * @param integer $reqMinute Number of requests per minute (creates the interleave between requests); a value <= 0 disables the interleave
     * @param boolean $submitCrawlUrls If set, submits the URLs to queue
     * @param boolean $downloadCrawlUrls If set (and $submitCrawlUrls is false) will fill $downloadUrls with entries
     * @param array $duplicateTrack Passed by reference; contains a key per URL so that duplicates are not crawled
     * @param array $downloadUrls Passed by reference; filled with URLs for download if the flag is set
     * @param array $incomingProcInstructions Array of processing instructions
     * @return string Markup meant for display in backend module, or 'ERROR - no URL generated' if $vv['URLs'] is not an array
     */
    public function urlListFromUrlArray(
        array $vv,
        array $pageRow,
        $scheduledTime,
        $reqMinute,
        $submitCrawlUrls,
        $downloadCrawlUrls,
        array &$duplicateTrack,
        array &$downloadUrls,
        array $incomingProcInstructions
    ) {
        $urlList = '';

        // Evaluated once so that $urlObj is guaranteed to exist wherever realurl encoding is used below.
        $useRealUrl = ExtensionManagementUtility::isLoaded('realurl') && $vv['subCfg']['realurl'];
        $urlObj = null;

        // realurl support (thanks to Ingo Renner)
        if ($useRealUrl) {
            /** @var tx_realurl $urlObj */
            $urlObj = GeneralUtility::makeInstance('tx_realurl');

            if (!empty($vv['subCfg']['baseUrl'])) {
                $urlParts = parse_url($vv['subCfg']['baseUrl']);
                $host = strtolower($urlParts['host']);
                $urlObj->host = $host;

                // First pass, finding configuration OR pointer string:
                $urlObj->extConf = isset($GLOBALS['TYPO3_CONF_VARS']['EXTCONF']['realurl'][$urlObj->host]) ? $GLOBALS['TYPO3_CONF_VARS']['EXTCONF']['realurl'][$urlObj->host] : $GLOBALS['TYPO3_CONF_VARS']['EXTCONF']['realurl']['_DEFAULT'];

                // If it turned out to be a string pointer, then look up the real config:
                if (is_string($urlObj->extConf)) {
                    $urlObj->extConf = is_array($GLOBALS['TYPO3_CONF_VARS']['EXTCONF']['realurl'][$urlObj->extConf]) ? $GLOBALS['TYPO3_CONF_VARS']['EXTCONF']['realurl'][$urlObj->extConf] : $GLOBALS['TYPO3_CONF_VARS']['EXTCONF']['realurl']['_DEFAULT'];
                }
            }

            if (!$GLOBALS['TSFE']->sys_page) {
                $GLOBALS['TSFE']->sys_page = GeneralUtility::makeInstance('TYPO3\CMS\Frontend\Page\PageRepository');
            }
            if (!$GLOBALS['TSFE']->csConvObj) {
                $GLOBALS['TSFE']->csConvObj = GeneralUtility::makeInstance('TYPO3\CMS\Core\Charset\CharsetConverter');
            }
            if (!$GLOBALS['TSFE']->tmpl->rootLine[0]['uid']) {
                $GLOBALS['TSFE']->tmpl->rootLine[0]['uid'] = $urlObj->extConf['pagePath']['rootpage_id'];
            }
        }

        if (!is_array($vv['URLs'])) {
            return 'ERROR - no URL generated';
        }

        // Seconds between two scheduled requests; guarded so that a rate of 0 cannot divide by zero.
        $secondsPerRequest = $reqMinute > 0 ? 60 / $reqMinute : 0;

        $configurationHash = $this->getConfigurationHash($vv);
        $skipInnerCheck = $this->noUnprocessedQueueEntriesForPageWithConfigurationHashExist($pageRow['uid'], $configurationHash);

        foreach ($vv['URLs'] as $urlQuery) {
            if (!$this->drawURLs_PIfilter($vv['subCfg']['procInstrFilter'], $incomingProcInstructions)) {
                continue;
            }

            // Calculate cHash:
            if ($vv['subCfg']['cHash']) {
                /* @var $cacheHash \TYPO3\CMS\Frontend\Page\CacheHashCalculator */
                $cacheHash = GeneralUtility::makeInstance('TYPO3\CMS\Frontend\Page\CacheHashCalculator');
                $urlQuery .= '&cHash=' . $cacheHash->generateForParameters($urlQuery);
            }

            // Create key by which to determine unique-ness:
            $uKey = $urlQuery . '|' . $vv['subCfg']['userGroups'] . '|' . $vv['subCfg']['baseUrl'] . '|' . $vv['subCfg']['procInstrFilter'];

            // realurl support (thanks to Ingo Renner)
            $urlQuery = 'index.php' . $urlQuery;
            if ($useRealUrl) {
                $params = [
                    'LD' => [
                        'totalURL' => $urlQuery
                    ],
                    'TCEmainHook' => true
                ];
                $urlObj->encodeSpURL($params);
                $urlQuery = $params['LD']['totalURL'];
            }

            // Scheduled time, rounded down to the full minute:
            $schTime = $scheduledTime + round(count($duplicateTrack) * $secondsPerRequest);
            $schTime = floor($schTime / 60) * 60;

            // NOTE(review): $urlList is assigned (not appended) for every URL, so only the
            // markup of the last processed URL is returned - confirm whether this is intended.
            if (isset($duplicateTrack[$uKey])) {
                // if the url key is registered just display it and do not resubmit it
                $urlList = '<em><span class="typo3-dimmed">' . htmlspecialchars($urlQuery) . '</span></em><br/>';
            } else {
                $urlList = '[' . date('d.m.y H:i', $schTime) . '] ' . htmlspecialchars($urlQuery);
                $this->urlList[] = '[' . date('d.m.y H:i', $schTime) . '] ' . $urlQuery;

                $theUrl = ($vv['subCfg']['baseUrl'] ? $vv['subCfg']['baseUrl'] : GeneralUtility::getIndpEnv('TYPO3_SITE_URL')) . $urlQuery;

                // Submit for crawling!
                if ($submitCrawlUrls) {
                    $added = $this->addUrl(
                        $pageRow['uid'],
                        $theUrl,
                        $vv['subCfg'],
                        $scheduledTime,
                        $configurationHash,
                        $skipInnerCheck
                    );
                    if ($added === false) {
                        $urlList .= ' (Url already existed)';
                    }
                } elseif ($downloadCrawlUrls) {
                    $downloadUrls[$theUrl] = $theUrl;
                }

                $urlList .= '<br />';
            }
            $duplicateTrack[$uKey] = true;
        }

        return $urlList;
    }
517
518
    /**
     * Returns true if input processing instruction is among registered ones.
     *
     * An empty $incomingProcInstructions array means "no filtering" and always
     * yields true. Otherwise each incoming instruction is looked up in the
     * comma list $piString via GeneralUtility::inList().
     *
     * @param string $piString PI to test
     * @param array $incomingProcInstructions Processing instructions
     * @return boolean true on a match (or when no instructions were passed), false otherwise
     */
    public function drawURLs_PIfilter($piString, array $incomingProcInstructions)
    {
        if (empty($incomingProcInstructions)) {
            return true;
        }

        foreach ($incomingProcInstructions as $pi) {
            if (GeneralUtility::inList($piString, $pi)) {
                return true;
            }
        }

        // Explicit result for "no match"; previously the method fell through and returned null.
        return false;
    }
537
538 4
    /**
     * Fetches the page TSconfig for a page and lets registered hooks alter it.
     *
     * When $this->MP is set, the TSconfig is looked up for the second
     * '-'-separated part of $this->MP instead of $id.
     *
     * @param integer $id Page ID
     * @return array Page TSconfig as returned by BackendUtility::getPagesTSconfig(), possibly modified by hooks
     */
    public function getPageTSconfigForId($id)
    {
        $lookupId = $id;
        if ($this->MP) {
            list(, $lookupId) = explode('-', $this->MP);
        }
        $pageTSconfig = BackendUtility::getPagesTSconfig($lookupId);

        // Call a hook to alter configuration
        $hooks = $GLOBALS['TYPO3_CONF_VARS']['EXTCONF']['crawler']['getPageTSconfigForId'];
        if (is_array($hooks)) {
            // The TSconfig is handed over by reference so that hooks can change it in place.
            $hookParameters = [
                'pageId' => $id,
                'pageTSConfig' => &$pageTSconfig
            ];
            foreach ($hooks as $userFunc) {
                GeneralUtility::callUserFunction($userFunc, $hookParameters, $this);
            }
        }

        return $pageTSconfig;
    }
560
561
    /**
562
     * This methods returns an array of configurations.
563
     * And no urls!
564
     *
565
     * @param integer $id Page ID
566
     * @param bool $forceSsl Use https
567
     * @return array
568
     */
569 4
    protected function getUrlsForPageId($id, $forceSsl = false)
570
    {
571
572
        /**
573
         * Get configuration from tsConfig
574
         */
575
576
        // Get page TSconfig for page ID:
577 4
        $pageTSconfig = $this->getPageTSconfigForId($id);
578
579 4
        $res = [];
580
581 4
        if (is_array($pageTSconfig) && is_array($pageTSconfig['tx_crawler.']['crawlerCfg.'])) {
582 3
            $crawlerCfg = $pageTSconfig['tx_crawler.']['crawlerCfg.'];
583
584 3
            if (is_array($crawlerCfg['paramSets.'])) {
585 3
                foreach ($crawlerCfg['paramSets.'] as $key => $values) {
586 3
                    if (is_array($values)) {
587 3
                        $key = str_replace('.', '', $key);
588
                        // Sub configuration for a single configuration string:
589 3
                        $subCfg = (array)$crawlerCfg['paramSets.'][$key . '.'];
590 3
                        $subCfg['key'] = $key;
591
592 3
                        if (strcmp($subCfg['procInstrFilter'], '')) {
593 3
                            $subCfg['procInstrFilter'] = implode(',', GeneralUtility::trimExplode(',', $subCfg['procInstrFilter']));
594
                        }
595 3
                        $pidOnlyList = implode(',', GeneralUtility::trimExplode(',', $subCfg['pidsOnly'], true));
596
597
                        // process configuration if it is not page-specific or if the specific page is the current page:
598 3
                        if (!strcmp($subCfg['pidsOnly'], '') || GeneralUtility::inList($pidOnlyList, $id)) {
599
600
                                // add trailing slash if not present
601 3
                            if (!empty($subCfg['baseUrl']) && substr($subCfg['baseUrl'], -1) != '/') {
602
                                $subCfg['baseUrl'] .= '/';
603
                            }
604
605
                            // Explode, process etc.:
606 3
                            $res[$key] = [];
607 3
                            $res[$key]['subCfg'] = $subCfg;
608 3
                            $res[$key]['paramParsed'] = $this->parseParams($values);
0 ignored issues
show
Documentation introduced by
$values is of type array, but the function expects a string.

It seems like the type of the argument is not accepted by the function/method which you are calling.

In some cases, in particular if PHP’s automatic type-juggling kicks in this might be fine. In other cases, however this might be a bug.

We suggest to add an explicit type cast like in the following example:

function acceptsInteger($int) { }

$x = '123'; // string "123"

// Instead of
acceptsInteger($x);

// we recommend to use
acceptsInteger((integer) $x);
Loading history...
609 3
                            $res[$key]['paramExpanded'] = $this->expandParameters($res[$key]['paramParsed'], $id);
610 3
                            $res[$key]['origin'] = 'pagets';
611
612
                            // recognize MP value
613 3
                            if (!$this->MP) {
614 3
                                $res[$key]['URLs'] = $this->compileUrls($res[$key]['paramExpanded'], ['?id=' . $id]);
615
                            } else {
616 3
                                $res[$key]['URLs'] = $this->compileUrls($res[$key]['paramExpanded'], ['?id=' . $id . '&MP=' . $this->MP]);
617
                            }
618
                        }
619
                    }
620
                }
621
            }
622
        }
623
624
        /**
625
         * Get configuration from tx_crawler_configuration records
626
         */
627
628
        // get records along the rootline
629 4
        $rootLine = BackendUtility::BEgetRootLine($id);
630
631 4
        foreach ($rootLine as $page) {
632 4
            $configurationRecordsForCurrentPage = BackendUtility::getRecordsByField(
633 4
                'tx_crawler_configuration',
634 4
                'pid',
635 4
                intval($page['uid']),
636 4
                BackendUtility::BEenableFields('tx_crawler_configuration') . BackendUtility::deleteClause('tx_crawler_configuration')
637
            );
638
639 4
            if (is_array($configurationRecordsForCurrentPage)) {
640 1
                foreach ($configurationRecordsForCurrentPage as $configurationRecord) {
641
642
                        // check access to the configuration record
643 1
                    if (empty($configurationRecord['begroups']) || $GLOBALS['BE_USER']->isAdmin() || $this->hasGroupAccess($GLOBALS['BE_USER']->user['usergroup_cached_list'], $configurationRecord['begroups'])) {
644 1
                        $pidOnlyList = implode(',', GeneralUtility::trimExplode(',', $configurationRecord['pidsonly'], true));
645
646
                        // process configuration if it is not page-specific or if the specific page is the current page:
647 1
                        if (!strcmp($configurationRecord['pidsonly'], '') || GeneralUtility::inList($pidOnlyList, $id)) {
648 1
                            $key = $configurationRecord['name'];
649
650
                            // don't overwrite previously defined paramSets
651 1
                            if (!isset($res[$key])) {
652
653
                                    /* @var $TSparserObject \TYPO3\CMS\Core\TypoScript\Parser\TypoScriptParser */
654 1
                                $TSparserObject = GeneralUtility::makeInstance('TYPO3\CMS\Core\TypoScript\Parser\TypoScriptParser');
655 1
                                $TSparserObject->parse($configurationRecord['processing_instruction_parameters_ts']);
656
657 1
                                $isCrawlingProtocolHttps = $this->isCrawlingProtocolHttps($configurationRecord['force_ssl'], $forceSsl);
658
659
                                $subCfg = [
660 1
                                    'procInstrFilter' => $configurationRecord['processing_instruction_filter'],
661 1
                                    'procInstrParams.' => $TSparserObject->setup,
662 1
                                    'baseUrl' => $this->getBaseUrlForConfigurationRecord(
663 1
                                        $configurationRecord['base_url'],
664 1
                                        $configurationRecord['sys_domain_base_url'],
665 1
                                        $isCrawlingProtocolHttps
666
                                    ),
667 1
                                    'realurl' => $configurationRecord['realurl'],
668 1
                                    'cHash' => $configurationRecord['chash'],
669 1
                                    'userGroups' => $configurationRecord['fegroups'],
670 1
                                    'exclude' => $configurationRecord['exclude'],
671 1
                                    'rootTemplatePid' => (int) $configurationRecord['root_template_pid'],
672 1
                                    'key' => $key
673
                                ];
674
675
                                // add trailing slash if not present
676 1
                                if (!empty($subCfg['baseUrl']) && substr($subCfg['baseUrl'], -1) != '/') {
677
                                    $subCfg['baseUrl'] .= '/';
678
                                }
679 1
                                if (!in_array($id, $this->expandExcludeString($subCfg['exclude']))) {
680 1
                                    $res[$key] = [];
681 1
                                    $res[$key]['subCfg'] = $subCfg;
682 1
                                    $res[$key]['paramParsed'] = $this->parseParams($configurationRecord['configuration']);
683 1
                                    $res[$key]['paramExpanded'] = $this->expandParameters($res[$key]['paramParsed'], $id);
684 1
                                    $res[$key]['URLs'] = $this->compileUrls($res[$key]['paramExpanded'], ['?id=' . $id]);
685 4
                                    $res[$key]['origin'] = 'tx_crawler_configuration_' . $configurationRecord['uid'];
686
                                }
687
                            }
688
                        }
689
                    }
690
                }
691
            }
692
        }
693
694 4
        if (is_array($GLOBALS['TYPO3_CONF_VARS']['EXTCONF']['crawler']['processUrls'])) {
695
            foreach ($GLOBALS['TYPO3_CONF_VARS']['EXTCONF']['crawler']['processUrls'] as $func) {
696
                $params = [
697
                    'res' => &$res,
698
                ];
699
                GeneralUtility::callUserFunction($func, $params, $this);
700
            }
701
        }
702
703 4
        return $res;
704
    }
705
706
    /**
     * Resolves the base URL to use for a crawler configuration record.
     *
     * When a positive sys_domain uid is given and the lookup yields a record with a
     * non-empty domainName, the URL is built from the scheme selected by $ssl and that
     * domain name. In every other case the given $baseUrl is returned unchanged.
     *
     * @param string $baseUrl Fallback base URL
     * @param integer $sysDomainUid Uid of the sys_domain record; values <= 0 skip the lookup
     * @param bool $ssl Boolean false selects "http", any other value selects "https"
     * @return string
     */
    protected function getBaseUrlForConfigurationRecord($baseUrl, $sysDomainUid, $ssl = false)
    {
        $sysDomainUid = intval($sysDomainUid);
        if ($sysDomainUid <= 0) {
            return $baseUrl;
        }

        $res = $this->db->exec_SELECTquery(
            '*',
            'sys_domain',
            'uid = ' . $sysDomainUid .
            BackendUtility::BEenableFields('sys_domain') .
            BackendUtility::deleteClause('sys_domain')
        );
        $row = $this->db->sql_fetch_assoc($res);
        // Release the result set; only a single row is ever needed.
        $this->db->sql_free_result($res);

        // $row is not an array when no matching record was found.
        if (is_array($row) && isset($row['domainName']) && $row['domainName'] != '') {
            $urlScheme = ($ssl === false) ? 'http' : 'https';
            return $urlScheme . '://' . $row['domainName'];
        }

        return $baseUrl;
    }
734
735
    /**
     * Collects the names of the crawler configurations that apply to a page branch.
     *
     * Names are gathered from two sources:
     * - the keys of "tx_crawler.crawlerCfg.paramSets." in the page TSconfig of $rootid
     *   (only entries whose value is an array; a trailing dot is removed from the key)
     * - the "name" field of tx_crawler_configuration records stored on a page of the
     *   rootline of $rootid or on a page returned by the page tree lookup below $rootid
     *
     * @param integer $rootid Uid of the page the branch starts at
     * @param integer $depth Depth handed to the page tree lookup
     * @return array List of configuration names (not de-duplicated)
     */
    public function getConfigurationsForBranch($rootid, $depth)
    {
        $configurationsForBranch = [];

        $pageTSconfig = $this->getPageTSconfigForId($rootid);
        // isset() keeps the nested lookup free of notices when a level is missing.
        if (isset($pageTSconfig['tx_crawler.']['crawlerCfg.']['paramSets.'])
            && is_array($pageTSconfig['tx_crawler.']['crawlerCfg.']['paramSets.'])
        ) {
            foreach ($pageTSconfig['tx_crawler.']['crawlerCfg.']['paramSets.'] as $key => $value) {
                if (!is_array($value)) {
                    continue;
                }
                $configurationsForBranch[] = substr($key, -1) == '.' ? substr($key, 0, -1) : $key;
            }
        }

        // Page uids are cast to integers because they are concatenated into SQL below.
        $pids = [];
        $rootLine = BackendUtility::BEgetRootLine($rootid);
        foreach ($rootLine as $node) {
            $pids[] = (int)$node['uid'];
        }
        /* @var PageTreeView $tree */
        $tree = GeneralUtility::makeInstance(PageTreeView::class);
        $perms_clause = $GLOBALS['BE_USER']->getPagePermsClause(1);
        $tree->init('AND ' . $perms_clause);
        $tree->getTree($rootid, $depth, '');
        foreach ($tree->tree as $node) {
            $pids[] = (int)$node['row']['uid'];
        }
        $pids = array_unique($pids);

        // "pid IN ()" is not valid SQL, so skip the record lookup without any page uid.
        if (empty($pids)) {
            return $configurationsForBranch;
        }

        $res = $this->db->exec_SELECTquery(
            '*',
            'tx_crawler_configuration',
            'pid IN (' . implode(',', $pids) . ') ' .
            BackendUtility::BEenableFields('tx_crawler_configuration') .
            BackendUtility::deleteClause('tx_crawler_configuration') . ' ' .
            BackendUtility::versioningPlaceholderClause('tx_crawler_configuration') . ' '
        );

        while ($row = $this->db->sql_fetch_assoc($res)) {
            $configurationsForBranch[] = $row['name'];
        }
        $this->db->sql_free_result($res);

        return $configurationsForBranch;
    }
780
781
    /**
     * Decides whether a user, identified by a list of group uids, may access an item
     * (e.g. pass the group list of the current logged in user from $GLOBALS['TSFE']->gr_list).
     *
     * @see \TYPO3\CMS\Frontend\Page\PageRepository::getMultipleGroupsWhereClause()
     * @param  string $groupList    Comma-separated list of (fe_)group UIDs from a user
     * @param  string $accessList   Comma-separated list of (fe_)group UIDs of the item to access
     * @return bool                 TRUE if the access list is empty or contains at least one of the user's group UIDs
     */
    public function hasGroupAccess($groupList, $accessList)
    {
        // No restriction configured on the item: everybody passes.
        if (empty($accessList)) {
            return true;
        }

        $accessGranted = false;
        $userGroupUids = GeneralUtility::intExplode(',', $groupList);
        foreach ($userGroupUids as $userGroupUid) {
            if (GeneralUtility::inList($accessList, $userGroupUid)) {
                // The first matching group settles the question.
                $accessGranted = true;
                break;
            }
        }

        return $accessGranted;
    }
802
803
    /**
     * Parses a query string ("a=1&b=2") into an array of parameter name => value pairs.
     *
     * Names and values are decoded with rawurldecode(). Segments with an empty name are
     * dropped. A segment without "=" yields an empty string as value. When a name occurs
     * more than once, the last occurrence wins.
     *
     * @param string $inputQuery Input query string
     * @return array
     */
    public function parseParams($inputQuery)
    {
        $paramKeyValues = [];

        foreach (explode('&', $inputQuery) as $paramAndValue) {
            // Pad the pair so that a segment without "=" gets an empty value instead of
            // raising an "undefined offset" notice and handing null to rawurldecode().
            list($name, $value) = array_pad(explode('=', $paramAndValue, 2), 2, '');
            if (strlen($name)) {
                $paramKeyValues[rawurldecode($name)] = rawurldecode($value);
            }
        }

        return $paramKeyValues;
    }
824
825
    /**
     * Expands the parameter configuration into the individual values of each parameter.
     *
     * Syntax of a value:
     * - A value wrapped in [...] is expanded as described below; any other value is taken
     *   literally (after trim) and becomes the only entry of the resulting value list.
     * - The content of the brackets is split by "|"; each part is processed on its own and
     *   the results are added together:
     *         - "[int]-[int]" = Integer range, expanded to all integers in between (both ends
     *           included), from low to high, at most 1000 values. Example "1-34" or "-40--30"
     *         - "_TABLE:[TCA table name];[_PID:[page id, default is current page]];[_ENABLELANG:1]"
     *           = Look up of table records, filtering out deleted records.
     *           Example "_TABLE:tt_content; _PID:123".
     *           Further sub parts read by the code: _PIDFIELD (default "pid"), _FIELD
     *           (default "uid"), _WHERE and _ADDTABLE (both appended to the query verbatim).
     *           _ENABLELANG:1 restricts the result to records whose transOrigPointerField is <= 0
     *         - Default: Literal value
     * - After the built-in handling of each part, the functions registered in
     *   $GLOBALS['TYPO3_CONF_VARS']['SC_OPTIONS']['crawler/class.tx_crawler_lib.php']['expandParameters']
     *   are called and may modify the result through the "paramArray" reference.
     *
     * @param array $paramArray Array with key (GET var name) and values (value of GET var which is configuration for expansion)
     * @param integer $pid Current page ID
     * @return array Array with key (GET var name) and, for each, the list of expanded values
     */
    public function expandParameters($paramArray, $pid)
    {
        global $TCA;

        // Traverse parameter names:
        foreach ($paramArray as $p => $v) {
            $v = trim($v);

            // Without enclosing square brackets the value is literal and the only value:
            if (substr($v, 0, 1) !== '[' || substr($v, -1) !== ']') {
                $paramArray[$p] = [$v];
                continue;
            }

            // Take the content of the brackets and reset the paramArray value to an array.
            $v = substr($v, 1, -1);
            $paramArray[$p] = [];

            // Explode parts and traverse them:
            $parts = explode('|', $v);
            foreach ($parts as $pV) {
                // Look for integer range: (fx. 1-34 or -40--30 // reads minus 40 to minus 30)
                if (preg_match('/^(-?[0-9]+)\s*-\s*(-?[0-9]+)$/', trim($pV), $reg)) {
                    // Work on integers so that every expanded value has the same type.
                    $rangeStart = (int)$reg[1];
                    $rangeEnd = (int)$reg[2];

                    // Swap if first is larger than last:
                    if ($rangeStart > $rangeEnd) {
                        $temp = $rangeEnd;
                        $rangeEnd = $rangeStart;
                        $rangeStart = $temp;
                    }

                    // Traverse range, add values:
                    $runAwayBrake = 1000; // Limit to size of range!
                    for ($a = $rangeStart; $a <= $rangeEnd; $a++) {
                        $paramArray[$p][] = $a;
                        $runAwayBrake--;
                        if ($runAwayBrake <= 0) {
                            break;
                        }
                    }
                } elseif (substr(trim($pV), 0, 7) == '_TABLE:') {
                    // Parse parameters:
                    $subparts = GeneralUtility::trimExplode(';', $pV);
                    $subpartParams = [];
                    foreach ($subparts as $spV) {
                        // A sub part without ":" keeps a null value (isset() on it stays
                        // false) without raising an "undefined offset" notice.
                        list($pKey, $pVal) = array_pad(GeneralUtility::trimExplode(':', $spV), 2, null);
                        $subpartParams[$pKey] = $pVal;
                    }
                    $table = isset($subpartParams['_TABLE']) ? $subpartParams['_TABLE'] : '';

                    // Table exists:
                    if (isset($TCA[$table])) {
                        $lookUpPid = isset($subpartParams['_PID']) ? intval($subpartParams['_PID']) : $pid;
                        $pidField = isset($subpartParams['_PIDFIELD']) ? trim($subpartParams['_PIDFIELD']) : 'pid';
                        $where = isset($subpartParams['_WHERE']) ? $subpartParams['_WHERE'] : '';
                        $addTable = isset($subpartParams['_ADDTABLE']) ? $subpartParams['_ADDTABLE'] : '';

                        $fieldName = !empty($subpartParams['_FIELD']) ? $subpartParams['_FIELD'] : 'uid';
                        if ($fieldName === 'uid' || !empty($TCA[$table]['columns'][$fieldName])) {
                            $andWhereLanguage = '';
                            $transOrigPointerField = isset($TCA[$table]['ctrl']['transOrigPointerField'])
                                ? $TCA[$table]['ctrl']['transOrigPointerField']
                                : '';

                            if (!empty($subpartParams['_ENABLELANG']) && $transOrigPointerField) {
                                $andWhereLanguage = ' AND ' . $this->db->quoteStr($transOrigPointerField, $table) . ' <= 0 ';
                            }

                            $where = $this->db->quoteStr($pidField, $table) . '=' . intval($lookUpPid) . ' ' .
                                $andWhereLanguage . $where;

                            // Rows are indexed by $fieldName, so the array keys are the values.
                            $rows = $this->db->exec_SELECTgetRows(
                                $fieldName,
                                $table . $addTable,
                                $where . BackendUtility::deleteClause($table),
                                '',
                                '',
                                '',
                                $fieldName
                            );

                            if (is_array($rows)) {
                                $paramArray[$p] = array_merge($paramArray[$p], array_keys($rows));
                            }
                        }
                    }
                } else { // Just add value:
                    $paramArray[$p][] = $pV;
                }

                // Hook for processing own expandParameters place holder
                if (isset($GLOBALS['TYPO3_CONF_VARS']['SC_OPTIONS']['crawler/class.tx_crawler_lib.php']['expandParameters'])
                    && is_array($GLOBALS['TYPO3_CONF_VARS']['SC_OPTIONS']['crawler/class.tx_crawler_lib.php']['expandParameters'])
                ) {
                    $_params = [
                        'pObj' => &$this,
                        'paramArray' => &$paramArray,
                        'currentKey' => $p,
                        'currentValue' => $pV,
                        'pid' => $pid
                    ];
                    foreach ($GLOBALS['TYPO3_CONF_VARS']['SC_OPTIONS']['crawler/class.tx_crawler_lib.php']['expandParameters'] as $_funcRef) {
                        GeneralUtility::callUserFunction($_funcRef, $_params, $this);
                    }
                }
            }

            // Make unique set of values and sort the parameter array by parameter name:
            $paramArray[$p] = array_unique($paramArray[$p]);
            ksort($paramArray);
        }

        return $paramArray;
    }
950
951
    /**
     * Compiling URLs from parameter array (output of expandParameters())
     *
     * Every URL in $urls is combined with every value of the first parameter; the result is
     * then combined with the next parameter, and so on. Without a limit the number of URLs
     * is the product of the number of values of each parameter. An empty value adds nothing
     * to the URL. Each level stops as soon as the "maxCompileUrls" extension setting
     * (forced into 1..1000000000, default 10000) is reached.
     *
     * @param array $paramArray Output of expandParameters(): Array with keys (GET var names) and for each an array of values
     * @param array $urls URLs accumulated in this array (for recursion)
     * @return array
     */
    public function compileUrls($paramArray, $urls = [])
    {
        // Nothing left to combine, or nothing usable to combine with.
        if (empty($paramArray) || !is_array($urls)) {
            return $urls;
        }

        // The limit does not change while compiling, so determine it once per level.
        $maxUrls = MathUtility::forceIntegerInRange($this->extensionSettings['maxCompileUrls'], 1, 1000000000, 10000);

        // shift first off stack:
        reset($paramArray);
        $varName = key($paramArray);
        $valueSet = array_shift($paramArray);

        // Traverse value set:
        $newUrls = [];
        foreach ($urls as $url) {
            foreach ($valueSet as $val) {
                if (count($newUrls) >= $maxUrls) {
                    // Leave both loops; breaking only the inner one would let every
                    // remaining base URL add yet another entry beyond the limit.
                    break 2;
                }
                $newUrls[] = $url . (strcmp($val, '') ? '&' . rawurlencode($varName) . '=' . rawurlencode($val) : '');
            }
        }

        return $this->compileUrls($paramArray, $newUrls);
    }
984
985
    /************************************
986
     *
987
     * Crawler log
988
     *
989
     ************************************/
990
991
    /**
     * Reads - or flushes - the crawler queue records of a page.
     *
     * @param integer $id Page ID for which to look up log entries.
     * @param string $filter Filter: "pending" => all that is not yet run, "finished" => all complete ones, anything else => all entries
     * @param boolean $doFlush If TRUE, the matching entries are DELETED(!) instead of selected and an empty array is returned
     * @param boolean $doFullFlush Only used together with $doFlush: if TRUE, the flush is not limited to page $id (the filter still applies)
     * @param integer $itemsPerPage Limit the amount of entries per page default is 10; values <= 0 disable the limit
     * @return array
     */
    public function getLogEntriesForPageId($id, $filter = '', $doFlush = false, $doFullFlush = false, $itemsPerPage = 10)
    {
        // Translate the filter keyword into an additional WHERE fragment.
        if ($filter == 'pending') {
            $addWhere = ' AND exec_time=0';
        } elseif ($filter == 'finished') {
            $addWhere = ' AND exec_time>0';
        } else {
            $addWhere = '';
        }

        $pageCondition = 'page_id=' . intval($id);

        // FIXME: Write unit test that ensures that the right records are deleted.
        if ($doFlush) {
            $scope = $doFullFlush ? '1=1' : $pageCondition;
            $this->flushQueue($scope . $addWhere);

            return [];
        }

        $limit = intval($itemsPerPage) > 0 ? intval($itemsPerPage) : '';

        return $this->db->exec_SELECTgetRows(
            '*',
            'tx_crawler_queue',
            $pageCondition . $addWhere,
            '',
            'scheduled DESC',
            $limit
        );
    }
1030
1031
    /**
     * Reads - or flushes - the crawler queue records of a set.
     *
     * @param integer $set_id Set ID for which to look up log entries.
     * @param string $filter Filter: "pending" => all that is not yet run, "finished" => all complete ones, anything else => all entries
     * @param boolean $doFlush If TRUE, the matching entries are DELETED(!) instead of selected and an empty array is returned
     * @param boolean $doFullFlush Only used together with $doFlush: if TRUE, an empty condition is handed to flushQueue(), i.e. neither the set ID nor the filter is applied
     * @param integer $itemsPerPage Limit the amount of entries per page default is 10; values <= 0 disable the limit
     * @return array
     */
    public function getLogEntriesForSetId($set_id, $filter = '', $doFlush = false, $doFullFlush = false, $itemsPerPage = 10)
    {
        // FIXME: Write Unit tests for Filters
        if ($filter == 'pending') {
            $addWhere = ' AND exec_time=0';
        } elseif ($filter == 'finished') {
            $addWhere = ' AND exec_time>0';
        } else {
            $addWhere = '';
        }

        $setCondition = 'set_id=' . intval($set_id) . $addWhere;

        // FIXME: Write unit test that ensures that the right records are deleted.
        if ($doFlush) {
            if ($doFullFlush) {
                $this->flushQueue('');
            } else {
                $this->flushQueue($setCondition);
            }

            return [];
        }

        $limit = intval($itemsPerPage) > 0 ? intval($itemsPerPage) : '';

        return $this->db->exec_SELECTgetRows(
            '*',
            'tx_crawler_queue',
            $setCondition,
            '',
            'scheduled DESC',
            $limit
        );
    }
1069
1070
    /**
     * Removes queue entries
     *
     * If an observer is registered for "queueEntryFlush", one event is posted per affected
     * set ID before the deletion, carrying the uid and set_id of the entries of that set.
     *
     * @param string $where SQL related filter for the entries which should be removed; an empty string removes all entries
     * @return void
     */
    protected function flushQueue($where = '')
    {
        $realWhere = strlen($where) > 0 ? $where : '1=1';

        if (EventDispatcher::getInstance()->hasObserver('queueEntryFlush')) {
            $groups = $this->db->exec_SELECTgetRows('DISTINCT set_id', 'tx_crawler_queue', $realWhere);
            if (is_array($groups)) {
                foreach ($groups as $group) {
                    // The set ID is compared as an integer literal (no double-quoted
                    // string) and the caller's condition is parenthesised so that an
                    // "OR" inside it cannot capture the set ID restriction.
                    $entriesOfSet = $this->db->exec_SELECTgetRows(
                        'uid, set_id',
                        'tx_crawler_queue',
                        '(' . $realWhere . ') AND set_id=' . intval($group['set_id'])
                    );
                    EventDispatcher::getInstance()->post('queueEntryFlush', $group['set_id'], $entriesOfSet);
                }
            }
        }

        $this->db->exec_DELETEquery('tx_crawler_queue', $realWhere);
    }
1091
1092
    /**
     * Adds a call back entry to the queue (called from hooks typically, see indexed search class "class.crawler.php")
     *
     * @param integer $setId Set ID
     * @param array $params Parameters to pass to call back function; any non-array value is replaced by an empty array
     * @param string $callBack Call back object reference, eg. 'EXT:indexed_search/class.crawler.php:&tx_indexedsearch_crawler'
     * @param integer $page_id Page ID to attach it to
     * @param integer $schedule Time at which to activate; 0 means the current time
     * @return void
     */
    public function addQueueEntry_callBack($setId, $params, $callBack, $page_id = 0, $schedule = 0)
    {
        $callBackParameters = is_array($params) ? $params : [];
        // The call back reference travels inside the serialized parameters.
        $callBackParameters['_CALLBACKOBJ'] = $callBack;

        $scheduled = intval($schedule);
        if (!$scheduled) {
            $scheduled = $this->getCurrentTime();
        }

        $this->db->exec_INSERTquery(
            'tx_crawler_queue',
            [
                'page_id' => intval($page_id),
                'parameters' => serialize($callBackParameters),
                'scheduled' => $scheduled,
                'exec_time' => 0,
                'set_id' => intval($setId),
                'result_data' => '',
            ]
        );
    }
1121
1122
    /************************************
1123
     *
1124
     * URL setting
1125
     *
1126
     ************************************/
1127
1128
    /**
     * Registers a URL for crawling.
     *
     * The queue entry is either kept in memory only (when registerQueueEntriesInternallyOnly
     * is set) or inserted into tx_crawler_queue unless an equal entry is already queued.
     *
     * @param integer $id Page ID
     * @param string $url Complete URL
     * @param array $subCfg Sub configuration array (from TS config)
     * @param integer $tstamp Scheduled-time
     * @param string $configurationHash (optional) configuration hash
     * @param bool $skipInnerDuplicationCheck (optional) skip inner duplication check
     * @return bool TRUE only if a new record was inserted into the queue table
     */
    public function addUrl(
        $id,
        $url,
        array $subCfg,
        $tstamp,
        $configurationHash = '',
        $skipInnerDuplicationCheck = false
    ) {
        // The order of the keys is significant: the serialized array is hashed below.
        $parameters = ['url' => $url];

        // fe user group simulation:
        $feGroupList = implode(',', array_unique(GeneralUtility::intExplode(',', $subCfg['userGroups'], true)));
        if ($feGroupList) {
            $parameters['feUserGroupList'] = $feGroupList;
        }

        // Setting processing instructions
        $parameters['procInstructions'] = GeneralUtility::trimExplode(',', $subCfg['procInstrFilter']);
        if (is_array($subCfg['procInstrParams.'])) {
            $parameters['procInstrParams'] = $subCfg['procInstrParams.'];
        }

        // Possible TypoScript Template Parents
        $parameters['rootTemplatePid'] = $subCfg['rootTemplatePid'];

        // Compile value array:
        $serializedParameters = serialize($parameters);
        $fieldArray = [
            'page_id' => intval($id),
            'parameters' => $serializedParameters,
            'parameters_hash' => GeneralUtility::shortMD5($serializedParameters),
            'configuration_hash' => $configurationHash,
            'scheduled' => $tstamp,
            'exec_time' => 0,
            'set_id' => intval($this->setID),
            'result_data' => '',
            'configuration' => $subCfg['key'],
        ];

        if ($this->registerQueueEntriesInternallyOnly) {
            // the entries will only be registered and not stored to the database
            $this->queueEntries[] = $fieldArray;

            return false;
        }

        // check if there is already an equal entry
        $duplicateRows = [];
        if (!$skipInnerDuplicationCheck) {
            $duplicateRows = $this->getDuplicateRowsIfExist($tstamp, $fieldArray);
        }

        if (count($duplicateRows) != 0) {
            EventDispatcher::getInstance()->post(
                'duplicateUrlInQueue',
                $this->setID,
                ['rows' => $duplicateRows, 'fieldArray' => $fieldArray]
            );

            return false;
        }

        $this->db->exec_INSERTquery('tx_crawler_queue', $fieldArray);
        $uid = $this->db->sql_insert_id();
        EventDispatcher::getInstance()->post(
            'urlAddedToQueue',
            $this->setID,
            ['uid' => $uid, 'fieldArray' => $fieldArray]
        );

        return true;
    }
1206
1207
    /**
     * This method determines duplicates for a queue entry with the same parameters and this timestamp.
     * If the timestamp is not in the future, it checks for any matching entry scheduled up to now
     * (with the "enableTimeslot" extension setting additionally within 100 seconds around now).
     * If the timestamp is in the future, the queued entry must have exactly the same timestamp.
     * Only entries with the same page_id and parameters_hash that have neither an exec_time nor a
     * process_id are considered.
     *
     * @param int $tstamp
     * @param array $fieldArray Needs the keys "page_id" and "parameters_hash"
     *
     * @return array List of qid values of the duplicate entries
     */
    protected function getDuplicateRowsIfExist($tstamp, $fieldArray)
    {
        // Both values are concatenated into SQL below, so force them to integers.
        $tstamp = (int)$tstamp;
        $currentTime = (int)$this->getCurrentTime();

        if ($tstamp > $currentTime) {
            // entry with a timestamp in the future need to have the same schedule time
            $where = 'scheduled = ' . $tstamp;
        } elseif (!empty($this->extensionSettings['enableTimeslot'])) {
            // entry is scheduled with "now": also accept entries shortly ahead of now
            $timeBegin = $currentTime - 100;
            $timeEnd = $currentTime + 100;
            $where = ' ((scheduled BETWEEN ' . $timeBegin . ' AND ' . $timeEnd . ' ) OR scheduled <= ' . $currentTime . ') ';
        } else {
            $where = 'scheduled <= ' . $currentTime;
        }

        $result = $this->db->exec_SELECTgetRows(
            'qid',
            'tx_crawler_queue',
            $where .
            ' AND NOT exec_time' .
            ' AND NOT process_id ' .
            ' AND page_id=' . intval($fieldArray['page_id']) .
            ' AND parameters_hash = ' . $this->db->fullQuoteStr($fieldArray['parameters_hash'], 'tx_crawler_queue')
        );

        $rows = [];
        if (is_array($result)) {
            foreach ($result as $value) {
                $rows[] = $value['qid'];
            }
        }

        return $rows;
    }
1257
1258
    /**
     * Provides the current system time as reported by time().
     *
     * @return int Unix timestamp
     */
    public function getCurrentTime()
    {
        $now = time();

        return $now;
    }
1267
1268
    /************************************
1269
     *
1270
     * URL reading
1271
     *
1272
     ************************************/
1273
1274
    /**
     * Read URL for single queue entry
     *
     * Loads the queue record, locks it by writing exec_time (and process_id_completed
     * when a process ID is set), executes it through readUrl_exec() and finally stores
     * the serialized result in the queue record.
     *
     * @param integer $queueId Queue entry ID (qid)
     * @param boolean $force If set, will process even if exec_time has been set!
     * @return integer|null 0, or a bitmask containing self::CLI_STATUS_POLLABLE_PROCESSED when a
     *                      registered "pollSuccess" instruction reported success; NULL when no
     *                      matching queue entry was found
     */
    public function readUrl($queueId, $force = false)
    {
        $ret = 0;
        if ($this->debugMode) {
            GeneralUtility::devlog('crawler-readurl start ' . microtime(true), __FUNCTION__);
        }

        // Get entry:
        $rows = $this->db->exec_SELECTgetRows(
            '*',
            'tx_crawler_queue',
            'qid=' . intval($queueId) . ($force ? '' : ' AND exec_time=0 AND process_scheduled > 0')
        );
        // Avoid list() on an empty/failed result set, which raises an "undefined offset" notice
        $queueRec = (is_array($rows) && isset($rows[0])) ? $rows[0] : null;

        if (!is_array($queueRec)) {
            return;
        }

        $parameters = unserialize($queueRec['parameters']);
        if (is_array($parameters) && !empty($parameters['rootTemplatePid'])) {
            $this->initTSFE((int)$parameters['rootTemplatePid']);
        } else {
            GeneralUtility::sysLog(
                'Page with (' . $queueRec['page_id'] . ') could not be crawled, please check your crawler configuration. Perhaps no Root Template Pid is set',
                'crawler',
                GeneralUtility::SYSLOG_SEVERITY_WARNING
            );
        }

        SignalSlotUtility::emitSignal(
            __CLASS__,
            SignalSlotUtility::SIGNNAL_QUEUEITEM_PREPROCESS,
            [$queueId, &$queueRec]
        );

        // Set exec_time to lock record:
        $field_array = ['exec_time' => $this->getCurrentTime()];

        if (isset($this->processID)) {
            // If multiprocessing is used we need to store the id of the process which has handled this entry
            $field_array['process_id_completed'] = $this->processID;
        }
        $this->db->exec_UPDATEquery('tx_crawler_queue', 'qid=' . intval($queueId), $field_array);

        $result = $this->readUrl_exec($queueRec);

        // readUrl_exec() may return the string 'ERROR' or FALSE instead of an array,
        // in which case there is no 'content' key to decode.
        $resultData = (is_array($result) && isset($result['content'])) ? unserialize($result['content']) : false;

        // The list of executed processing instructions does not change per pollable, so resolve it once
        $procInstructions = (is_array($resultData)
            && isset($resultData['parameters']['procInstructions'])
            && is_array($resultData['parameters']['procInstructions']))
            ? $resultData['parameters']['procInstructions']
            : null;

        // At the moment there's no need to point to specific pollable extensions
        if ($procInstructions !== null
            && isset($GLOBALS['TYPO3_CONF_VARS']['EXTCONF']['crawler']['pollSuccess'])
            && is_array($GLOBALS['TYPO3_CONF_VARS']['EXTCONF']['crawler']['pollSuccess'])
        ) {
            foreach ($GLOBALS['TYPO3_CONF_VARS']['EXTCONF']['crawler']['pollSuccess'] as $pollable) {
                // Only check the success value if the instruction is running.
                // It is important to name the pollSuccess key same as the procInstructions key.
                if (in_array($pollable, $procInstructions) && !empty($resultData['success'][$pollable])) {
                    $ret |= self::CLI_STATUS_POLLABLE_PROCESSED;
                }
            }
        }

        // Set result in log which also denotes the end of the processing of this entry.
        $field_array = ['result_data' => serialize($result)];

        SignalSlotUtility::emitSignal(
            __CLASS__,
            SignalSlotUtility::SIGNNAL_QUEUEITEM_POSTPROCESS,
            [$queueId, &$field_array]
        );

        $this->db->exec_UPDATEquery('tx_crawler_queue', 'qid=' . intval($queueId), $field_array);

        if ($this->debugMode) {
            GeneralUtility::devlog('crawler-readurl stop ' . microtime(true), __FUNCTION__);
        }

        return $ret;
    }
1361
1362
    /**
     * Executes a queue entry that has not been stored yet.
     *
     * The entry is inserted into tx_crawler_queue with exec_time already set,
     * executed via readUrl_exec(), and the serialized result is written back to
     * the new record after the post-process signal has been emitted.
     *
     * @param array $field_array Queue field array
     *
     * @return mixed Whatever readUrl_exec() returned for the entry
     */
    public function readUrlFromArray($field_array)
    {
        // Mark the record as executed right away so it is locked:
        $field_array['exec_time'] = $this->getCurrentTime();
        $this->db->exec_INSERTquery('tx_crawler_queue', $field_array);

        $queueId = $this->db->sql_insert_id();
        $field_array['qid'] = $queueId;

        $crawlResult = $this->readUrl_exec($field_array);

        // Storing the result also denotes the end of the processing of this entry.
        $resultFields = ['result_data' => serialize($crawlResult)];

        SignalSlotUtility::emitSignal(
            __CLASS__,
            SignalSlotUtility::SIGNNAL_QUEUEITEM_POSTPROCESS,
            [$queueId, &$resultFields]
        );

        $this->db->exec_UPDATEquery('tx_crawler_queue', 'qid=' . intval($queueId), $resultFields);

        return $crawlResult;
    }
1392
1393
    /**
     * Read URL for a queue record
     *
     * If the decoded parameters contain a "_CALLBACKOBJ" entry, the referenced object's
     * crawler_execute() method is called; otherwise a regular frontend request is sent
     * through requestUrl() and an "urlCrawled" event is posted.
     *
     * @param array $queueRec Queue record
     * @return array|string|boolean The string "ERROR" if the record's parameters could not be decoded,
     *                              an array with a "content" key for callback objects,
     *                              or the return value of requestUrl() for regular requests
     */
    public function readUrl_exec($queueRec)
    {
        $result = 'ERROR';

        // Decode parameters:
        $parameters = unserialize($queueRec['parameters']);
        if (!is_array($parameters)) {
            return $result;
        }

        if (!empty($parameters['_CALLBACKOBJ'])) {
            // Calling object:
            $objRef = $parameters['_CALLBACKOBJ'];
            // Plain assignment: assigning a function's return value by reference raises a notice
            $callBackObj = GeneralUtility::getUserObj($objRef);
            if (is_object($callBackObj)) {
                unset($parameters['_CALLBACKOBJ']);
                $result = ['content' => serialize($callBackObj->crawler_execute($parameters, $this))];
            } else {
                $result = ['content' => 'No object: ' . $objRef];
            }

            return $result;
        }

        // Regular FE request:

        // Prepare:
        $crawlerId = $queueRec['qid'] . ':' . md5($queueRec['qid'] . '|' . $queueRec['set_id'] . '|' . $GLOBALS['TYPO3_CONF_VARS']['SYS']['encryptionKey']);

        // Get result:
        $result = $this->requestUrl($parameters['url'], $crawlerId);

        EventDispatcher::getInstance()->post('urlCrawled', $queueRec['set_id'], ['url' => $parameters['url'], 'result' => $result]);

        return $result;
    }
1428
1429
    /**
     * Gets the content of a URL.
     *
     * Depending on the extension settings the request is either delegated to
     * sendDirectRequest() or sent as a raw HTTP/1.0 request via fsockopen(), optionally
     * through the proxy configured in $GLOBALS['TYPO3_CONF_VARS']['SYS']['curlProxyServer'].
     * If "follow30x" is enabled, redirects are followed recursively and the result of the
     * redirecting request is kept under the "parentRequest" key.
     *
     * @param string $originalUrl URL to read
     * @param string $crawlerId Crawler ID string (qid + hash to verify)
     * @param integer $timeout Timeout time
     * @param integer $recursion Recursion limiter for 302 redirects
     * @return array|boolean Result array ("request", "headers", "content" and optionally "parentRequest"
     *                       for socket requests), or FALSE on failure
     */
    public function requestUrl($originalUrl, $crawlerId, $timeout = 2, $recursion = 10)
    {
        if (!$recursion) {
            return false;
        }

        // Parse URL, checking for scheme:
        $url = parse_url($originalUrl);

        if ($url === false) {
            if (TYPO3_DLOG) {
                GeneralUtility::devLog(sprintf('Could not parse_url() for string "%s"', $originalUrl), 'crawler', 4, ['crawlerId' => $crawlerId]);
            }
            return false;
        }

        // parse_url() omits keys for missing URL components
        $scheme = isset($url['scheme']) ? $url['scheme'] : '';

        if (!in_array($scheme, ['', 'http', 'https'], true)) {
            if (TYPO3_DLOG) {
                GeneralUtility::devLog(sprintf('Scheme does not match for url "%s"', $originalUrl), 'crawler', 4, ['crawlerId' => $crawlerId]);
            }
            return false;
        }

        // direct request
        if ($this->extensionSettings['makeDirectRequests']) {
            $result = $this->sendDirectRequest($originalUrl, $crawlerId);
            return $result;
        }

        $reqHeaders = $this->buildRequestHeaderArray($url, $crawlerId);

        // thanks to Pierrick Caillon for adding proxy support
        $rurl = $url;

        if ($GLOBALS['TYPO3_CONF_VARS']['SYS']['curlUse'] && $GLOBALS['TYPO3_CONF_VARS']['SYS']['curlProxyServer']) {
            $rurl = parse_url($GLOBALS['TYPO3_CONF_VARS']['SYS']['curlProxyServer']);
            // When talking to a proxy the request line carries the absolute URL
            $url['path'] = $scheme . '://' . $url['host']
                . (!empty($url['port']) ? ':' . $url['port'] : '')
                . (isset($url['path']) ? $url['path'] : '');
            $reqHeaders = $this->buildRequestHeaderArray($url, $crawlerId);
        }

        $host = $rurl['host'];

        if ($scheme == 'https') {
            $host = 'ssl://' . $host;
            $port = !empty($rurl['port']) ? $rurl['port'] : 443;
        } else {
            $port = !empty($rurl['port']) ? $rurl['port'] : 80;
        }

        $startTime = microtime(true);
        $fp = fsockopen($host, $port, $errno, $errstr, $timeout);

        if (!$fp) {
            if (TYPO3_DLOG) {
                GeneralUtility::devLog(sprintf('Error while opening "%s"', $originalUrl), 'crawler', 4, ['crawlerId' => $crawlerId]);
            }
            return false;
        }

        // Request message:
        $msg = implode("\r\n", $reqHeaders) . "\r\n\r\n";
        fputs($fp, $msg);

        // Read response:
        $d = $this->getHttpResponseFromStream($fp);
        fclose($fp);

        $time = microtime(true) - $startTime;
        $this->log($originalUrl . ' ' . $time);

        // Implode content and headers:
        $result = [
            'request' => $msg,
            'headers' => implode('', $d['headers']),
            'content' => implode('', (array)$d['content'])
        ];

        if (!empty($this->extensionSettings['follow30x'])) {
            $user = isset($url['user']) ? $url['user'] : '';
            $pass = isset($url['pass']) ? $url['pass'] : '';
            $newUrl = $this->getRequestUrlFrom302Header($d['headers'], $user, $pass);

            if ($newUrl) {
                // Follow the redirect exactly once per hop, passing on the timeout
                // and a decremented recursion counter.
                $newRequestUrl = $this->requestUrl($newUrl, $crawlerId, $timeout, --$recursion);

                if (is_array($newRequestUrl)) {
                    $result = array_merge(['parentRequest' => $result], $newRequestUrl);
                } else {
                    if (TYPO3_DLOG) {
                        GeneralUtility::devLog(sprintf('Error while opening "%s"', $newUrl), 'crawler', 4, ['crawlerId' => $crawlerId]);
                    }
                    return false;
                }
            }
        }

        return $result;
    }
1531
1532
    /**
     * Determines the base path of the website frontend.
     * (e.g. for http://mydomain.com/cms/index.php the base path is "/cms/")
     *
     * Sources, in order of precedence: the "frontendBasePath" extension setting,
     * $GLOBALS['TSFE']->absRefPrefix, and - outside of CLI mode - the TYPO3_SITE_PATH
     * environment value. If none applies, "/" is used.
     *
     * @return string Base path of the website frontend
     */
    protected function getFrontendBasePath()
    {
        $isCli = defined('TYPO3_REQUESTTYPE_CLI') && TYPO3_REQUESTTYPE_CLI;

        if (!empty($this->extensionSettings['frontendBasePath'])) {
            // Explicit extension setting wins:
            $basePath = $this->extensionSettings['frontendBasePath'];
        } elseif (!empty($GLOBALS['TSFE']->absRefPrefix)) {
            // Otherwise try config.absRefPrefix:
            $basePath = $GLOBALS['TSFE']->absRefPrefix;
        } elseif (!$isCli) {
            // Outside of CLI mode the path can be taken from the $_SERVER environment:
            $basePath = GeneralUtility::getIndpEnv('TYPO3_SITE_PATH');
        } else {
            $basePath = '/';
        }

        if ($basePath == '/') {
            return $basePath;
        }

        // Normalize to '/<pathSegments>/':
        return rtrim('/' . ltrim($basePath, '/'), '/') . '/';
    }
1562
1563
    /**
     * Runs a shell command via shell_exec() and hands back whatever it printed.
     *
     * @param string $command Shell command to be executed
     * @return string Outputted result of the command execution
     */
    protected function executeShellCommand($command)
    {
        return shell_exec($command);
    }
1574
1575
    /**
     * Reads HTTP response from the given stream.
     *
     * Header lines are read (and trimmed) up to the first empty line; everything after
     * that is collected unmodified as content. If the argument is not a resource, an
     * empty response structure is returned.
     *
     * @param  resource $streamPointer  Pointer to connection stream.
     * @return array                    Associative array with the following items:
     *                                  headers <array> Response headers sent by server.
     *                                  content <array> Content, with each line as an array item.
     */
    protected function getHttpResponseFromStream($streamPointer)
    {
        $response = ['headers' => [], 'content' => []];

        if (!is_resource($streamPointer)) {
            return $response;
        }

        // Read headers. Compare against FALSE explicitly: a chunk such as "0" is falsy
        // but still valid data and must not end the loop.
        while (($line = fgets($streamPointer, 2048)) !== false) {
            $line = trim($line);
            if ($line === '') {
                // An empty line separates headers from content
                break;
            }
            $response['headers'][] = $line;
        }

        // Read content
        while (($line = fgets($streamPointer, 2048)) !== false) {
            $response['content'][] = $line;
        }

        return $response;
    }
1606
1607
    /**
     * Appends a timestamped line to the log file named in the "logFileName" extension setting.
     *
     * Does nothing when no log file is configured. If the file cannot be written,
     * a devLog entry is created instead.
     *
     * @param string $message Text to append to the log file
     * @return void
     */
    protected function log($message)
    {
        if (empty($this->extensionSettings['logFileName'])) {
            return;
        }

        $logFile = $this->extensionSettings['logFileName'];
        $entry = date('Ymd His') . ' ' . $message . PHP_EOL;
        $bytesWritten = @file_put_contents($logFile, $entry, FILE_APPEND);

        if ($bytesWritten) {
            return;
        }

        GeneralUtility::devLog('File "' . $logFile . '" could not be written, please check file permissions.', 'crawler', LogLevel::INFO);
    }
1619
1620
    /**
     * Builds HTTP request headers.
     *
     * Produces the request line and header lines of an HTTP/1.0 GET request. A cookie
     * header is added when the query string contains "ADMCMD_previewWS", and a Basic
     * Authorization header when the URL carries a user name.
     *
     * @param array $url URL components as returned by parse_url(); missing components are tolerated
     * @param string $crawlerId Value sent in the "X-T3crawler" header
     *
     * @return array List of header lines (without line endings)
     */
    protected function buildRequestHeaderArray(array $url, $crawlerId)
    {
        // parse_url() omits keys for components that are not part of the URL
        $path = (isset($url['path']) && $url['path'] !== '') ? $url['path'] : '/';
        $query = isset($url['query']) ? (string)$url['query'] : '';
        $host = isset($url['host']) ? $url['host'] : '';
        $user = isset($url['user']) ? (string)$url['user'] : '';
        $pass = isset($url['pass']) ? $url['pass'] : '';

        $reqHeaders = [];
        $reqHeaders[] = 'GET ' . $path . ($query !== '' ? '?' . $query : '') . ' HTTP/1.0';
        $reqHeaders[] = 'Host: ' . $host;
        if (stripos($query, 'ADMCMD_previewWS') !== false) {
            $reqHeaders[] = 'Cookie: $Version="1"; be_typo_user="1"; $Path=/';
        }
        $reqHeaders[] = 'Connection: close';
        if ($user !== '') {
            $reqHeaders[] = 'Authorization: Basic ' . base64_encode($user . ':' . $pass);
        }
        $reqHeaders[] = 'X-T3crawler: ' . $crawlerId;
        $reqHeaders[] = 'User-Agent: TYPO3 crawler';

        return $reqHeaders;
    }
1644
1645
    /**
     * Check if the submitted HTTP-Header contains a redirect location and build the new crawler URL
     *
     * Only responses whose status line contains "301 Moved", "302 Found" or "302 Moved"
     * are treated as redirects. If a user name is given, the credentials are inserted
     * into the redirect target.
     *
     * @param array $headers HTTP Header lines, status line first
     * @param string $user HTTP Auth. User
     * @param string $pass HTTP Auth. Password
     * @return bool|string The URL to request next, or FALSE if the headers do not describe a usable redirect
     */
    protected function getRequestUrlFrom302Header($headers, $user = '', $pass = '')
    {
        if (!is_array($headers) || !isset($headers[0])) {
            return false;
        }

        $statusLine = $headers[0];
        $isRedirect = stripos($statusLine, '301 Moved') !== false
            || stripos($statusLine, '302 Found') !== false
            || stripos($statusLine, '302 Moved') !== false;
        if (!$isRedirect) {
            return false;
        }

        $location = '';
        foreach ($headers as $headerLine) {
            // Split at the first colon only, so colons inside the value (e.g. "http://") survive;
            // lines without a colon (such as the status line) are skipped.
            $parts = explode(':', $headerLine, 2);
            if (count($parts) === 2 && strcasecmp(trim($parts[0]), 'Location') === 0) {
                $location = trim($parts[1]);
                break;
            }
        }
        if ($location === '') {
            return false;
        }

        if ($user == '') {
            return $location;
        }

        // Re-apply the HTTP auth credentials to the redirect target
        $target = parse_url($location);
        if (!$target) {
            return false;
        }

        $newUrl = (isset($target['scheme']) ? $target['scheme'] : '') . '://'
            . $user . ':' . $pass . '@'
            . (isset($target['host']) ? $target['host'] : '')
            . (isset($target['port']) ? ':' . $target['port'] : '')
            . (isset($target['path']) ? $target['path'] : '');
        if (isset($target['query']) && $target['query'] != '') {
            $newUrl .= '?' . $target['query'];
        }

        return $newUrl;
    }
1687
1688
    /**************************
1689
     *
1690
     * tslib_fe hooks:
1691
     *
1692
     **************************/
1693
1694
    /**
     * Initialization hook (called after database connection)
     * Takes the "HTTP_X_T3CRAWLER" header and looks up queue record and verifies if the session comes from the system (by comparing hashes)
     *
     * On success the crawler state is stored in $params['pObj']->applicationData['tx_crawler'];
     * if the header is present but does not match a queue record, script execution is terminated.
     *
     * @param array $params Parameters from frontend
     * @param object $ref TSFE object (reference under PHP5), not used by this method
     * @return void
     *
     * FIXME: Look like this is not used, in commit 9910d3f40cce15f4e9b7bcd0488bf21f31d53ebc it's added as public,
     * FIXME: I think this can be removed. (TNM)
     */
    public function fe_init(&$params, $ref)
    {
        // Only crawler requests carry this header:
        if (!isset($_SERVER['HTTP_X_T3CRAWLER'])) {
            return;
        }

        // Expected format is "<qid>:<hash>"; pad so that a malformed header does not raise a notice
        list($queueId, $hash) = array_pad(explode(':', $_SERVER['HTTP_X_T3CRAWLER']), 2, '');

        // exec_SELECTgetSingleRow() already returns the row itself (associative array),
        // so it must not be destructured with list().
        $queueRec = $this->db->exec_SELECTgetSingleRow('*', 'tx_crawler_queue', 'qid=' . intval($queueId));

        // If a crawler record was found and hash was matching, set it up:
        if (is_array($queueRec) && $hash === md5($queueRec['qid'] . '|' . $queueRec['set_id'] . '|' . $GLOBALS['TYPO3_CONF_VARS']['SYS']['encryptionKey'])) {
            $params['pObj']->applicationData['tx_crawler']['running'] = true;
            $params['pObj']->applicationData['tx_crawler']['parameters'] = unserialize($queueRec['parameters']);
            $params['pObj']->applicationData['tx_crawler']['log'] = [];
        } else {
            die('No crawler entry found!');
        }
    }
1722
1723
    /*****************************
1724
     *
1725
     * Compiling URLs to crawl - tools
1726
     *
1727
     *****************************/
1728
1729
    /**
     * Traverses the page tree below the given page and compiles the URL rows for every page.
     *
     * The passed settings are stored on the controller, the duplicate tracking and download
     * URL lists are reset, and drawURLs_addRowsForPage() is called for every page of the tree.
     * For mount point pages (doktype 7) the mounted tree is traversed as well.
     *
     * @param integer $id Root page id to start from.
     * @param integer $depth Depth of tree, 0=only id-page, 1= on sublevel, 99 = infinite
     * @param integer $scheduledTime Unix Time when the URL is timed to be visited when put in queue
     * @param integer $reqMinute Number of requests per minute (creates the interleave between requests)
     * @param boolean $submitCrawlUrls If set, submits the URLs to queue in database (real crawling)
     * @param boolean $downloadCrawlUrls If set (and submitcrawlUrls is false) will fill $downloadUrls with entries)
     * @param array $incomingProcInstructions Array of processing instructions
     * @param array $configurationSelection Array of configuration keys
     * @return string Concatenated output of drawURLs_addRowsForPage() for all traversed pages
     */
    public function getPageTreeAndUrls(
        $id,
        $depth,
        $scheduledTime,
        $reqMinute,
        $submitCrawlUrls,
        $downloadCrawlUrls,
        array $incomingProcInstructions,
        array $configurationSelection
    ) {
        global $LANG;
        if (!is_object($LANG)) {
            $LANG = GeneralUtility::makeInstance('language');
            $LANG->init(0);
        }
        $this->scheduledTime = $scheduledTime;
        $this->reqMinute = $reqMinute;
        $this->submitCrawlUrls = $submitCrawlUrls;
        $this->downloadCrawlUrls = $downloadCrawlUrls;
        $this->incomingProcInstructions = $incomingProcInstructions;
        $this->incomingConfigurationSelection = $configurationSelection;

        $this->duplicateTrack = [];
        $this->downloadUrls = [];

        // Drawing tree:
        /* @var PageTreeView $tree */
        $tree = GeneralUtility::makeInstance(PageTreeView::class);
        $perms_clause = $GLOBALS['BE_USER']->getPagePermsClause(1);
        $tree->init('AND ' . $perms_clause);

        $pageInfo = BackendUtility::readPageAccess($id, $perms_clause);
        if (is_array($pageInfo)) {
            // Set root row:
            $tree->tree[] = [
                'row' => $pageInfo,
                'HTML' => IconUtility::getIconForRecord('pages', $pageInfo)
            ];
        }

        // Get branch beneath:
        if ($depth) {
            $tree->getTree($id, $depth, '');
        }

        // Traverse page tree:
        $code = '';

        foreach ($tree->tree as $data) {
            $this->MP = false;

            // recognize mount points
            if ($data['row']['doktype'] == 7) {
                // Cast the uid so that only an integer ever reaches the SQL string
                $mountpage = $this->db->exec_SELECTgetRows('*', 'pages', 'uid = ' . (int)$data['row']['uid']);

                // Only handle the mount point if its page record could actually be loaded
                if (is_array($mountpage) && isset($mountpage[0])) {
                    $mountRow = $mountpage[0];

                    // fetch mounted pages
                    $this->MP = $mountRow['mount_pid'] . '-' . $data['row']['uid'];

                    $mountTree = GeneralUtility::makeInstance(PageTreeView::class);
                    $mountTree->init('AND ' . $perms_clause);
                    $mountTree->getTree($mountRow['mount_pid'], $depth, '');

                    foreach ($mountTree->tree as $mountData) {
                        $code .= $this->drawURLs_addRowsForPage(
                            $mountData['row'],
                            $mountData['HTML'] . BackendUtility::getRecordTitle('pages', $mountData['row'], true)
                        );
                    }

                    // replace page when mount_pid_ol is enabled
                    if ($mountRow['mount_pid_ol']) {
                        $data['row']['uid'] = $mountRow['mount_pid'];
                    } else {
                        // if the mount_pid_ol is not set the MP must not be used for the mountpoint page
                        $this->MP = false;
                    }
                }
            }

            $code .= $this->drawURLs_addRowsForPage(
                $data['row'],
                $data['HTML'] . BackendUtility::getRecordTitle('pages', $data['row'], true)
            );
        }

        return $code;
    }
1827
1828
    /**
     * Expands exclude string
     *
     * The exclude string is a comma separated list of entries of the form "<pid>" or
     * "<pid>+<depth>". "<pid>" excludes only that page, "<pid>+<depth>" additionally
     * excludes the pages of the tree below it down to the given depth; "<pid>+" without
     * a depth uses depth 99. Results are cached per exclude string for the duration of
     * the request.
     *
     * @param string $excludeString Exclude string
     * @return array List of excluded page ids
     */
    public function expandExcludeString($excludeString)
    {
        // internal static caches;
        static $expandedExcludeStringCache = [];
        static $treeCache = [];

        // isset() instead of empty(): an empty expansion is a valid cached result, too
        if (!isset($expandedExcludeStringCache[$excludeString])) {
            $pidList = [];

            if (!empty($excludeString)) {
                /** @var PageTreeView $tree */
                $tree = GeneralUtility::makeInstance(PageTreeView::class);
                $tree->init('AND ' . $this->backendUser->getPagePermsClause(1));

                $excludeParts = GeneralUtility::trimExplode(',', $excludeString);

                foreach ($excludeParts as $excludePart) {
                    // Pad so that entries without "+" do not raise an "undefined offset" notice
                    list($pid, $depth) = array_pad(GeneralUtility::trimExplode('+', $excludePart), 2, '');

                    // default is "page only" = "depth=0"
                    if (empty($depth)) {
                        $depth = (strpos($excludePart, '+') !== false) ? 99 : 0;
                    }

                    $pidList[] = $pid;

                    if ($depth > 0) {
                        if (!isset($treeCache[$pid][$depth])) {
                            $tree->reset();
                            $tree->getTree($pid, $depth);
                            $treeCache[$pid][$depth] = $tree->tree;
                        }

                        foreach ($treeCache[$pid][$depth] as $data) {
                            $pidList[] = $data['row']['uid'];
                        }
                    }
                }
            }

            $expandedExcludeStringCache[$excludeString] = array_unique($pidList);
        }

        return $expandedExcludeStringCache[$excludeString];
    }
1879
1880
    /**
1881
     * Create the rows for display of the page tree
1882
     * For each page a number of rows are shown displaying GET variable configuration
1883
     *
1884
     * @param    array        Page row
1885
     * @param    string        Page icon and title for row
1886
     * @return    string        HTML <tr> content (one or more)
1887
     */
1888
    public function drawURLs_addRowsForPage(array $pageRow, $pageTitleAndIcon)
1889
    {
1890
        $skipMessage = '';
1891
1892
        // Get list of configurations
1893
        $configurations = $this->getUrlsForPageRow($pageRow, $skipMessage);
1894
1895
        if (count($this->incomingConfigurationSelection) > 0) {
1896
            // remove configuration that does not match the current selection
1897
            foreach ($configurations as $confKey => $confArray) {
1898
                if (!in_array($confKey, $this->incomingConfigurationSelection)) {
1899
                    unset($configurations[$confKey]);
1900
                }
1901
            }
1902
        }
1903
1904
        // Traverse parameter combinations:
1905
        $c = 0;
1906
        $content = '';
1907
        if (count($configurations)) {
1908
            foreach ($configurations as $confKey => $confArray) {
1909
1910
                    // Title column:
1911
                if (!$c) {
1912
                    $titleClm = '<td rowspan="' . count($configurations) . '">' . $pageTitleAndIcon . '</td>';
1913
                } else {
1914
                    $titleClm = '';
1915
                }
1916
1917
                if (!in_array($pageRow['uid'], $this->expandExcludeString($confArray['subCfg']['exclude']))) {
1918
1919
                        // URL list:
1920
                    $urlList = $this->urlListFromUrlArray(
1921
                        $confArray,
1922
                        $pageRow,
1923
                        $this->scheduledTime,
1924
                        $this->reqMinute,
1925
                        $this->submitCrawlUrls,
1926
                        $this->downloadCrawlUrls,
1927
                        $this->duplicateTrack,
1928
                        $this->downloadUrls,
1929
                        $this->incomingProcInstructions // if empty the urls won't be filtered by processing instructions
1930
                    );
1931
1932
                    // Expanded parameters:
1933
                    $paramExpanded = '';
1934
                    $calcAccu = [];
1935
                    $calcRes = 1;
1936
                    foreach ($confArray['paramExpanded'] as $gVar => $gVal) {
1937
                        $paramExpanded .= '
1938
                            <tr>
1939
                                <td class="bgColor4-20">' . htmlspecialchars('&' . $gVar . '=') . '<br/>' .
1940
                                                '(' . count($gVal) . ')' .
1941
                                                '</td>
1942
                                <td class="bgColor4" nowrap="nowrap">' . nl2br(htmlspecialchars(implode(chr(10), $gVal))) . '</td>
1943
                            </tr>
1944
                        ';
1945
                        $calcRes *= count($gVal);
1946
                        $calcAccu[] = count($gVal);
1947
                    }
1948
                    $paramExpanded = '<table class="lrPadding c-list param-expanded">' . $paramExpanded . '</table>';
1949
                    $paramExpanded .= 'Comb: ' . implode('*', $calcAccu) . '=' . $calcRes;
1950
1951
                    // Options
1952
                    $optionValues = '';
1953
                    if ($confArray['subCfg']['userGroups']) {
1954
                        $optionValues .= 'User Groups: ' . $confArray['subCfg']['userGroups'] . '<br/>';
1955
                    }
1956
                    if ($confArray['subCfg']['baseUrl']) {
1957
                        $optionValues .= 'Base Url: ' . $confArray['subCfg']['baseUrl'] . '<br/>';
1958
                    }
1959
                    if ($confArray['subCfg']['procInstrFilter']) {
1960
                        $optionValues .= 'ProcInstr: ' . $confArray['subCfg']['procInstrFilter'] . '<br/>';
1961
                    }
1962
1963
                    // Compile row:
1964
                    $content .= '
1965
                        <tr class="bgColor' . ($c % 2 ? '-20' : '-10') . '">
1966
                            ' . $titleClm . '
1967
                            <td>' . htmlspecialchars($confKey) . '</td>
1968
                            <td>' . nl2br(htmlspecialchars(rawurldecode(trim(str_replace('&', chr(10) . '&', GeneralUtility::implodeArrayForUrl('', $confArray['paramParsed'])))))) . '</td>
1969
                            <td>' . $paramExpanded . '</td>
1970
                            <td nowrap="nowrap">' . $urlList . '</td>
1971
                            <td nowrap="nowrap">' . $optionValues . '</td>
1972
                            <td nowrap="nowrap">' . DebugUtility::viewArray($confArray['subCfg']['procInstrParams.']) . '</td>
1973
                        </tr>';
1974
                } else {
1975
                    $content .= '<tr class="bgColor' . ($c % 2 ? '-20' : '-10') . '">
1976
                            ' . $titleClm . '
1977
                            <td>' . htmlspecialchars($confKey) . '</td>
1978
                            <td colspan="5"><em>No entries</em> (Page is excluded in this configuration)</td>
1979
                        </tr>';
1980
                }
1981
1982
                $c++;
1983
            }
1984
        } else {
1985
            $message = !empty($skipMessage) ? ' (' . $skipMessage . ')' : '';
1986
1987
            // Compile row:
1988
            $content .= '
1989
                <tr class="bgColor-20" style="border-bottom: 1px solid black;">
1990
                    <td>' . $pageTitleAndIcon . '</td>
1991
                    <td colspan="6"><em>No entries</em>' . $message . '</td>
1992
                </tr>';
1993
        }
1994
1995
        return $content;
1996
    }
1997
1998
    /**
     * Counts the queue entries that are due but have not been picked up yet.
     *
     * An entry is counted when it has not been executed (exec_time=0), is not
     * reserved by any process (process_scheduled=0) and its scheduled time is
     * not later than the current time.
     *
     * @return int number of matching rows, 0 if the count could not be read
     */
    public function getUnprocessedItemsCount()
    {
        $res = $this->db->exec_SELECTquery(
            'count(*) as num',
            'tx_crawler_queue',
            'exec_time=0 AND process_scheduled=0 AND scheduled<=' . $this->getCurrentTime()
        );

        $count = $this->db->sql_fetch_assoc($res);

        // sql_fetch_assoc() may hand the aggregate back as a numeric string, or
        // yield no row at all when the query failed; normalise to the documented int.
        return is_array($count) ? (int) $count['num'] : 0;
    }
2012
2013
    /*****************************
2014
     *
2015
     * CLI functions
2016
     *
2017
     *****************************/
2018
2019
    /**
     * Main function for running from Command Line PHP script (cron job)
     * See ext/crawler/cli/crawler_cli.phpsh for details
     *
     * Prints the help text and terminates when -h/--help was passed. Otherwise it
     * tries to register a crawler process, works through one batch of queue
     * entries via CLI_run() and releases the process again afterwards.
     *
     * @return int combination of the CLI_STATUS_* flags describing the outcome of the run
     */
    public function CLI_main()
    {
        $this->setAccessMode('cli');
        $result = self::CLI_STATUS_NOTHING_PROCCESSED;
        $cliObj = GeneralUtility::makeInstance(CrawlerCommandLineController::class);

        if (isset($cliObj->cli_args['-h']) || isset($cliObj->cli_args['--help'])) {
            $cliObj->cli_validateArgs();
            $cliObj->cli_help();
            exit;
        }

        // Crawler disabled or no free process slot: report the abort and do nothing else.
        if ($this->getDisabled() || !$this->CLI_checkAndAcquireNewProcess($this->CLI_buildProcessId())) {
            return $result | self::CLI_STATUS_ABORTED;
        }

        // Command line arguments take precedence over the extension settings.
        $countInARun = $cliObj->cli_argValue('--countInARun') ? intval($cliObj->cli_argValue('--countInARun')) : $this->extensionSettings['countInARun'];
        // Seconds
        $sleepAfterFinish = $cliObj->cli_argValue('--sleepAfterFinish') ? intval($cliObj->cli_argValue('--sleepAfterFinish')) : $this->extensionSettings['sleepAfterFinish'];
        // Pause between two requests. NOTE(review): historically labelled "Milliseconds",
        // but CLI_run() passes the value to usleep(), which takes microseconds - confirm the unit.
        $sleepTime = $cliObj->cli_argValue('--sleepTime') ? intval($cliObj->cli_argValue('--sleepTime')) : $this->extensionSettings['sleepTime'];

        try {
            // Run process:
            $result = $this->CLI_run($countInARun, $sleepTime, $sleepAfterFinish);
        } catch (\Exception $e) {
            $this->CLI_debug(get_class($e) . ': ' . $e->getMessage());
            $result = self::CLI_STATUS_ABORTED;
        }

        // Cleanup
        $this->db->exec_DELETEquery('tx_crawler_process', 'assigned_items_count = 0');

        //TODO can't we do that in a clean way?
        $this->CLI_releaseProcesses($this->CLI_buildProcessId());

        // Query the backlog once so the logged number and the status flag always agree.
        $unprocessedItems = $this->getUnprocessedItemsCount();
        $this->CLI_debug("Unprocessed Items remaining:" . $unprocessedItems . " (" . $this->CLI_buildProcessId() . ")");
        $result |= ($unprocessedItems > 0 ? self::CLI_STATUS_REMAIN : self::CLI_STATUS_NOTHING_PROCCESSED);

        return $result;
    }
2066
2067
    /**
     * Function executed by crawler_im.php cli script.
     *
     * Collects the URLs below the page given on the command line and, depending
     * on the "-o" option, lists them ("url"), puts them into the queue ("queue"),
     * executes them right away ("exec") or only reports what was found.
     *
     * @return void
     */
    public function CLI_main_im()
    {
        $this->setAccessMode('cli_im');

        $cliObj = GeneralUtility::makeInstance(QueueCommandLineController::class);

        // Force user to admin state and set workspace to "Live":
        $this->backendUser->user['admin'] = 1;
        $this->backendUser->setWorkspace(0);

        // Without a page argument there is nothing to do: print help and stop.
        if (!isset($cliObj->cli_args['_DEFAULT'][1])) {
            $cliObj->cli_validateArgs();
            $cliObj->cli_help();
            exit;
        }

        $cliObj->cli_validateArgs();

        $outputMode = $cliObj->cli_argValue('-o');
        $submitsToQueue = ($outputMode === 'queue' || $outputMode === 'exec');

        if ($outputMode === 'exec') {
            $this->registerQueueEntriesInternallyOnly = true;
        }

        // Called over the TYPO3 BE the page id is the third argument, called over cli the second.
        $pageArgumentIndex = isset($cliObj->cli_args['_DEFAULT'][2]) ? 2 : 1;
        $pageId = MathUtility::forceIntegerInRange($cliObj->cli_args['_DEFAULT'][$pageArgumentIndex], 0);

        $configurationKeys = $this->getConfigurationKeys($cliObj);

        // NOTE(review): getConfigurationKeys() returns an array in every branch, so this
        // fallback to all configurations of the page looks unreachable - confirm intent.
        if (!is_array($configurationKeys)) {
            $configurations = $this->getUrlsForPageId($pageId);
            $configurationKeys = is_array($configurations) ? array_keys($configurations) : [];
        }

        if ($submitsToQueue) {
            $reason = new Reason();
            $reason->setReason(Reason::REASON_GUI_SUBMIT);
            $reason->setDetailText('The cli script of the crawler added to the queue');
            // NOTE(review): the event carries the setID from before the new one is generated below.
            EventDispatcher::getInstance()->post(
                'invokeQueueChange',
                $this->setID,
                ['reason' => $reason]
            );
        }

        if ($this->extensionSettings['cleanUpOldQueueEntries']) {
            $this->cleanUpOldQueueEntries();
        }

        $this->setID = (int) GeneralUtility::md5int(microtime());
        $this->getPageTreeAndUrls(
            $pageId,
            MathUtility::forceIntegerInRange($cliObj->cli_argValue('-d'), 0, 99),
            $this->getCurrentTime(),
            MathUtility::forceIntegerInRange($cliObj->cli_isArg('-n') ? $cliObj->cli_argValue('-n') : 30, 1, 1000),
            $submitsToQueue,
            $outputMode === 'url',
            GeneralUtility::trimExplode(',', $cliObj->cli_argValue('-proc'), true),
            $configurationKeys
        );

        if ($outputMode === 'url') {
            $cliObj->cli_echo(implode(chr(10), $this->downloadUrls) . chr(10), true);
            return;
        }

        if ($outputMode === 'queue') {
            $cliObj->cli_echo("Putting " . count($this->urlList) . " entries in queue:\n\n");
            $cliObj->cli_echo(implode(chr(10), $this->urlList) . chr(10));
            return;
        }

        if ($outputMode !== 'exec') {
            $cliObj->cli_echo(count($this->urlList) . " entries found for processing. (Use -o to decide action):\n\n", true);
            $cliObj->cli_echo(implode(chr(10), $this->urlList) . chr(10), true);
            return;
        }

        // "exec": request every registered entry immediately and report the outcome per URL.
        $cliObj->cli_echo("Executing " . count($this->urlList) . " requests right away:\n\n");
        $cliObj->cli_echo(implode(chr(10), $this->urlList) . chr(10));
        $cliObj->cli_echo("\nProcessing:\n");

        $logSeparator = chr(10) . chr(9) . chr(9);
        foreach ($this->queueEntries as $queueRec) {
            $parameters = unserialize($queueRec['parameters']);
            $cliObj->cli_echo($parameters['url'] . ' (' . implode(',', $parameters['procInstructions']) . ') => ');

            $result = $this->readUrlFromArray($queueRec);

            $requestResult = unserialize($result['content']);
            if (!is_array($requestResult)) {
                $cliObj->cli_echo('Error checking Crawler Result: ' . substr(preg_replace('/\s+/', ' ', strip_tags($result['content'])), 0, 30000) . '...' . chr(10));
                continue;
            }

            $resLog = '';
            if (is_array($requestResult['log'])) {
                $resLog = $logSeparator . implode($logSeparator, $requestResult['log']);
            }
            $cliObj->cli_echo('OK: ' . $resLog . chr(10));
        }
    }
2170
2171
    /**
     * Entry point of the cli "flush" mode.
     *
     * Expects a page id as first argument and "-o" set to "all", "finished" or
     * "pending"; any other mode only prints the help text. A page id of 0
     * requests a full flush.
     *
     * @return bool false if the mode is unknown or getLogEntriesForPageId() returned false
     */
    public function CLI_main_flush()
    {
        $this->setAccessMode('cli_flush');
        $cliObj = GeneralUtility::makeInstance(FlushCommandLineController::class);

        // Force user to admin state and set workspace to "Live":
        $this->backendUser->user['admin'] = 1;
        $this->backendUser->setWorkspace(0);

        // No page argument given: print help and stop.
        if (!isset($cliObj->cli_args['_DEFAULT'][1])) {
            $cliObj->cli_validateArgs();
            $cliObj->cli_help();
            exit;
        }

        $cliObj->cli_validateArgs();
        $pageId = MathUtility::forceIntegerInRange($cliObj->cli_args['_DEFAULT'][1], 0);
        $fullFlush = ($pageId == 0);

        $mode = $cliObj->cli_argValue('-o');

        // Translate the cli mode into the filter understood by getLogEntriesForPageId().
        switch ($mode) {
            case 'all':
                $filter = '';
                break;
            case 'finished':
            case 'pending':
                $filter = $mode;
                break;
            default:
                $cliObj->cli_validateArgs();
                $cliObj->cli_help();
                return false;
        }

        return $this->getLogEntriesForPageId($pageId, $filter, true, $fullFlush) !== false;
    }
2214
2215
    /**
     * Reads the comma separated "-conf" option from the CLI arguments.
     *
     * @param  QueueCommandLineController $cliObj    Command line object
     * @return array                        List of configuration keys, empty if "-conf" is missing or blank
     */
    protected function getConfigurationKeys(QueueCommandLineController &$cliObj)
    {
        $parameter = trim($cliObj->cli_argValue('-conf'));
        if ($parameter == '') {
            return [];
        }

        return GeneralUtility::trimExplode(',', $parameter);
    }
2226
2227
    /**
     * Running the functionality of the CLI (crawling URLs from queue)
     *
     * Runs the registered cli hooks, optionally purges old executed entries,
     * reserves up to $countInARun due queue entries for the current process
     * and requests them one after another.
     *
     * @param int $countInARun      maximum number of queue entries to pick up
     * @param int $sleepTime        pause between two requests, passed to usleep()
     * @param int $sleepAfterFinish pause in seconds after the batch is done
     * @return int combination of CLI_STATUS_* flags and the values returned by readUrl()
     */
    public function CLI_run($countInARun, $sleepTime, $sleepAfterFinish)
    {
        $result = 0;
        $counter = 0;

        // First, run hooks:
        $this->CLI_runHooks();

        // Clean up the queue
        $purgeQueueDays = intval($this->extensionSettings['purgeQueueDays']);
        if ($purgeQueueDays > 0) {
            $purgeDate = $this->getCurrentTime() - 24 * 60 * 60 * $purgeQueueDays;
            $del = $this->db->exec_DELETEquery(
                'tx_crawler_queue',
                'exec_time!=0 AND exec_time<' . $purgeDate
            );
            if (!$del) {
                GeneralUtility::devLog('Records could not be deleted.', 'crawler', LogLevel::INFO);
            }
        }

        // Select entries:
        //TODO Shouldn't this reside within the transaction?
        $rows = $this->db->exec_SELECTgetRows(
            'qid,scheduled',
            'tx_crawler_queue',
            'exec_time=0 AND process_scheduled=0 AND scheduled<=' . $this->getCurrentTime(),
            '',
            'scheduled, qid',
            intval($countInARun)
        );

        // The select may not return an array (e.g. on a failed query); treat that like an empty queue
        // instead of counting / iterating a non-array.
        if (!is_array($rows) || count($rows) === 0) {
            $this->CLI_debug("Nothing within queue which needs to be processed (" . $this->CLI_buildProcessId() . ")");
            return $result;
        }

        // Cast the ids because they are interpolated into the IN (...) clause below.
        $quidList = [];
        foreach ($rows as $r) {
            $quidList[] = intval($r['qid']);
        }

        $processId = $this->CLI_buildProcessId();

        //reserve queue entries for process
        $this->db->sql_query('BEGIN');
        //TODO make sure we're not taking assigned queue-entires
        $this->db->exec_UPDATEquery(
            'tx_crawler_queue',
            'qid IN (' . implode(',', $quidList) . ')',
            [
                'process_scheduled' => intval($this->getCurrentTime()),
                'process_id' => $processId
            ]
        );

        //save the number of assigned queue entries to determine how many have been processed later
        $numberOfAffectedRows = $this->db->sql_affected_rows();
        $this->db->exec_UPDATEquery(
            'tx_crawler_process',
            "process_id = '" . $processId . "'",
            [
                'assigned_items_count' => intval($numberOfAffectedRows)
            ]
        );

        // Fewer updated rows than selected ids means the reservation did not cover the whole batch.
        if ($numberOfAffectedRows != count($quidList)) {
            $this->db->sql_query('ROLLBACK');
            $this->CLI_debug("Nothing processed due to multi-process collision (" . $this->CLI_buildProcessId() . ")");
            return ($result | self::CLI_STATUS_ABORTED);
        }
        $this->db->sql_query('COMMIT');

        foreach ($rows as $r) {
            $result |= $this->readUrl($r['qid']);

            $counter++;
            usleep(intval($sleepTime)); // Just to relax the system

            // if during the start and the current read url the cli has been disable we need to return from the function
            // mark the process NOT as ended.
            if ($this->getDisabled()) {
                return ($result | self::CLI_STATUS_ABORTED);
            }

            if (!$this->CLI_checkIfProcessIsActive($this->CLI_buildProcessId())) {
                $this->CLI_debug("conflict / timeout (" . $this->CLI_buildProcessId() . ")");

                //TODO might need an additional returncode
                $result |= self::CLI_STATUS_ABORTED;
                break; //possible timeout
            }
        }

        sleep(intval($sleepAfterFinish));

        $msg = 'Rows: ' . $counter;
        $this->CLI_debug($msg . " (" . $this->CLI_buildProcessId() . ")");

        if ($counter > 0) {
            $result |= self::CLI_STATUS_PROCESSED;
        }

        return $result;
    }
2342
2343
    /**
     * Activate hooks
     *
     * Instantiates every object reference registered under
     * $GLOBALS['TYPO3_CONF_VARS']['EXTCONF']['crawler']['cli_hooks'] and calls
     * its crawler_init() method with this controller.
     *
     * @return void
     */
    public function CLI_runHooks()
    {
        // Nothing registered (or not in the expected shape): nothing to run.
        if (!isset($GLOBALS['TYPO3_CONF_VARS']['EXTCONF']['crawler']['cli_hooks'])
            || !is_array($GLOBALS['TYPO3_CONF_VARS']['EXTCONF']['crawler']['cli_hooks'])
        ) {
            return;
        }

        foreach ($GLOBALS['TYPO3_CONF_VARS']['EXTCONF']['crawler']['cli_hooks'] as $objRef) {
            // Plain assignment: objects are handles, and assigning a function result
            // by reference raises a notice unless the function returns by reference.
            $hookObj = GeneralUtility::getUserObj($objRef);
            if (is_object($hookObj)) {
                $hookObj->crawler_init($this);
            }
        }
    }
2360
2361
    /**
     * Try to acquire a new process with the given id
     * also performs some auto-cleanup for orphan processes
     * @todo preemption might not be the most elegant way to clean up
     *
     * Active processes whose ttl has passed are treated as orphans and released;
     * the remaining ones count against the configured processLimit.
     *
     * @param string $id identification string for the process
     * @return boolean true if a process record was created, false otherwise
     */
    public function CLI_checkAndAcquireNewProcess($id)
    {
        $systemProcessId = getmypid();
        if ($systemProcessId < 1) {
            return false;
        }

        $this->db->sql_query('BEGIN');

        $res = $this->db->exec_SELECTquery(
            'process_id,ttl',
            'tx_crawler_process',
            'active=1 AND deleted=0'
        );

        $currentTime = $this->getCurrentTime();

        // Split the active processes into expired ones (orphans) and those still running.
        $runningProcesses = 0;
        $expiredProcessIds = [];
        while ($row = $this->db->sql_fetch_assoc($res)) {
            if ($row['ttl'] < $currentTime) {
                $expiredProcessIds[] = $row['process_id'];
                continue;
            }
            $runningProcesses++;
        }

        // if there are less than allowed active processes then add a new one
        $slotAvailable = $runningProcesses < intval($this->extensionSettings['processLimit']);
        if ($slotAvailable) {
            $this->CLI_debug("add process " . $this->CLI_buildProcessId() . " (" . ($runningProcesses + 1) . "/" . intval($this->extensionSettings['processLimit']) . ")");

            // create new process record
            $this->db->exec_INSERTquery(
                'tx_crawler_process',
                [
                    'process_id' => $id,
                    'active' => '1',
                    'ttl' => ($currentTime + intval($this->extensionSettings['processMaxRunTime'])),
                    'system_process_id' => $systemProcessId
                ]
            );
        } else {
            $this->CLI_debug("Processlimit reached (" . ($runningProcesses) . "/" . intval($this->extensionSettings['processLimit']) . ")");
        }

        $this->CLI_releaseProcesses($expiredProcessIds, true); // maybe this should be somehow included into the current lock
        $this->CLI_deleteProcessesMarkedDeleted();

        $this->db->sql_query('COMMIT');

        return $slotAvailable;
    }
2425
2426
    /**
     * Release a process and the required resources
     *
     * Besides deactivating the requested processes and freeing their pending
     * queue entries, this also frees the queue entries of processes that are
     * already inactive and flags inactive processes without pending entries
     * as deleted.
     *
     * @param  mixed    $releaseIds   string with a single process-id or array with multiple process-ids
     * @param  boolean  $withinLock   show whether the DB-actions are included within an existing lock
     * @return boolean  false if there was nothing to release, true otherwise
     */
    public function CLI_releaseProcesses($releaseIds, $withinLock = false)
    {
        if (!is_array($releaseIds)) {
            $releaseIds = [$releaseIds];
        }

        // The former "!count($releaseIds) > 0" negated the count before comparing and
        // only worked by accident of operator precedence; state the intent directly.
        if (count($releaseIds) === 0) {
            return false;   //nothing to release
        }

        // One consistently single-quoted id list for both statements below.
        $quotedReleaseIds = '\'' . implode('\',\'', $releaseIds) . '\'';

        if (!$withinLock) {
            $this->db->sql_query('BEGIN');
        }

        // some kind of 2nd chance algo - this way you need at least 2 processes to have a real cleanup
        // this ensures that a single process can't mess up the entire process table

        // free the queue entries that are still assigned to processes which are not active
        $this->db->exec_UPDATEquery(
            'tx_crawler_queue',
            'process_id IN (SELECT process_id FROM tx_crawler_process WHERE active=0 AND deleted=0)',
            [
                'process_scheduled' => 0,
                'process_id' => ''
            ]
        );
        // mark all processes as deleted which have no "waiting" queue-entries and which are not active
        $this->db->exec_UPDATEquery(
            'tx_crawler_process',
            'active=0 AND deleted=0
            AND NOT EXISTS (
                SELECT * FROM tx_crawler_queue
                WHERE tx_crawler_queue.process_id = tx_crawler_process.process_id
                AND tx_crawler_queue.exec_time = 0
            )',
            [
                'deleted' => '1',
                'system_process_id' => 0
            ]
        );
        // mark all requested processes as non-active
        $this->db->exec_UPDATEquery(
            'tx_crawler_process',
            'process_id IN (' . $quotedReleaseIds . ') AND deleted=0',
            [
                'active' => '0'
            ]
        );
        // hand their not yet executed queue entries back to the pool
        $this->db->exec_UPDATEquery(
            'tx_crawler_queue',
            'exec_time=0 AND process_id IN (' . $quotedReleaseIds . ')',
            [
                'process_scheduled' => 0,
                'process_id' => ''
            ]
        );

        if (!$withinLock) {
            $this->db->sql_query('COMMIT');
        }

        return true;
    }
2495
2496
    /**
     * Removes every row from tx_crawler_process that is flagged as deleted.
     *
     * @return void
     */
    public function CLI_deleteProcessesMarkedDeleted()
    {
        $table = 'tx_crawler_process';
        $where = 'deleted = 1';

        $this->db->exec_DELETEquery($table, $where);
    }
2505
2506
    /**
     * Check if there are still resources left for the process with the given id
     * Used to determine timeouts and to ensure a proper cleanup if there's a timeout
     *
     * Only a single SELECT is issued, so it is not wrapped in a transaction.
     *
     * @param  string  $pid identification string for the process
     * @return boolean determines if the process is still active / has resources
     */
    public function CLI_checkIfProcessIsActive($pid)
    {
        $res = $this->db->exec_SELECTquery(
            'process_id,active,ttl',
            'tx_crawler_process',
            'process_id = \'' . $pid . '\'  AND deleted=0',
            '',
            'ttl',
            '0,1'
        );
        $row = $this->db->sql_fetch_assoc($res);

        // No (non-deleted) record for the id means the process is not active.
        return is_array($row) && intval($row['active']) === 1;
    }
2534
2535
    /**
     * Returns the id of the current process, generating it on first use from
     * a short md5 of the current microtime.
     *
     * @return string  the ID
     */
    public function CLI_buildProcessId()
    {
        if ($this->processID) {
            return $this->processID;
        }

        $this->processID = GeneralUtility::shortMD5($this->microtime(true));

        return $this->processID;
    }
2547
2548
    /**
     * Thin wrapper around PHP's microtime().
     *
     * @param bool $get_as_float forwarded unchanged to microtime()
     *
     * @return mixed whatever microtime() returns for the given flag
     */
    protected function microtime($get_as_float = false)
    {
        $currentMicrotime = microtime($get_as_float);

        return $currentMicrotime;
    }
2557
2558
    /**
     * Echoes a message followed by a newline and flushes the output, but only
     * if the "processDebug" extension setting is enabled.
     *
     * @param  string $msg  the message
     */
    public function CLI_debug($msg)
    {
        if (!intval($this->extensionSettings['processDebug'])) {
            return;
        }

        echo $msg . "\n";
        flush();
    }
2570
2571
    /**
     * Get URL content by making direct request to TYPO3.
     *
     * Builds a shell command that runs the crawler's cli/bootstrap.php with the
     * configured PHP binary, hands it the frontend base path, the URL and the
     * encoded request headers, and returns the command output as content.
     *
     * @param  string $url          Page URL
     * @param  int    $crawlerId    Crawler-ID
     * @return array  empty if the URL cannot be parsed, otherwise the keys "request", "headers" and "content"
     */
    protected function sendDirectRequest($url, $crawlerId)
    {
        $parsedUrl = parse_url($url);
        if (!is_array($parsedUrl)) {
            return [];
        }

        $requestHeaders = $this->buildRequestHeaderArray($parsedUrl, $crawlerId);

        // Binary first, then one escaped argument each, joined by single spaces.
        $commandParts = [
            escapeshellcmd($this->extensionSettings['phpPath']),
            escapeshellarg(ExtensionManagementUtility::extPath('crawler') . 'cli/bootstrap.php'),
            escapeshellarg($this->getFrontendBasePath()),
            escapeshellarg($url),
            escapeshellarg(base64_encode(serialize($requestHeaders)))
        ];
        $cmd = implode(' ', $commandParts);

        // Log how long the request took, next to its URL.
        $startTime = microtime(true);
        $content = $this->executeShellCommand($cmd);
        $this->log($url . ' ' . (microtime(true) - $startTime));

        return [
            'request' => implode("\r\n", $requestHeaders) . "\r\n\r\n",
            'headers' => '',
            'content' => $content
        ];
    }
2609
2610
    /**
     * Cleans up entries that stayed for too long in the queue. These are:
     * - processed entries older than the "cleanUpProcessedAge" setting (in days)
     * - entries scheduled longer ago than the "cleanUpScheduledAge" setting (in days)
     *
     * @return void
     */
    protected function cleanUpOldQueueEntries()
    {
        $secondsPerDay = 86400; // 24*60*60 Seconds in 24 hours
        $now = time();

        $processedThreshold = $now - $this->extensionSettings['cleanUpProcessedAge'] * $secondsPerDay;
        $scheduledThreshold = $now - $this->extensionSettings['cleanUpScheduledAge'] * $secondsPerDay;

        $this->flushQueue(
            '(exec_time<>0 AND exec_time<' . $processedThreshold . ') OR scheduled<=' . $scheduledThreshold
        );
    }
2626
2627
    /**
     * Initializes a TypoScript Frontend necessary for using TypoScript and TypoLink functions
     *
     * Creates the frontend controller for the given page and type, registers it
     * as $GLOBALS['TSFE'] and runs its initialisation steps in a fixed order.
     *
     * @param int $id
     * @param int $typeNum
     *
     * @return void
     */
    protected function initTSFE($id = 1, $typeNum = 0)
    {
        EidUtility::initTCA();

        // Make sure a time tracker exists before the frontend controller is set up.
        if (!is_object($GLOBALS['TT'])) {
            $GLOBALS['TT'] = new NullTimeTracker();
            $GLOBALS['TT']->start();
        }

        $tsfe = GeneralUtility::makeInstance(TypoScriptFrontendController::class, $GLOBALS['TYPO3_CONF_VARS'], $id, $typeNum);
        // Register globally right away; $tsfe refers to this same object from here on.
        $GLOBALS['TSFE'] = $tsfe;

        $tsfe->sys_page = GeneralUtility::makeInstance(PageRepository::class);
        $tsfe->sys_page->init(true);
        $tsfe->connectToDB();
        $tsfe->initFEuser();
        $tsfe->determineId();
        $tsfe->initTemplate();
        $tsfe->rootLine = $tsfe->sys_page->getRootLine($id, '');
        $tsfe->getConfigArray();

        PageGenerator::pagegenInit();
    }
2654
2655
    /**
     * Builds the md5 fingerprint of a configuration array.
     *
     * The "paramExpanded" and "URLs" entries are left out before the array is serialized,
     * so they have no influence on the resulting hash.
     *
     * @param array $configuration
     *
     * @return string
     */
    protected function getConfigurationHash(array $configuration)
    {
        $ignoredKeys = array_flip(['paramExpanded', 'URLs']);
        $hashRelevantConfiguration = array_diff_key($configuration, $ignoredKeys);

        return md5(serialize($hashRelevantConfiguration));
    }
2667
2668
    /**
     * Check whether the Crawling Protocol should be http or https
     *
     * The crawler configuration value is compared loosely (switch semantics) against:
     *  -1 => returns FALSE
     *   0 => returns the page configuration, cast to bool
     *   1 => returns TRUE
     * Any other value returns FALSE.
     *
     * @param mixed $crawlerConfiguration Protocol setting of the crawler configuration (-1, 0 or 1)
     * @param mixed $pageConfiguration Protocol setting of the page, only used if $crawlerConfiguration is 0
     *
     * @return bool TRUE if https should be used, FALSE otherwise
     */
    protected function isCrawlingProtocolHttps($crawlerConfiguration, $pageConfiguration)
    {
        // The case order is significant because of the loose comparison and must not be changed.
        switch ($crawlerConfiguration) {
            case -1:
                return false;
            case 0:
                // Cast so that the method really returns a bool, as documented.
                return (bool) $pageConfiguration;
            case 1:
                return true;
            default:
                return false;
        }
    }
2692
}
2693