Completed
Push — issue/92 ( 1ce8c6 )
by Tomas Norre
07:51
created

CrawlerController::getUrlsForPageId()   D

Complexity

Conditions 27
Paths 84

Size

Total Lines 138
Code Lines 72

Duplication

Lines 0
Ratio 0 %

Code Coverage

Tests 64
CRAP Score 27.2778

Importance

Changes 0
Metric Value
cc 27
eloc 72
nc 84
nop 2
dl 0
loc 138
ccs 64
cts 69
cp 0.9275
crap 27.2778
rs 4.509
c 0
b 0
f 0

How to fix   Long Method    Complexity   

Long Method

Small methods make your code easier to understand, in particular if combined with a good name. Besides, if your method is small, finding a good name is usually much easier.

For example, if you find yourself adding comments to a method's body, this is usually a good sign to extract the commented part to a new method, and use the comment as a starting point when coming up with a good name for this new method.

Commonly applied refactorings include:

1
<?php
2
namespace AOE\Crawler\Controller;
3
4
/***************************************************************
5
 *  Copyright notice
6
 *
7
 *  (c) 2017 AOE GmbH <[email protected]>
8
 *
9
 *  All rights reserved
10
 *
11
 *  This script is part of the TYPO3 project. The TYPO3 project is
12
 *  free software; you can redistribute it and/or modify
13
 *  it under the terms of the GNU General Public License as published by
14
 *  the Free Software Foundation; either version 3 of the License, or
15
 *  (at your option) any later version.
16
 *
17
 *  The GNU General Public License can be found at
18
 *  http://www.gnu.org/copyleft/gpl.html.
19
 *
20
 *  This script is distributed in the hope that it will be useful,
21
 *  but WITHOUT ANY WARRANTY; without even the implied warranty of
22
 *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
23
 *  GNU General Public License for more details.
24
 *
25
 *  This copyright notice MUST APPEAR in all copies of the script!
26
 ***************************************************************/
27
28
use AOE\Crawler\Command\CrawlerCommandLineController;
29
use AOE\Crawler\Command\FlushCommandLineController;
30
use AOE\Crawler\Command\QueueCommandLineController;
31
use AOE\Crawler\Domain\Model\Reason;
32
use AOE\Crawler\Event\EventDispatcher;
33
use AOE\Crawler\Utility\IconUtility;
34
use AOE\Crawler\Utility\SignalSlotUtility;
35
use TYPO3\CMS\Backend\Utility\BackendUtility;
36
use TYPO3\CMS\Backend\Tree\View\PageTreeView;
37
use TYPO3\CMS\Core\Authentication\BackendUserAuthentication;
38
use TYPO3\CMS\Core\Database\DatabaseConnection;
39
use TYPO3\CMS\Core\Log\LogLevel;
40
use TYPO3\CMS\Core\TimeTracker\NullTimeTracker;
41
use TYPO3\CMS\Core\Utility\DebugUtility;
42
use TYPO3\CMS\Core\Utility\ExtensionManagementUtility;
43
use TYPO3\CMS\Core\Utility\GeneralUtility;
44
use TYPO3\CMS\Core\Utility\MathUtility;
45
use TYPO3\CMS\Frontend\Controller\TypoScriptFrontendController;
46
use TYPO3\CMS\Frontend\Page\PageGenerator;
47
use TYPO3\CMS\Frontend\Page\PageRepository;
48
use TYPO3\CMS\Frontend\Utility\EidUtility;
49
50
/**
51
 * Class CrawlerController
52
 *
53
 * @package AOE\Crawler\Controller
54
 */
55
class CrawlerController
56
{
57
    const CLI_STATUS_NOTHING_PROCCESSED = 0;
58
    const CLI_STATUS_REMAIN = 1; //queue not empty
59
    const CLI_STATUS_PROCESSED = 2; //(some) queue items where processed
60
    const CLI_STATUS_ABORTED = 4; //instance didn't finish
61
    const CLI_STATUS_POLLABLE_PROCESSED = 8;
62
63
    /**
64
     * @var integer
65
     */
66
    public $setID = 0;
67
68
    /**
69
     * @var string
70
     */
71
    public $processID = '';
72
73
    /**
74
     * One hour is max stalled time for the CLI
75
     * If the process had the status "start" for 3600 seconds, it will be regarded stalled and a new process is started
76
     *
77
     * @var integer
78
     */
79
    public $max_CLI_exec_time = 3600;
80
81
    /**
82
     * @var array
83
     */
84
    public $duplicateTrack = [];
85
86
    /**
87
     * @var array
88
     */
89
    public $downloadUrls = [];
90
91
    /**
92
     * @var array
93
     */
94
    public $incomingProcInstructions = [];
95
96
    /**
97
     * @var array
98
     */
99
    public $incomingConfigurationSelection = [];
100
101
    /**
102
     * @var bool
103
     */
104
    public $registerQueueEntriesInternallyOnly = false;
105
106
    /**
107
     * @var array
108
     */
109
    public $queueEntries = [];
110
111
    /**
112
     * @var array
113
     */
114
    public $urlList = [];
115
116
    /**
117
     * @var boolean
118
     */
119
    public $debugMode = false;
120
121
    /**
122
     * @var array
123
     */
124
    public $extensionSettings = [];
125
126
    /**
127
     * Mount Point
128
     *
129
     * @var boolean
130
     */
131
    public $MP = false;
132
133
    /**
134
     * @var string
135
     */
136
    protected $processFilename;
137
138
    /**
139
     * Holds the internal access mode can be 'gui','cli' or 'cli_im'
140
     *
141
     * @var string
142
     */
143
    protected $accessMode;
144
145
    /**
146
     * @var DatabaseConnection
147
     */
148
    private $db;
149
150
    /**
151
     * @var BackendUserAuthentication
152
     */
153
    private $backendUser;
154
155
    /**
156
     * @var integer
157
     */
158
    private $scheduledTime = 0;
159
160
    /**
161
     * @var integer
162
     */
163
    private $reqMinute = 0;
164
165
    /**
166
     * @var bool
167
     */
168
    private $submitCrawlUrls = false;
169
170
    /**
171
     * @var bool
172
     */
173
    private $downloadCrawlUrls = false;
174
175
    /**
176
     * Method to get the accessMode; can be gui, cli or cli_im
177
     *
178
     * @return string
179
     */
180 1
    public function getAccessMode()
    {
        // Per the property documentation this holds 'gui', 'cli' or 'cli_im'.
        $currentAccessMode = $this->accessMode;

        return $currentAccessMode;
    }
184
185
    /**
186
     * @param string $accessMode
187
     */
188 1
    public function setAccessMode($accessMode)
    {
        // The value is stored as given; it is not validated against 'gui', 'cli' or 'cli_im'.
        $this->accessMode = $accessMode;
    }
192
193
    /**
194
     * Set disabled status to prevent processes from being processed
195
     *
196
     * @param  bool $disabled (optional, defaults to true)
197
     * @return void
198
     */
199 3
    public function setDisabled($disabled = true)
    {
        if (!$disabled) {
            // Enabling: remove the marker file if there is one.
            if (is_file($this->processFilename)) {
                unlink($this->processFilename);
            }

            return;
        }

        // Disabling: write an empty marker file; getDisabled() reports true while it exists.
        GeneralUtility::writeFile($this->processFilename, '');
    }
209
210
    /**
211
     * Get disable status
212
     *
213
     * @return bool true if disabled
214
     */
215 3
    public function getDisabled()
    {
        // The disabled state is represented purely by the existence of the process file.
        return is_file($this->processFilename);
    }
223
224
    /**
225
     * @param string $filenameWithPath
226
     *
227
     * @return void
228
     */
229 4
    public function setProcessFilename($filenameWithPath)
    {
        // Path of the marker file used by setDisabled() and getDisabled().
        $this->processFilename = $filenameWithPath;
    }
233
234
    /**
235
     * @return string
236
     */
237 1
    public function getProcessFilename()
    {
        // Path of the marker file whose existence signals the disabled state.
        $markerFile = $this->processFilename;

        return $markerFile;
    }
241
242
    /************************************
243
     *
244
     * Getting URLs based on Page TSconfig
245
     *
246
     ************************************/
247
248 55
    public function __construct()
    {
        $this->db = $GLOBALS['TYPO3_DB'];
        $this->backendUser = $GLOBALS['BE_USER'];
        $this->processFilename = PATH_site . 'typo3temp/tx_crawler.proc';

        // The extension configuration is stored serialized; anything that does not
        // unserialize to an array is treated as "no settings".
        $storedSettings = unserialize($GLOBALS['TYPO3_CONF_VARS']['EXT']['extConf']['crawler']);
        if (!is_array($storedSettings)) {
            $storedSettings = [];
        }
        $this->setExtensionSettings($storedSettings);

        // Defaults: fall back to 100 entries per run when the converted value is 0.
        $countInARun = MathUtility::convertToPositiveInteger($this->extensionSettings['countInARun']);
        if ($countInARun == 0) {
            $this->extensionSettings['countInARun'] = 100;
        }

        // The process limit is forced into the range 1..99, defaulting to 1.
        $this->extensionSettings['processLimit'] = MathUtility::forceIntegerInRange(
            $this->extensionSettings['processLimit'],
            1,
            99,
            1
        );
    }
267
268
    /**
269
     * Sets the extensions settings (unserialized pendant of $TYPO3_CONF_VARS['EXT']['extConf']['crawler']).
270
     *
271
     * @param array $extensionSettings
272
     * @return void
273
     */
274 63
    public function setExtensionSettings(array $extensionSettings)
    {
        // Replaces the current settings wholesale; nothing is merged with earlier values.
        $this->extensionSettings = $extensionSettings;
    }
278
279
    /**
280
     * Check if the given page should be crawled
281
     *
282
     * @param array $pageRow
283
     * @return false|string false if the page should be crawled (not excluded), the skip message if it should be skipped
284
     */
285 10
    public function checkIfPageShouldBeSkipped(array $pageRow)
    {
        // Hidden pages are only crawled when the "crawlHiddenPages" setting is enabled.
        if (!$this->extensionSettings['crawlHiddenPages'] && $pageRow['hidden']) {
            return 'Because page is hidden';
        }

        // Doktypes 3 and 4 as well as everything from 199 upwards are never crawled.
        if (GeneralUtility::inList('3,4', $pageRow['doktype']) || $pageRow['doktype'] >= 199) {
            return 'Because doktype is not allowed';
        }

        // Additional doktype exclusions registered by other extensions; the first match wins.
        if (is_array($GLOBALS['TYPO3_CONF_VARS']['EXTCONF']['crawler']['excludeDoktype'])) {
            foreach ($GLOBALS['TYPO3_CONF_VARS']['EXTCONF']['crawler']['excludeDoktype'] as $key => $doktypeList) {
                if (GeneralUtility::inList($doktypeList, $pageRow['doktype'])) {
                    return 'Doktype was excluded by "' . $key . '"';
                }
            }
        }

        // Veto hooks: each hook is expected to return false if the page is ok, and true
        // or a skip message if the page should not be crawled. The first veto ends the check.
        if (is_array($GLOBALS['TYPO3_CONF_VARS']['EXTCONF']['crawler']['pageVeto'])) {
            foreach ($GLOBALS['TYPO3_CONF_VARS']['EXTCONF']['crawler']['pageVeto'] as $key => $func) {
                $params = [
                    'pageRow' => $pageRow
                ];
                $veto = GeneralUtility::callUserFunction($func, $params, $this);
                if ($veto === false) {
                    continue;
                }

                return is_string($veto) ? $veto : 'Veto from hook "' . htmlspecialchars($key) . '"';
            }
        }

        return false;
    }
342
343
    /**
344
     * Wrapper method for getUrlsForPageId()
345
     * It returns an array of configurations and no urls!
346
     *
347
     * @param array $pageRow Page record with at least dok-type and uid columns.
348
     * @param string $skipMessage
349
     * @return array
350
     * @see getUrlsForPageId()
351
     */
352 6
    public function getUrlsForPageRow(array $pageRow, &$skipMessage = '')
    {
        // false means "crawl this page"; any other value is the reason for skipping it.
        $message = $this->checkIfPageShouldBeSkipped($pageRow);

        if ($message !== false) {
            $skipMessage = $message;

            return [];
        }

        // The value is cast before the comparison because rows commonly carry column
        // values as strings, in which case a strict comparison against the integer 2
        // never matches. A row without the column is treated as "do not force SSL".
        $forceSsl = isset($pageRow['url_scheme']) && (int)$pageRow['url_scheme'] === 2;

        $res = $this->getUrlsForPageId($pageRow['uid'], $forceSsl);
        $skipMessage = '';

        return $res;
    }
367
368
    /**
369
     * This method is used to count if there are ANY unprocessed queue entries
370
     * of a given page_id and the configuration which matches a given hash.
371
     * If there are none, we can skip an inner detail check
372
     *
373
     * @param  int $uid
374
     * @param  string $configurationHash
375
     * @return boolean
376
     */
377 7
    protected function noUnprocessedQueueEntriesForPageWithConfigurationHashExist($uid, $configurationHash)
    {
        $queueTable = 'tx_crawler_queue';
        $quotedHash = $this->db->fullQuoteStr($configurationHash, $queueTable);

        // Only entries with exec_time=0 are counted, i.e. the unprocessed ones.
        $where = 'page_id=' . intval($uid) . ' AND configuration_hash=' . $quotedHash . ' AND exec_time=0';
        $result = $this->db->exec_SELECTquery('count(*) as anz', $queueTable, $where);
        $countRow = $this->db->sql_fetch_assoc($result);

        return $countRow['anz'] == 0;
    }
385
386
    /**
387
     * Creates a list of URLs from input array (and submits them to queue if asked for)
388
     * See Web > Info module script + "indexed_search"'s crawler hook-client using this!
389
     *
390
     * @param    array        Information about URLs from pageRow to crawl.
391
     * @param    array        Page row
392
     * @param    integer        Unix time to schedule indexing to, typically time()
393
     * @param    integer        Number of requests per minute (creates the interleave between requests)
394
     * @param    boolean        If set, submits the URLs to queue
395
     * @param    boolean        If set (and $submitCrawlUrls is false), will fill $downloadUrls with entries
396
     * @param    array        Array which is passed by reference and contains a key per URL to ensure we will not crawl duplicates
397
     * @param    array        Array which will be filled with URLS for download if flag is set.
398
     * @param    array        Array of processing instructions
399
     * @return    string        List of URLs (meant for display in backend module)
400
     *
401
     */
402 4
    public function urlListFromUrlArray(
        array $vv,
        array $pageRow,
        $scheduledTime,
        $reqMinute,
        $submitCrawlUrls,
        $downloadCrawlUrls,
        array &$duplicateTrack,
        array &$downloadUrls,
        array $incomingProcInstructions
    ) {
        $urlList = '';

        // The realurl decision depends only on the loaded extensions and on $vv, so it
        // is evaluated once. $urlObj is initialised up front so that it is defined on
        // every execution path and not only inside the conditional below.
        $useRealUrl = ExtensionManagementUtility::isLoaded('realurl') && $vv['subCfg']['realurl'];
        $urlObj = null;

        // realurl support (thanks to Ingo Renner)
        if ($useRealUrl) {
            /** @var tx_realurl $urlObj */
            $urlObj = GeneralUtility::makeInstance('tx_realurl');

            if (!empty($vv['subCfg']['baseUrl'])) {
                $urlParts = parse_url($vv['subCfg']['baseUrl']);
                $host = strtolower($urlParts['host']);
                $urlObj->host = $host;

                // First pass, finding configuration OR pointer string:
                $urlObj->extConf = isset($GLOBALS['TYPO3_CONF_VARS']['EXTCONF']['realurl'][$urlObj->host]) ? $GLOBALS['TYPO3_CONF_VARS']['EXTCONF']['realurl'][$urlObj->host] : $GLOBALS['TYPO3_CONF_VARS']['EXTCONF']['realurl']['_DEFAULT'];

                // If it turned out to be a string pointer, then look up the real config:
                if (is_string($urlObj->extConf)) {
                    $urlObj->extConf = is_array($GLOBALS['TYPO3_CONF_VARS']['EXTCONF']['realurl'][$urlObj->extConf]) ? $GLOBALS['TYPO3_CONF_VARS']['EXTCONF']['realurl'][$urlObj->extConf] : $GLOBALS['TYPO3_CONF_VARS']['EXTCONF']['realurl']['_DEFAULT'];
                }
            }

            // Fill in the parts of the frontend controller that are read further down
            // or handed to realurl, in case they have not been set up yet.
            if (!$GLOBALS['TSFE']->sys_page) {
                $GLOBALS['TSFE']->sys_page = GeneralUtility::makeInstance('TYPO3\CMS\Frontend\Page\PageRepository');
            }
            if (!$GLOBALS['TSFE']->csConvObj) {
                $GLOBALS['TSFE']->csConvObj = GeneralUtility::makeInstance('TYPO3\CMS\Core\Charset\CharsetConverter');
            }
            if (!$GLOBALS['TSFE']->tmpl->rootLine[0]['uid']) {
                $GLOBALS['TSFE']->tmpl->rootLine[0]['uid'] = $urlObj->extConf['pagePath']['rootpage_id'];
            }
        }

        if (!is_array($vv['URLs'])) {
            return 'ERROR - no URL generated';
        }

        $configurationHash = $this->getConfigurationHash($vv);
        $skipInnerCheck = $this->noUnprocessedQueueEntriesForPageWithConfigurationHashExist($pageRow['uid'], $configurationHash);

        // Seconds between two scheduled requests. A rate of zero would be a division by
        // zero, so in that case no interleave is applied.
        $requestsPerMinute = (float)$reqMinute;
        $secondsPerRequest = $requestsPerMinute != 0 ? 60 / $requestsPerMinute : 0;

        foreach ($vv['URLs'] as $urlQuery) {
            if (!$this->drawURLs_PIfilter($vv['subCfg']['procInstrFilter'], $incomingProcInstructions)) {
                continue;
            }

            // Calculate cHash:
            if ($vv['subCfg']['cHash']) {
                /* @var $cacheHash \TYPO3\CMS\Frontend\Page\CacheHashCalculator */
                $cacheHash = GeneralUtility::makeInstance('TYPO3\CMS\Frontend\Page\CacheHashCalculator');
                $urlQuery .= '&cHash=' . $cacheHash->generateForParameters($urlQuery);
            }

            // Create key by which to determine unique-ness:
            $uKey = $urlQuery . '|' . $vv['subCfg']['userGroups'] . '|' . $vv['subCfg']['baseUrl'] . '|' . $vv['subCfg']['procInstrFilter'];

            // realurl support (thanks to Ingo Renner)
            $urlQuery = 'index.php' . $urlQuery;
            if ($useRealUrl) {
                $params = [
                    'LD' => [
                        'totalURL' => $urlQuery
                    ],
                    'TCEmainHook' => true
                ];
                $urlObj->encodeSpURL($params);
                $urlQuery = $params['LD']['totalURL'];
            }

            // Scheduled time: offset by the number of URLs tracked so far, then
            // rounded down to the full minute.
            $schTime = $scheduledTime + round(count($duplicateTrack) * $secondsPerRequest);
            $schTime = floor($schTime / 60) * 60;

            // NOTE(review): $urlList is assigned (not appended) in both branches below,
            // so the returned markup only covers the last URL handled - confirm whether
            // that is intended before changing it.
            if (isset($duplicateTrack[$uKey])) {
                // If the URL key is already registered, just display it and do not resubmit it.
                $urlList = '<em><span class="typo3-dimmed">' . htmlspecialchars($urlQuery) . '</span></em><br/>';
            } else {
                $urlList = '[' . date('d.m.y H:i', $schTime) . '] ' . htmlspecialchars($urlQuery);
                $this->urlList[] = '[' . date('d.m.y H:i', $schTime) . '] ' . $urlQuery;

                $theUrl = ($vv['subCfg']['baseUrl'] ? $vv['subCfg']['baseUrl'] : GeneralUtility::getIndpEnv('TYPO3_SITE_URL')) . $urlQuery;

                // Submit for crawling!
                if ($submitCrawlUrls) {
                    $added = $this->addUrl(
                        $pageRow['uid'],
                        $theUrl,
                        $vv['subCfg'],
                        $scheduledTime,
                        $configurationHash,
                        $skipInnerCheck
                    );
                    if ($added === false) {
                        $urlList .= ' (Url already existed)';
                    }
                } elseif ($downloadCrawlUrls) {
                    $downloadUrls[$theUrl] = $theUrl;
                }

                $urlList .= '<br />';
            }
            $duplicateTrack[$uKey] = true;
        }

        return $urlList;
    }
517
518
    /**
519
     * Returns true if input processing instruction is among registered ones.
520
     *
521
     * @param string $piString PI to test
522
     * @param array $incomingProcInstructions Processing instructions
523
     * @return boolean
524
     */
525 5
    public function drawURLs_PIfilter($piString, array $incomingProcInstructions)
    {
        // Without incoming processing instructions nothing is filtered out.
        if (empty($incomingProcInstructions)) {
            return true;
        }

        // Accept as soon as one incoming instruction is contained in the comma list $piString.
        foreach ($incomingProcInstructions as $pi) {
            if (GeneralUtility::inList($piString, $pi)) {
                return true;
            }
        }

        // Explicit false instead of falling off the end (which yields null), so the
        // documented boolean return type holds.
        return false;
    }
537
538 4
    public function getPageTSconfigForId($id)
    {
        // When a mount point is set, the TSconfig is read for the page id that follows
        // the '-' in the MP value instead of for $id.
        $tsConfigPageId = $id;
        if ($this->MP) {
            $mountPointParts = explode('-', $this->MP);
            $tsConfigPageId = $mountPointParts[1];
        }
        $pageTSconfig = BackendUtility::getPagesTSconfig($tsConfigPageId);

        // Call a hook to alter configuration; the TSconfig is handed over by reference
        // so hooks can modify it in place.
        if (is_array($GLOBALS['TYPO3_CONF_VARS']['EXTCONF']['crawler']['getPageTSconfigForId'])) {
            $params = [
                'pageId' => $id,
                'pageTSConfig' => &$pageTSconfig
            ];
            foreach ($GLOBALS['TYPO3_CONF_VARS']['EXTCONF']['crawler']['getPageTSconfigForId'] as $userFunc) {
                GeneralUtility::callUserFunction($userFunc, $params, $this);
            }
        }

        return $pageTSconfig;
    }
560
561
    /**
562
     * This method returns an array of configurations.
563
     * And no urls!
564
     *
565
     * @param integer $id Page ID
566
     * @param bool $forceSsl Use https
567
     * @return array
568
     */
569 4
    protected function getUrlsForPageId($id, $forceSsl = false)
570
    {
571
572
        /**
573
         * Get configuration from tsConfig
574
         */
575
576
        // Get page TSconfig for page ID:
577 4
        $pageTSconfig = $this->getPageTSconfigForId($id);
578
579 4
        $res = [];
580
581 4
        if (is_array($pageTSconfig) && is_array($pageTSconfig['tx_crawler.']['crawlerCfg.'])) {
582 3
            $crawlerCfg = $pageTSconfig['tx_crawler.']['crawlerCfg.'];
583
584 3
            if (is_array($crawlerCfg['paramSets.'])) {
585 3
                foreach ($crawlerCfg['paramSets.'] as $key => $values) {
586 3
                    if (is_array($values)) {
587 3
                        $key = str_replace('.', '', $key);
588
                        // Sub configuration for a single configuration string:
589 3
                        $subCfg = (array)$crawlerCfg['paramSets.'][$key . '.'];
590 3
                        $subCfg['key'] = $key;
591
592 3
                        if (strcmp($subCfg['procInstrFilter'], '')) {
593 3
                            $subCfg['procInstrFilter'] = implode(',', GeneralUtility::trimExplode(',', $subCfg['procInstrFilter']));
594
                        }
595 3
                        $pidOnlyList = implode(',', GeneralUtility::trimExplode(',', $subCfg['pidsOnly'], true));
596
597
                        // process configuration if it is not page-specific or if the specific page is the current page:
598 3
                        if (!strcmp($subCfg['pidsOnly'], '') || GeneralUtility::inList($pidOnlyList, $id)) {
599
600
                                // add trailing slash if not present
601 3
                            if (!empty($subCfg['baseUrl']) && substr($subCfg['baseUrl'], -1) != '/') {
602
                                $subCfg['baseUrl'] .= '/';
603
                            }
604
605
                            // Explode, process etc.:
606 3
                            $res[$key] = [];
607 3
                            $res[$key]['subCfg'] = $subCfg;
608 3
                            $res[$key]['paramParsed'] = $this->parseParams($values);
0 ignored issues
show
Documentation introduced by
$values is of type array, but the function expects a string.

It seems like the type of the argument is not accepted by the function/method which you are calling.

In some cases, in particular if PHP’s automatic type-juggling kicks in this might be fine. In other cases, however this might be a bug.

We suggest to add an explicit type cast like in the following example:

function acceptsInteger($int) { }

$x = '123'; // string "123"

// Instead of
acceptsInteger($x);

// we recommend to use
acceptsInteger((integer) $x);
Loading history...
609 3
                            $res[$key]['paramExpanded'] = $this->expandParameters($res[$key]['paramParsed'], $id);
610 3
                            $res[$key]['origin'] = 'pagets';
611
612
                            // recognize MP value
613 3
                            if (!$this->MP) {
614 3
                                $res[$key]['URLs'] = $this->compileUrls($res[$key]['paramExpanded'], ['?id=' . $id]);
615
                            } else {
616 3
                                $res[$key]['URLs'] = $this->compileUrls($res[$key]['paramExpanded'], ['?id=' . $id . '&MP=' . $this->MP]);
617
                            }
618
                        }
619
                    }
620
                }
621
            }
622
        }
623
624
        /**
625
         * Get configuration from tx_crawler_configuration records
626
         */
627
628
        // get records along the rootline
629 4
        $rootLine = BackendUtility::BEgetRootLine($id);
630
631 4
        foreach ($rootLine as $page) {
632 4
            $configurationRecordsForCurrentPage = BackendUtility::getRecordsByField(
633 4
                'tx_crawler_configuration',
634 4
                'pid',
635 4
                intval($page['uid']),
636 4
                BackendUtility::BEenableFields('tx_crawler_configuration') . BackendUtility::deleteClause('tx_crawler_configuration')
637
            );
638
639 4
            if (is_array($configurationRecordsForCurrentPage)) {
640 1
                foreach ($configurationRecordsForCurrentPage as $configurationRecord) {
641
642
                        // check access to the configuration record
643 1
                    if (empty($configurationRecord['begroups']) || $GLOBALS['BE_USER']->isAdmin() || $this->hasGroupAccess($GLOBALS['BE_USER']->user['usergroup_cached_list'], $configurationRecord['begroups'])) {
644 1
                        $pidOnlyList = implode(',', GeneralUtility::trimExplode(',', $configurationRecord['pidsonly'], true));
645
646
                        // process configuration if it is not page-specific or if the specific page is the current page:
647 1
                        if (!strcmp($configurationRecord['pidsonly'], '') || GeneralUtility::inList($pidOnlyList, $id)) {
648 1
                            $key = $configurationRecord['name'];
649
650
                            // don't overwrite previously defined paramSets
651 1
                            if (!isset($res[$key])) {
652
653
                                    /* @var $TSparserObject \TYPO3\CMS\Core\TypoScript\Parser\TypoScriptParser */
654 1
                                $TSparserObject = GeneralUtility::makeInstance('TYPO3\CMS\Core\TypoScript\Parser\TypoScriptParser');
655 1
                                $TSparserObject->parse($configurationRecord['processing_instruction_parameters_ts']);
656
657
                                // Check Crawler Configurations forceSsl if false use the Page url_scheme.
658 1
                                $crawlerWithSsl = $configurationRecord['force_ssl'] ?: $forceSsl;
659
660
                                $subCfg = [
661 1
                                    'procInstrFilter' => $configurationRecord['processing_instruction_filter'],
662 1
                                    'procInstrParams.' => $TSparserObject->setup,
663 1
                                    'baseUrl' => $this->getBaseUrlForConfigurationRecord(
664 1
                                        $configurationRecord['base_url'],
665 1
                                        $configurationRecord['sys_domain_base_url'],
666 1
                                        $crawlerWithSsl
667
                                    ),
668 1
                                    'realurl' => $configurationRecord['realurl'],
669 1
                                    'cHash' => $configurationRecord['chash'],
670 1
                                    'userGroups' => $configurationRecord['fegroups'],
671 1
                                    'exclude' => $configurationRecord['exclude'],
672 1
                                    'rootTemplatePid' => (int) $configurationRecord['root_template_pid'],
673 1
                                    'key' => $key,
674 1
                                    'force_ssl' => $configurationRecord['force_ssl'],
675
                                ];
676
677
                                // add trailing slash if not present
678 1
                                if (!empty($subCfg['baseUrl']) && substr($subCfg['baseUrl'], -1) != '/') {
679
                                    $subCfg['baseUrl'] .= '/';
680
                                }
681 1
                                if (!in_array($id, $this->expandExcludeString($subCfg['exclude']))) {
682 1
                                    $res[$key] = [];
683 1
                                    $res[$key]['subCfg'] = $subCfg;
684 1
                                    $res[$key]['paramParsed'] = $this->parseParams($configurationRecord['configuration']);
685 1
                                    $res[$key]['paramExpanded'] = $this->expandParameters($res[$key]['paramParsed'], $id);
686 1
                                    $res[$key]['URLs'] = $this->compileUrls($res[$key]['paramExpanded'], ['?id=' . $id]);
687 4
                                    $res[$key]['origin'] = 'tx_crawler_configuration_' . $configurationRecord['uid'];
688
                                }
689
                            }
690
                        }
691
                    }
692
                }
693
            }
694
        }
695
696 4
        if (is_array($GLOBALS['TYPO3_CONF_VARS']['EXTCONF']['crawler']['processUrls'])) {
697
            foreach ($GLOBALS['TYPO3_CONF_VARS']['EXTCONF']['crawler']['processUrls'] as $func) {
698
                $params = [
699
                    'res' => &$res,
700
                ];
701
                GeneralUtility::callUserFunction($func, $params, $this);
702
            }
703
        }
704
705 4
        return $res;
706
    }
707
708
    /**
     * Resolves the base URL to use for a crawler configuration record.
     *
     * If $sysDomainUid is greater than zero, the matching sys_domain record is looked up
     * (restricted by BackendUtility::BEenableFields() and BackendUtility::deleteClause()).
     * When such a record exists and has a non-empty domainName, "<scheme>://<domainName>"
     * is returned. In every other case the given $baseUrl is returned unchanged.
     *
     * @param string $baseUrl Fallback base URL
     * @param integer $sysDomainUid UID of the sys_domain record; values <= 0 skip the lookup
     * @param bool $ssl If truthy the scheme is "https", otherwise "http"
     * @return string
     */
    protected function getBaseUrlForConfigurationRecord($baseUrl, $sysDomainUid, $ssl = false)
    {
        $sysDomainUid = intval($sysDomainUid);
        if ($sysDomainUid <= 0) {
            return $baseUrl;
        }

        $res = $this->db->exec_SELECTquery(
            '*',
            'sys_domain',
            'uid = ' . $sysDomainUid .
            BackendUtility::BEenableFields('sys_domain') .
            BackendUtility::deleteClause('sys_domain')
        );
        $row = $this->db->sql_fetch_assoc($res);
        // Release the result set, as getConfigurationsForBranch() does for its query.
        $this->db->sql_free_result($res);

        // $row is not an array when no (accessible) domain record was found.
        if (is_array($row) && $row['domainName'] != '') {
            // Evaluate $ssl by truthiness so that 0, '0' and null mean "no SSL",
            // not only the boolean false.
            $urlScheme = $ssl ? 'https' : 'http';
            return $urlScheme . '://' . $row['domainName'];
        }

        return $baseUrl;
    }
736
737
    /**
     * Collects the names of the crawler configurations available for a page branch.
     *
     * Names are gathered from two sources:
     * - the keys of "tx_crawler.crawlerCfg.paramSets." in the page TSconfig of $rootid
     *   (only entries whose value is an array; one trailing dot is stripped from the key)
     * - the "name" column of tx_crawler_configuration records stored on a page of the
     *   rootline of $rootid or on a page of the page tree read below $rootid
     *
     * @param integer $rootid Page ID of the branch root
     * @param integer $depth Depth passed to PageTreeView::getTree()
     * @return array List of configuration names; names are not de-duplicated
     */
    public function getConfigurationsForBranch($rootid, $depth)
    {
        $configurationsForBranch = [];

        // Configurations defined in page TSconfig
        $pageTSconfig = $this->getPageTSconfigForId($rootid);
        if (is_array($pageTSconfig)
            && isset($pageTSconfig['tx_crawler.']['crawlerCfg.']['paramSets.'])
            && is_array($pageTSconfig['tx_crawler.']['crawlerCfg.']['paramSets.'])
        ) {
            foreach ($pageTSconfig['tx_crawler.']['crawlerCfg.']['paramSets.'] as $key => $value) {
                if (!is_array($value)) {
                    continue;
                }
                $configurationsForBranch[] = substr($key, -1) == '.' ? substr($key, 0, -1) : $key;
            }
        }

        // Page IDs of the rootline ...
        $pids = [];
        $rootLine = BackendUtility::BEgetRootLine($rootid);
        foreach ($rootLine as $node) {
            $pids[] = intval($node['uid']);
        }

        // ... and of the page tree below the root page (limited by the user's page permissions)
        /* @var PageTreeView $tree */
        $tree = GeneralUtility::makeInstance(PageTreeView::class);
        $perms_clause = $GLOBALS['BE_USER']->getPagePermsClause(1);
        $tree->init('AND ' . $perms_clause);
        $tree->getTree($rootid, $depth, '');
        foreach ($tree->tree as $node) {
            $pids[] = intval($node['row']['uid']);
        }

        // "pid IN ()" is not valid SQL, so there is nothing to query without page IDs.
        if (empty($pids)) {
            return $configurationsForBranch;
        }

        // Configurations defined as tx_crawler_configuration records
        $res = $this->db->exec_SELECTquery(
            '*',
            'tx_crawler_configuration',
            'pid IN (' . implode(',', $pids) . ') ' .
            BackendUtility::BEenableFields('tx_crawler_configuration') .
            BackendUtility::deleteClause('tx_crawler_configuration') . ' ' .
            BackendUtility::versioningPlaceholderClause('tx_crawler_configuration') . ' '
        );

        while ($row = $this->db->sql_fetch_assoc($res)) {
            $configurationsForBranch[] = $row['name'];
        }
        $this->db->sql_free_result($res);

        return $configurationsForBranch;
    }
782
783
    /**
     * Tells whether a user, identified by its group list, may access an item.
     * (The group list of the current frontend user is e.g. available in $GLOBALS['TSFE']->gr_list.)
     *
     * @see \TYPO3\CMS\Frontend\Page\PageRepository::getMultipleGroupsWhereClause()
     * @param  string $groupList    Comma-separated list of group UIDs of the user
     * @param  string $accessList   Comma-separated list of group UIDs that grant access to the item
     * @return bool                 TRUE if the access list is empty or contains at least one of the user's group UIDs
     */
    public function hasGroupAccess($groupList, $accessList)
    {
        // An item without access restriction is accessible for everybody.
        $accessGranted = empty($accessList);

        if (!$accessGranted) {
            $userGroupUids = GeneralUtility::intExplode(',', $groupList);
            foreach ($userGroupUids as $userGroupUid) {
                if (GeneralUtility::inList($accessList, $userGroupUid)) {
                    $accessGranted = true;
                    break;
                }
            }
        }

        return $accessGranted;
    }
804
805
    /**
     * Parses the GET vars of a query string into an array of key => value pairs.
     *
     * The query is split at "&"; each part is split at its first "=". Names and values are
     * rawurldecode()'d. Parts with an empty name are skipped, a part without "=" gets an
     * empty string as value. If the same name occurs more than once, the last one wins.
     *
     * @param string $inputQuery Input query string, e.g. "&L=[0-2]&tx_foo[uid]=5"
     * @return array Decoded parameter names mapped to their decoded values; empty if
     *               $inputQuery is an array or contains no named parameter
     */
    public function parseParams($inputQuery)
    {
        $paramKeyValues = [];

        // An array cannot be split like a query string; treat it as "no parameters".
        if (is_array($inputQuery)) {
            return $paramKeyValues;
        }

        foreach (explode('&', (string)$inputQuery) as $paramAndValue) {
            $nameAndValue = explode('=', $paramAndValue, 2);
            $name = $nameAndValue[0];
            // A part without "=" has no second element.
            $value = isset($nameAndValue[1]) ? $nameAndValue[1] : '';

            if ($name !== '') {
                $paramKeyValues[rawurldecode($name)] = rawurldecode($value);
            }
        }

        return $paramKeyValues;
    }
826
827
    /**
     * Expands the parameter configuration to the individual values of each parameter.
     *
     * Syntax of a parameter value:
     * - If the (trimmed) value is wrapped in [...] it is expanded as described below, otherwise it is taken literally.
     * - The content of the brackets is split by "|"; the parts are processed individually and their results are added together.
     * - For each part:
     *         - "[int]-[int]" = Integer range, expanded to all values in between, values included, from low to high
     *           (max. 1000 values per range). Example "1-34" or "-40--30"
     *         - "_TABLE:[TCA table name];[_PID:[optional page id, default is current page]];[_ENABLELANG:1]" = Look up of
     *           table records from PID, filtering out deleted records. Example "_TABLE:tt_content; _PID:123"
     *           _ENABLELANG:1 picks only original records without their language overlays.
     *           Further optional keys read by this method: _PIDFIELD (default "pid"), _FIELD (default "uid"),
     *           _WHERE and _ADDTABLE (both appended to the query as given).
     *         - Default: Literal value
     * - After each part the hooks registered in
     *   $GLOBALS['TYPO3_CONF_VARS']['SC_OPTIONS']['crawler/class.tx_crawler_lib.php']['expandParameters']
     *   are called; they receive $paramArray by reference.
     *
     * @param array $paramArray Array with key (GET var name) and values (value of GET var which is configuration for expansion)
     * @param integer $pid Current page ID
     * @return array Array with key (GET var name) and, for each, the array of expanded values
     */
    public function expandParameters($paramArray, $pid)
    {
        global $TCA;

        // Traverse parameter names:
        foreach ($paramArray as $p => $v) {
            $v = trim($v);

            // If value is encapsulated in square brackets it means there are some ranges of values to find, otherwise the value is literal
            if (substr($v, 0, 1) === '[' && substr($v, -1) === ']') {
                // So, find the value inside brackets and reset the paramArray value as an array.
                $v = substr($v, 1, -1);
                $paramArray[$p] = [];

                // Explode parts and traverse them:
                $parts = explode('|', $v);
                foreach ($parts as $pV) {

                    // Look for integer range: (fx. 1-34 or -40--30 // reads minus 40 to minus 30)
                    if (preg_match('/^(-?[0-9]+)\s*-\s*(-?[0-9]+)$/', trim($pV), $reg)) {

                        // Swap if first is larger than last:
                        if ($reg[1] > $reg[2]) {
                            $temp = $reg[2];
                            $reg[2] = $reg[1];
                            $reg[1] = $temp;
                        }

                        // Traverse range, add values:
                        $runAwayBrake = 1000; // Limit to size of range!
                        for ($a = $reg[1]; $a <= $reg[2]; $a++) {
                            $paramArray[$p][] = $a;
                            $runAwayBrake--;
                            if ($runAwayBrake <= 0) {
                                break;
                            }
                        }
                    } elseif (substr(trim($pV), 0, 7) == '_TABLE:') {

                        // Parse parameters ("key:value" pairs separated by ";"):
                        $subparts = GeneralUtility::trimExplode(';', $pV);
                        $subpartParams = [];
                        foreach ($subparts as $spV) {
                            $keyAndValue = GeneralUtility::trimExplode(':', $spV);
                            // A sub-part without ":" has no value; store NULL so isset() treats it as absent.
                            $subpartParams[$keyAndValue[0]] = isset($keyAndValue[1]) ? $keyAndValue[1] : null;
                        }

                        // Table exists:
                        if (isset($subpartParams['_TABLE']) && isset($TCA[$subpartParams['_TABLE']])) {
                            $tableName = $subpartParams['_TABLE'];
                            $lookUpPid = isset($subpartParams['_PID']) ? intval($subpartParams['_PID']) : $pid;
                            $pidField = isset($subpartParams['_PIDFIELD']) ? trim($subpartParams['_PIDFIELD']) : 'pid';
                            $where = isset($subpartParams['_WHERE']) ? $subpartParams['_WHERE'] : '';
                            $addTable = isset($subpartParams['_ADDTABLE']) ? $subpartParams['_ADDTABLE'] : '';

                            $fieldName = !empty($subpartParams['_FIELD']) ? $subpartParams['_FIELD'] : 'uid';

                            // Only "uid" or a field configured in TCA may be selected.
                            if ($fieldName === 'uid' || !empty($TCA[$tableName]['columns'][$fieldName])) {
                                $andWhereLanguage = '';
                                $transOrigPointerField = isset($TCA[$tableName]['ctrl']['transOrigPointerField'])
                                    ? $TCA[$tableName]['ctrl']['transOrigPointerField']
                                    : '';

                                if (!empty($subpartParams['_ENABLELANG']) && $transOrigPointerField) {
                                    $andWhereLanguage = ' AND ' . $this->db->quoteStr($transOrigPointerField, $tableName) . ' <= 0 ';
                                }

                                $where = $this->db->quoteStr($pidField, $tableName) . '=' . intval($lookUpPid) . ' ' .
                                    $andWhereLanguage . $where;

                                // The last argument indexes the result rows by $fieldName,
                                // so the array keys are the values to add.
                                $rows = $this->db->exec_SELECTgetRows(
                                    $fieldName,
                                    $tableName . $addTable,
                                    $where . BackendUtility::deleteClause($tableName),
                                    '',
                                    '',
                                    '',
                                    $fieldName
                                );

                                if (is_array($rows)) {
                                    $paramArray[$p] = array_merge($paramArray[$p], array_keys($rows));
                                }
                            }
                        }
                    } else { // Just add value:
                        $paramArray[$p][] = $pV;
                    }

                    // Hook for processing own expandParameters place holder
                    if (isset($GLOBALS['TYPO3_CONF_VARS']['SC_OPTIONS']['crawler/class.tx_crawler_lib.php']['expandParameters'])
                        && is_array($GLOBALS['TYPO3_CONF_VARS']['SC_OPTIONS']['crawler/class.tx_crawler_lib.php']['expandParameters'])
                    ) {
                        $_params = [
                            'pObj' => &$this,
                            'paramArray' => &$paramArray,
                            'currentKey' => $p,
                            'currentValue' => $pV,
                            'pid' => $pid
                        ];
                        foreach ($GLOBALS['TYPO3_CONF_VARS']['SC_OPTIONS']['crawler/class.tx_crawler_lib.php']['expandParameters'] as $_funcRef) {
                            GeneralUtility::callUserFunction($_funcRef, $_params, $this);
                        }
                    }
                }

                // Make unique set of values and sort the parameter array by key (= GET var name):
                $paramArray[$p] = array_unique($paramArray[$p]);
                ksort($paramArray);
            } else {
                // Set the literal value as only value in array:
                $paramArray[$p] = [$v];
            }
        }

        return $paramArray;
    }
952
953
    /**
     * Compiles URLs from a parameter array (output of expandParameters()).
     *
     * Every URL in $urls is combined with every value of the first parameter, then the method
     * recurses with the remaining parameters. Without a limit the number of URLs is the product
     * of the number of values of all parameters. An empty value ('') adds nothing to the URL.
     *
     * The extension setting "maxCompileUrls" (forced into the range 1..1000000000, default 10000)
     * limits the result: as soon as one level holds more URLs than the limit, compiling that
     * level stops. The check runs after appending, so a level holds at most limit + 1 URLs.
     *
     * @param array $paramArray Output of expandParameters(): Array with keys (GET var names) and for each an array of values
     * @param array $urls URLs accumulated in this array (for recursion)
     * @return array
     */
    public function compileUrls($paramArray, $urls = [])
    {
        if (count($paramArray) && is_array($urls)) {
            // The limit does not change while compiling, so resolve it once per level.
            $configuredLimit = isset($this->extensionSettings['maxCompileUrls']) ? $this->extensionSettings['maxCompileUrls'] : 0;
            $maxCompileUrls = MathUtility::forceIntegerInRange($configuredLimit, 1, 1000000000, 10000);

            // Take the first parameter off the stack. unset() is used instead of array_shift()
            // because array_shift() renumbers numeric keys, i.e. would rename numeric GET var names.
            reset($paramArray);
            $varName = key($paramArray);
            $valueSet = $paramArray[$varName];
            unset($paramArray[$varName]);

            // Traverse value set:
            $newUrls = [];
            foreach ($urls as $url) {
                foreach ($valueSet as $val) {
                    $newUrls[] = $url . (strcmp($val, '') ? '&' . rawurlencode($varName) . '=' . rawurlencode($val) : '');

                    if (count($newUrls) > $maxCompileUrls) {
                        // Leave both loops; leaving only the inner one would add one more
                        // URL for every remaining base URL.
                        break 2;
                    }
                }
            }

            $urls = $this->compileUrls($paramArray, $newUrls);
        }

        return $urls;
    }
986
987
    /************************************
988
     *
989
     * Crawler log
990
     *
991
     ************************************/
992
993
    /**
     * Returns the crawler queue records of a page, or flushes them.
     *
     * @param integer $id Page ID for which to look up log entries.
     * @param string $filter Filter: "pending" => entries that have not run yet (exec_time=0), "finished" => completed entries (exec_time>0), anything else => all entries
     * @param boolean $doFlush If TRUE, the selected entries are DELETED(!) instead of returned
     * @param boolean $doFullFlush Only used together with $doFlush: if TRUE, the flush is not restricted to the page (the filter still applies)
     * @param integer $itemsPerPage Maximum number of entries to return, default is 10; values <= 0 disable the limit
     * @return array Queue records ordered by "scheduled" descending; an empty array after a flush
     */
    public function getLogEntriesForPageId($id, $filter = '', $doFlush = false, $doFullFlush = false, $itemsPerPage = 10)
    {
        // Loose comparison on purpose: same matching rules as a switch statement.
        if ($filter == 'pending') {
            $addWhere = ' AND exec_time=0';
        } elseif ($filter == 'finished') {
            $addWhere = ' AND exec_time>0';
        } else {
            $addWhere = '';
        }

        // FIXME: Write unit test that ensures that the right records are deleted.
        if ($doFlush) {
            $flushScope = $doFullFlush ? '1=1' : ('page_id=' . intval($id));
            $this->flushQueue($flushScope . $addWhere);

            return [];
        }

        $limit = intval($itemsPerPage) > 0 ? intval($itemsPerPage) : '';

        return $this->db->exec_SELECTgetRows(
            '*',
            'tx_crawler_queue',
            'page_id=' . intval($id) . $addWhere,
            '',
            'scheduled DESC',
            $limit
        );
    }
1032
1033
    /**
     * Returns the crawler queue records of a set, or flushes them.
     *
     * @param integer $set_id Set ID for which to look up log entries.
     * @param string $filter Filter: "pending" => entries that have not run yet (exec_time=0), "finished" => completed entries (exec_time>0), anything else => all entries
     * @param boolean $doFlush If TRUE, the selected entries are DELETED(!) instead of returned
     * @param boolean $doFullFlush Only used together with $doFlush: if TRUE, the flush is done without any condition, i.e. $set_id and $filter are ignored
     * @param integer $itemsPerPage Maximum number of entries to return, default is 10; values <= 0 disable the limit
     * @return array Queue records ordered by "scheduled" descending; an empty array after a flush
     */
    public function getLogEntriesForSetId($set_id, $filter = '', $doFlush = false, $doFullFlush = false, $itemsPerPage = 10)
    {
        // FIXME: Write Unit tests for Filters
        // Loose comparison on purpose: same matching rules as a switch statement.
        if ($filter == 'pending') {
            $addWhere = ' AND exec_time=0';
        } elseif ($filter == 'finished') {
            $addWhere = ' AND exec_time>0';
        } else {
            $addWhere = '';
        }

        // FIXME: Write unit test that ensures that the right records are deleted.
        if ($doFlush) {
            if ($doFullFlush) {
                $flushWhere = '';
            } else {
                $flushWhere = 'set_id=' . intval($set_id) . $addWhere;
            }
            $this->flushQueue($flushWhere);

            return [];
        }

        $limit = intval($itemsPerPage) > 0 ? intval($itemsPerPage) : '';

        return $this->db->exec_SELECTgetRows(
            '*',
            'tx_crawler_queue',
            'set_id=' . intval($set_id) . $addWhere,
            '',
            'scheduled DESC',
            $limit
        );
    }
1071
1072
    /**
     * Removes queue entries.
     *
     * If an observer is registered for the "queueEntryFlush" event, the event is posted once
     * per affected set_id (together with the entries of that set) before the entries are deleted.
     *
     * @param string $where SQL related filter for the entries which should be removed; an empty string removes all entries
     * @return void
     */
    protected function flushQueue($where = '')
    {
        $realWhere = strlen($where) > 0 ? $where : '1=1';

        $eventDispatcher = EventDispatcher::getInstance();
        if ($eventDispatcher->hasObserver('queueEntryFlush')) {
            $groups = $this->db->exec_SELECTgetRows('DISTINCT set_id', 'tx_crawler_queue', $realWhere);
            if (is_array($groups)) {
                foreach ($groups as $group) {
                    // The filter is wrapped in parentheses so the added condition binds to the
                    // whole filter; set_id is cast like every other numeric value in the queries
                    // of this class.
                    // NOTE(review): other queries address queue rows by "qid" - confirm that
                    // tx_crawler_queue really has a "uid" column.
                    $flushedEntries = $this->db->exec_SELECTgetRows(
                        'uid, set_id',
                        'tx_crawler_queue',
                        '(' . $realWhere . ') AND set_id=' . intval($group['set_id'])
                    );
                    $eventDispatcher->post('queueEntryFlush', $group['set_id'], $flushedEntries);
                }
            }
        }

        $this->db->exec_DELETEquery('tx_crawler_queue', $realWhere);
    }
1093
1094
    /**
     * Adds a call back entry to the queue (typically called from hooks, see indexed search class "class.crawler.php").
     *
     * The call back reference is stored in the serialized parameters under the key "_CALLBACKOBJ".
     *
     * @param integer $setId Set ID
     * @param array $params Parameters to pass to the call back function; anything but an array is replaced by an empty array
     * @param string $callBack Call back object reference, eg. 'EXT:indexed_search/class.crawler.php:&tx_indexedsearch_crawler'
     * @param integer $page_id Page ID to attach it to
     * @param integer $schedule Time at which to activate; 0 means the current time
     * @return void
     */
    public function addQueueEntry_callBack($setId, $params, $callBack, $page_id = 0, $schedule = 0)
    {
        $callBackParameters = is_array($params) ? $params : [];
        $callBackParameters['_CALLBACKOBJ'] = $callBack;
        $serializedParameters = serialize($callBackParameters);

        // Fall back to "now" if no schedule time is given.
        $scheduledTime = intval($schedule);
        if (!$scheduledTime) {
            $scheduledTime = $this->getCurrentTime();
        }

        $this->db->exec_INSERTquery(
            'tx_crawler_queue',
            [
                'page_id' => intval($page_id),
                'parameters' => $serializedParameters,
                'scheduled' => $scheduledTime,
                'exec_time' => 0,
                'set_id' => intval($setId),
                'result_data' => '',
            ]
        );
    }
1123
1124
    /************************************
1125
     *
1126
     * URL setting
1127
     *
1128
     ************************************/
1129
1130
    /**
     * Sets a URL for crawling, i.e. creates a queue entry for it.
     *
     * Depending on $this->registerQueueEntriesInternallyOnly the entry is either only collected
     * in $this->queueEntries or written to tx_crawler_queue. Before writing, the queue is checked
     * for an equal entry (unless $skipInnerDuplicationCheck is set). The events "urlAddedToQueue"
     * and "duplicateUrlInQueue" are posted accordingly.
     *
     * @param integer $id Page ID
     * @param string $url Complete URL
     * @param array $subCfg Sub configuration array (from TS config)
     * @param integer $tstamp Scheduled-time
     * @param string $configurationHash (optional) configuration hash
     * @param bool $skipInnerDuplicationCheck (optional) skip inner duplication check
     * @return bool TRUE if the entry was inserted into tx_crawler_queue, otherwise FALSE
     */
    public function addUrl(
        $id,
        $url,
        array $subCfg,
        $tstamp,
        $configurationHash = '',
        $skipInnerDuplicationCheck = false
    ) {
        // Creating parameters:
        $parameters = ['url' => $url];

        // fe user group simulation:
        $feUserGroupUids = array_unique(GeneralUtility::intExplode(',', $subCfg['userGroups'], true));
        $feUserGroupList = implode(',', $feUserGroupUids);
        if ($feUserGroupList) {
            $parameters['feUserGroupList'] = $feUserGroupList;
        }

        // Setting processing instructions
        $parameters['procInstructions'] = GeneralUtility::trimExplode(',', $subCfg['procInstrFilter']);
        if (is_array($subCfg['procInstrParams.'])) {
            $parameters['procInstrParams'] = $subCfg['procInstrParams.'];
        }

        // Possible TypoScript Template Parents
        $parameters['rootTemplatePid'] = $subCfg['rootTemplatePid'];

        // Compile value array:
        $serializedParameters = serialize($parameters);
        $fieldArray = [
            'page_id' => intval($id),
            'parameters' => $serializedParameters,
            'parameters_hash' => GeneralUtility::shortMD5($serializedParameters),
            'configuration_hash' => $configurationHash,
            'scheduled' => $tstamp,
            'exec_time' => 0,
            'set_id' => intval($this->setID),
            'result_data' => '',
            'configuration' => $subCfg['key'],
        ];

        if ($this->registerQueueEntriesInternallyOnly) {
            // the entry is only registered, not stored to the database
            $this->queueEntries[] = $fieldArray;

            return false;
        }

        // check if there is already an equal entry
        $duplicateRows = [];
        if (!$skipInnerDuplicationCheck) {
            $duplicateRows = $this->getDuplicateRowsIfExist($tstamp, $fieldArray);
        }

        if (count($duplicateRows) != 0) {
            EventDispatcher::getInstance()->post('duplicateUrlInQueue', $this->setID, ['rows' => $duplicateRows, 'fieldArray' => $fieldArray]);

            return false;
        }

        $this->db->exec_INSERTquery('tx_crawler_queue', $fieldArray);
        $uid = $this->db->sql_insert_id();
        EventDispatcher::getInstance()->post('urlAddedToQueue', $this->setID, ['uid' => $uid, 'fieldArray' => $fieldArray]);

        return true;
    }
1208
1209
    /**
     * Determines the duplicates of a queue entry, i.e. queue entries that are not executed yet
     * (no exec_time), not assigned to a process (no process_id) and have the same page_id and
     * parameters_hash.
     *
     * The time condition depends on the timestamp:
     * - If the timestamp is now or in the past, every matching entry scheduled up to now counts.
     *   With the extension setting "enableTimeslot" enabled, entries scheduled within
     *   100 seconds around the current time count as well.
     * - If the timestamp is in the future, only entries with exactly the same schedule time count.
     *
     * @param int $tstamp Schedule time of the entry to check
     * @param array $fieldArray Queue entry to check; "page_id" and "parameters_hash" are used
     *
     * @return array List of the qid values of the duplicates
     */
    protected function getDuplicateRowsIfExist($tstamp, $fieldArray)
    {
        $rows = [];

        $currentTime = intval($this->getCurrentTime());

        if ($tstamp <= $currentTime) {
            // this entry is scheduled with "now"
            if (!empty($this->extensionSettings['enableTimeslot'])) {
                $timeBegin = $currentTime - 100;
                $timeEnd = $currentTime + 100;
                $where = ' ((scheduled BETWEEN ' . $timeBegin . ' AND ' . $timeEnd . ' ) OR scheduled <= ' . $currentTime . ') ';
            } else {
                $where = 'scheduled <= ' . $currentTime;
            }
        } else {
            // an entry with a timestamp in the future needs to have the same schedule time;
            // the value is cast because it becomes part of the SQL statement
            $where = 'scheduled = ' . intval($tstamp);
        }

        $result = $this->db->exec_SELECTgetRows(
            'qid',
            'tx_crawler_queue',
            $where .
            ' AND NOT exec_time' .
            ' AND NOT process_id ' .
            ' AND page_id=' . intval($fieldArray['page_id']) .
            ' AND parameters_hash = ' . $this->db->fullQuoteStr($fieldArray['parameters_hash'], 'tx_crawler_queue')
        );

        if (is_array($result)) {
            foreach ($result as $value) {
                $rows[] = $value['qid'];
            }
        }

        return $rows;
    }
1259
1260
    /**
     * Returns the current system time as Unix timestamp.
     *
     * All time lookups of this class that go through this method can be
     * replaced in one place (e.g. by overriding it).
     *
     * @return int
     */
    public function getCurrentTime()
    {
        $now = time();

        return $now;
    }
1269
1270
    /************************************
1271
     *
1272
     * URL reading
1273
     *
1274
     ************************************/
1275
1276
    /**
     * Read URL for single queue entry
     *
     * Loads the queue record, marks it as being processed by writing exec_time,
     * executes the request through readUrl_exec() and finally stores the
     * serialized result in the result_data column.
     *
     * @param integer $queueId
     * @param boolean $force If set, will process even if exec_time has been set!
     * @return integer Bit mask: CLI_STATUS_POLLABLE_PROCESSED is set when a pollable
     *                 processing instruction reported success; 0 otherwise, including
     *                 the case that no matching queue entry was found
     */
    public function readUrl($queueId, $force = false)
    {
        $ret = 0;
        if ($this->debugMode) {
            GeneralUtility::devlog('crawler-readurl start ' . microtime(true), __FUNCTION__);
        }

        // Get entry:
        $queueRows = $this->db->exec_SELECTgetRows(
            '*',
            'tx_crawler_queue',
            'qid=' . intval($queueId) . ($force ? '' : ' AND exec_time=0 AND process_scheduled > 0')
        );
        $queueRec = (is_array($queueRows) && isset($queueRows[0])) ? $queueRows[0] : null;

        if (!is_array($queueRec)) {
            // Nothing to process; return the (empty) status mask instead of null
            // so the documented integer return type always holds.
            return $ret;
        }

        $parameters = unserialize($queueRec['parameters']);
        if (is_array($parameters) && !empty($parameters['rootTemplatePid'])) {
            $this->initTSFE((int)$parameters['rootTemplatePid']);
        } else {
            GeneralUtility::sysLog(
                'Page with (' . $queueRec['page_id'] . ') could not be crawled, please check your crawler configuration. Perhaps no Root Template Pid is set',
                'crawler',
                GeneralUtility::SYSLOG_SEVERITY_WARNING
            );
        }

        SignalSlotUtility::emitSignal(
            __CLASS__,
            SignalSlotUtility::SIGNNAL_QUEUEITEM_PREPROCESS,
            [$queueId, &$queueRec]
        );

        // Set exec_time to lock record:
        $field_array = ['exec_time' => $this->getCurrentTime()];

        if (isset($this->processID)) {
            // If multiprocessing is used we need to store the id of the process which has handled this entry
            $field_array['process_id_completed'] = $this->processID;
        }
        $this->db->exec_UPDATEquery('tx_crawler_queue', 'qid=' . intval($queueId), $field_array);

        $result = $this->readUrl_exec($queueRec);

        // readUrl_exec() may return the string 'ERROR' or false instead of an array;
        // only an array can carry a 'content' entry worth unserializing.
        $resultData = (is_array($result) && isset($result['content'])) ? unserialize($result['content']) : false;

        // atm there's no need to point to specific pollable extensions
        $pollSuccess = isset($GLOBALS['TYPO3_CONF_VARS']['EXTCONF']['crawler']['pollSuccess'])
            ? $GLOBALS['TYPO3_CONF_VARS']['EXTCONF']['crawler']['pollSuccess']
            : null;
        $procInstructions = (is_array($resultData) && isset($resultData['parameters']['procInstructions']))
            ? $resultData['parameters']['procInstructions']
            : null;

        if (is_array($pollSuccess) && is_array($procInstructions)) {
            foreach ($pollSuccess as $pollable) {
                // Only check the success value if the instruction is running;
                // it is important to name the pollSuccess key same as the procInstructions key.
                if (in_array($pollable, $procInstructions) && !empty($resultData['success'][$pollable])) {
                    $ret |= self::CLI_STATUS_POLLABLE_PROCESSED;
                }
            }
        }

        // Set result in log which also denotes the end of the processing of this entry.
        $field_array = ['result_data' => serialize($result)];

        SignalSlotUtility::emitSignal(
            __CLASS__,
            SignalSlotUtility::SIGNNAL_QUEUEITEM_POSTPROCESS,
            [$queueId, &$field_array]
        );

        $this->db->exec_UPDATEquery('tx_crawler_queue', 'qid=' . intval($queueId), $field_array);

        if ($this->debugMode) {
            GeneralUtility::devlog('crawler-readurl stop ' . microtime(true), __FUNCTION__);
        }

        return $ret;
    }
1363
1364
    /**
     * Read URL for not-yet-inserted log-entry
     *
     * Inserts the given field array as a new queue record (with exec_time
     * already set), executes it through readUrl_exec() and writes the
     * serialized result back to the record.
     *
     * @param array $field_array Queue field array,
     *
     * @return array|string|bool Whatever readUrl_exec() returned for the new record
     */
    public function readUrlFromArray($field_array)
    {
        // Writing exec_time together with the insert locks the record right away.
        $field_array['exec_time'] = $this->getCurrentTime();
        $this->db->exec_INSERTquery('tx_crawler_queue', $field_array);

        $queueId = $this->db->sql_insert_id();
        $field_array['qid'] = $queueId;

        $result = $this->readUrl_exec($field_array);

        // Storing the result denotes the end of the processing of this entry.
        // The array is handed to the signal by reference, so slots may alter it
        // before it is written.
        $resultFields = ['result_data' => serialize($result)];
        SignalSlotUtility::emitSignal(
            __CLASS__,
            SignalSlotUtility::SIGNNAL_QUEUEITEM_POSTPROCESS,
            [$queueId, &$resultFields]
        );

        $this->db->exec_UPDATEquery('tx_crawler_queue', 'qid=' . intval($queueId), $resultFields);

        return $result;
    }
1394
1395
    /**
     * Read URL for a queue record
     *
     * Unserializes the record's parameters and either delegates to the
     * configured callback object or performs a regular frontend request.
     *
     * @param array $queueRec Queue record
     * @return array|string|bool The string 'ERROR' if the parameters could not be decoded,
     *                           an array with a 'content' key for callback objects,
     *                           otherwise the return value of requestUrl()
     */
    public function readUrl_exec($queueRec)
    {
        // Decode parameters:
        $parameters = unserialize($queueRec['parameters']);
        if (!is_array($parameters)) {
            return 'ERROR';
        }

        // Calling object:
        if (!empty($parameters['_CALLBACKOBJ'])) {
            $objRef = $parameters['_CALLBACKOBJ'];
            // Plain assignment: the result of a function call must not be bound by reference.
            $callBackObj = GeneralUtility::getUserObj($objRef);
            if (!is_object($callBackObj)) {
                return ['content' => 'No object: ' . $objRef];
            }

            unset($parameters['_CALLBACKOBJ']);

            return ['content' => serialize($callBackObj->crawler_execute($parameters, $this))];
        }

        // Regular FE request.
        // The crawler id is the qid plus a hash over qid, set_id and the encryption key.
        $crawlerId = $queueRec['qid'] . ':' . md5($queueRec['qid'] . '|' . $queueRec['set_id'] . '|' . $GLOBALS['TYPO3_CONF_VARS']['SYS']['encryptionKey']);

        // Get result:
        $result = $this->requestUrl($parameters['url'], $crawlerId);

        EventDispatcher::getInstance()->post('urlCrawled', $queueRec['set_id'], ['url' => $parameters['url'], 'result' => $result]);

        return $result;
    }
1430
1431
    /**
     * Gets the content of a URL.
     *
     * Depending on the extension settings the URL is either handed to
     * sendDirectRequest() or fetched with a hand-built HTTP/1.0 request over
     * fsockopen() (optionally through the configured curl proxy server).
     * If "follow30x" is enabled, redirects are followed recursively and the
     * redirecting response is kept under the 'parentRequest' key.
     *
     * @param string $originalUrl URL to read
     * @param string $crawlerId Crawler ID string (qid + hash to verify)
     * @param integer $timeout Timeout time
     * @param integer $recursion Recursion limiter for 302 redirects
     * @return array|bool Array with 'request', 'headers' and 'content' (plus 'parentRequest'
     *                    after a redirect) for socket requests, the return value of
     *                    sendDirectRequest() for direct requests, or false on failure
     */
    public function requestUrl($originalUrl, $crawlerId, $timeout = 2, $recursion = 10)
    {
        if (!$recursion) {
            return false;
        }

        // Parse URL, checking for scheme:
        $url = parse_url($originalUrl);

        if ($url === false) {
            if (TYPO3_DLOG) {
                GeneralUtility::devLog(sprintf('Could not parse_url() for string "%s"', $originalUrl), 'crawler', 4, ['crawlerId' => $crawlerId]);
            }
            return false;
        }

        // parse_url() omits components that are not present in the URL.
        $scheme = isset($url['scheme']) ? $url['scheme'] : '';

        if (!in_array($scheme, ['', 'http', 'https'], true)) {
            if (TYPO3_DLOG) {
                GeneralUtility::devLog(sprintf('Scheme does not match for url "%s"', $originalUrl), 'crawler', 4, ['crawlerId' => $crawlerId]);
            }
            return false;
        }

        // direct request
        if ($this->extensionSettings['makeDirectRequests']) {
            return $this->sendDirectRequest($originalUrl, $crawlerId);
        }

        $reqHeaders = $this->buildRequestHeaderArray($url, $crawlerId);

        // thanks to Pierrick Caillon for adding proxy support
        $rurl = $url;

        if ($GLOBALS['TYPO3_CONF_VARS']['SYS']['curlUse'] && $GLOBALS['TYPO3_CONF_VARS']['SYS']['curlProxyServer']) {
            // Connect to the proxy and request the absolute URL from it.
            $rurl = parse_url($GLOBALS['TYPO3_CONF_VARS']['SYS']['curlProxyServer']);
            $url['path'] = $scheme . '://' . $url['host'] . (!empty($url['port']) ? ':' . $url['port'] : '') . $url['path'];
            $reqHeaders = $this->buildRequestHeaderArray($url, $crawlerId);
        }

        $host = $rurl['host'];

        if ($scheme === 'https') {
            $host = 'ssl://' . $host;
            $port = !empty($rurl['port']) ? $rurl['port'] : 443;
        } else {
            $port = !empty($rurl['port']) ? $rurl['port'] : 80;
        }

        $startTime = microtime(true);
        $fp = fsockopen($host, $port, $errno, $errstr, $timeout);

        if (!$fp) {
            if (TYPO3_DLOG) {
                GeneralUtility::devLog(sprintf('Error while opening "%s"', $originalUrl), 'crawler', 4, ['crawlerId' => $crawlerId]);
            }
            return false;
        }

        // Request message:
        $msg = implode("\r\n", $reqHeaders) . "\r\n\r\n";
        fputs($fp, $msg);

        // Read response:
        $d = $this->getHttpResponseFromStream($fp);
        fclose($fp);

        $time = microtime(true) - $startTime;
        $this->log($originalUrl . ' ' . $time);

        // Implode content and headers:
        $result = [
            'request' => $msg,
            'headers' => implode('', $d['headers']),
            'content' => implode('', (array)$d['content'])
        ];

        if (($this->extensionSettings['follow30x']) && ($newUrl = $this->getRequestUrlFrom302Header($d['headers'], isset($url['user']) ? $url['user'] : '', isset($url['pass']) ? $url['pass'] : ''))) {
            // Follow the redirect exactly once per hop, passing the timeout through
            // and decrementing the recursion limiter.
            $newRequestUrl = $this->requestUrl($newUrl, $crawlerId, $timeout, --$recursion);

            if (is_array($newRequestUrl)) {
                $result = array_merge(['parentRequest' => $result], $newRequestUrl);
            } else {
                if (TYPO3_DLOG) {
                    GeneralUtility::devLog(sprintf('Error while opening "%s"', $newUrl), 'crawler', 4, ['crawlerId' => $crawlerId]);
                }
                return false;
            }
        }

        return $result;
    }
1533
1534
    /**
     * Determines the base path of the website frontend.
     * (e.g. if http://mydomain.com/cms/index.php is called in the browser,
     * the base path is "/cms/")
     *
     * Sources are consulted in this order: the "frontendBasePath" extension
     * setting, the absRefPrefix of $GLOBALS['TSFE'], and (outside of CLI mode)
     * the TYPO3_SITE_PATH environment value. If none applies, "/" is used.
     *
     * @return string Base path of the website frontend
     */
    protected function getFrontendBasePath()
    {
        $isCli = defined('TYPO3_REQUESTTYPE_CLI') && TYPO3_REQUESTTYPE_CLI;

        if (!empty($this->extensionSettings['frontendBasePath'])) {
            // Explicitly configured in the extension settings.
            $basePath = $this->extensionSettings['frontendBasePath'];
        } elseif (!empty($GLOBALS['TSFE']->absRefPrefix)) {
            // Fall back to config.absRefPrefix.
            $basePath = $GLOBALS['TSFE']->absRefPrefix;
        } elseif (!$isCli) {
            // Outside of CLI mode the path can be taken from the server environment.
            $basePath = GeneralUtility::getIndpEnv('TYPO3_SITE_PATH');
        } else {
            $basePath = '/';
        }

        if ($basePath == '/') {
            return $basePath;
        }

        // Base path must be '/<pathSegments>/':
        return rtrim('/' . ltrim($basePath, '/'), '/') . '/';
    }
1564
1565
    /**
     * Runs a shell command via shell_exec() and hands back its output.
     *
     * The command string is passed to the shell unchanged; no escaping is
     * applied here.
     *
     * @param string $command Shell command to be executed
     * @return string Outputted result of the command execution (as returned by shell_exec())
     */
    protected function executeShellCommand($command)
    {
        return shell_exec($command);
    }
1576
1577
    /**
     * Reads HTTP response from the given stream.
     *
     * Header lines are read (and trimmed) up to the first empty line; everything
     * after that is collected unchanged as content. If the argument is not a
     * resource, empty header and content lists are returned.
     *
     * @param  resource $streamPointer  Pointer to connection stream.
     * @return array                    Associative array with the following items:
     *                                  headers <array> Response headers sent by server.
     *                                  content <array> Content, with each line as an array item.
     */
    protected function getHttpResponseFromStream($streamPointer)
    {
        $response = ['headers' => [], 'content' => []];

        if (!is_resource($streamPointer)) {
            return $response;
        }

        // Read headers. Compare against false explicitly: a chunk consisting only
        // of "0" is falsy and would otherwise end the loop prematurely.
        while (($line = fgets($streamPointer, 2048)) !== false) {
            $line = trim($line);
            if ($line === '') {
                // An empty line separates headers from content.
                break;
            }
            $response['headers'][] = $line;
        }

        // Read content
        while (($line = fgets($streamPointer, 2048)) !== false) {
            $response['content'][] = $line;
        }

        return $response;
    }
1608
1609
    /**
     * Appends a timestamped line to the log file named in the
     * "logFileName" extension setting. Does nothing if no file is configured.
     *
     * @param string $message Text to append after the timestamp
     * @return void
     */
    protected function log($message)
    {
        if (empty($this->extensionSettings['logFileName'])) {
            return;
        }

        // Write errors are suppressed here and reported through devLog below instead.
        $fileResult = @file_put_contents(
            $this->extensionSettings['logFileName'],
            date('Ymd His') . ' ' . $message . PHP_EOL,
            FILE_APPEND
        );

        if (!$fileResult) {
            GeneralUtility::devLog('File "' . $this->extensionSettings['logFileName'] . '" could not be written, please check file permissions.', 'crawler', LogLevel::INFO);
        }
    }
1621
1622
    /**
     * Builds HTTP request headers.
     *
     * Produces the request line and header lines of an HTTP/1.0 GET request.
     * Optional URL components (path, query, user, pass) may be absent from
     * $url; a missing or empty path is requested as "/".
     *
     * @param array $url URL components in the format returned by parse_url()
     * @param string $crawlerId Value sent in the X-T3crawler header
     *
     * @return array List of header lines (without line endings), request line first
     */
    protected function buildRequestHeaderArray(array $url, $crawlerId)
    {
        // parse_url() leaves out components that are not part of the URL,
        // so every optional part needs a default.
        $path = (isset($url['path']) && $url['path'] !== '') ? $url['path'] : '/';
        $query = isset($url['query']) ? (string)$url['query'] : '';
        $host = isset($url['host']) ? $url['host'] : '';
        $user = isset($url['user']) ? (string)$url['user'] : '';
        $pass = isset($url['pass']) ? (string)$url['pass'] : '';

        $reqHeaders = [];
        // Compare against '' so that a query string of "0" is not dropped.
        $reqHeaders[] = 'GET ' . $path . ($query !== '' ? '?' . $query : '') . ' HTTP/1.0';
        $reqHeaders[] = 'Host: ' . $host;
        if (stripos($query, 'ADMCMD_previewWS') !== false) {
            $reqHeaders[] = 'Cookie: $Version="1"; be_typo_user="1"; $Path=/';
        }
        $reqHeaders[] = 'Connection: close';
        if ($user !== '') {
            $reqHeaders[] = 'Authorization: Basic ' . base64_encode($user . ':' . $pass);
        }
        $reqHeaders[] = 'X-T3crawler: ' . $crawlerId;
        $reqHeaders[] = 'User-Agent: TYPO3 crawler';

        return $reqHeaders;
    }
1646
1647
    /**
     * Check if the submitted HTTP-Header contains a redirect location and built new crawler-url
     *
     * Only responses whose first header line contains "301 Moved", "302 Found"
     * or "302 Moved" are treated as redirects. If HTTP auth credentials are
     * given, they are re-inserted into the redirect target.
     *
     * @param array $headers HTTP Header
     * @param string $user HTTP Auth. User
     * @param string $pass HTTP Auth. Password
     * @return bool|string Redirect URL, or false if there is no usable redirect
     */
    protected function getRequestUrlFrom302Header($headers, $user = '', $pass = '')
    {
        if (!is_array($headers) || !isset($headers[0])) {
            return false;
        }

        $statusLine = (string)$headers[0];
        $isRedirect = stristr($statusLine, '301 Moved') !== false
            || stristr($statusLine, '302 Found') !== false
            || stristr($statusLine, '302 Moved') !== false;
        if (!$isRedirect) {
            return false;
        }

        // Find the first Location header. Split on the first colon only, so values
        // containing ": " stay intact and lines without a colon (such as the status
        // line) are skipped. Header names are matched case-insensitively.
        $location = null;
        foreach ($headers as $hl) {
            $tmp = explode(':', $hl, 2);
            if (count($tmp) === 2 && strcasecmp(trim($tmp[0]), 'Location') === 0) {
                $location = trim($tmp[1]);
                break;
            }
        }
        if ($location === null) {
            return false;
        }

        if ($user == '') {
            return $location;
        }

        // Re-insert the credentials into the redirect target.
        if (!($tmp = parse_url($location))) {
            return false;
        }

        $newUrl = (isset($tmp['scheme']) ? $tmp['scheme'] : '') . '://' . $user . ':' . $pass . '@'
            . (isset($tmp['host']) ? $tmp['host'] : '')
            // Keep an explicit port; otherwise the redirect would hit the default port.
            . (!empty($tmp['port']) ? ':' . $tmp['port'] : '')
            . (isset($tmp['path']) ? $tmp['path'] : '');
        if (isset($tmp['query']) && $tmp['query'] != '') {
            $newUrl .= '?' . $tmp['query'];
        }

        return $newUrl;
    }
1689
1690
    /**************************
1691
     *
1692
     * tslib_fe hooks:
1693
     *
1694
     **************************/
1695
1696
    /**
     * Initialization hook (called after database connection)
     * Takes the "HTTP_X_T3CRAWLER" header and looks up queue record and verifies if the session comes from the system (by comparing hashes)
     *
     * The header has the form "<qid>:<hash>". The hash is compared with an md5
     * over qid, set_id and the encryption key. If the header is present but no
     * matching queue record is found, the request is terminated via die().
     *
     * @param array $params Parameters from frontend
     * @param object $ref TSFE object (reference under PHP5); not used by this method
     * @return void
     *
     * FIXME: Look like this is not used, in commit 9910d3f40cce15f4e9b7bcd0488bf21f31d53ebc it's added as public,
     * FIXME: I think this can be removed. (TNM)
     */
    public function fe_init(&$params, $ref)
    {
        // Authenticate crawler request:
        if (!isset($_SERVER['HTTP_X_T3CRAWLER'])) {
            return;
        }

        // Pad so that a header without ':' still yields two values.
        list($queueId, $hash) = array_pad(explode(':', $_SERVER['HTTP_X_T3CRAWLER']), 2, '');

        // exec_SELECTgetSingleRow() hands back the row itself, so it must not be
        // unpacked with list() (that would read a non-existent index 0).
        $queueRec = $this->db->exec_SELECTgetSingleRow('*', 'tx_crawler_queue', 'qid=' . intval($queueId));

        // If a crawler record was found and hash was matching, set it up:
        if (is_array($queueRec) && $hash === md5($queueRec['qid'] . '|' . $queueRec['set_id'] . '|' . $GLOBALS['TYPO3_CONF_VARS']['SYS']['encryptionKey'])) {
            $params['pObj']->applicationData['tx_crawler']['running'] = true;
            $params['pObj']->applicationData['tx_crawler']['parameters'] = unserialize($queueRec['parameters']);
            $params['pObj']->applicationData['tx_crawler']['log'] = [];
        } else {
            die('No crawler entry found!');
        }
    }
1724
1725
    /*****************************
1726
     *
1727
     * Compiling URLs to crawl - tools
1728
     *
1729
     *****************************/
1730
1731
    /**
     * Collects the page tree below (and including) the given page and builds the
     * URL rows for every page in it via drawURLs_addRowsForPage().
     *
     * The arguments are stored on the controller before the tree is traversed,
     * and the duplicateTrack and downloadUrls properties are reset. Pages with
     * doktype 7 are treated as mount points: the mounted tree is processed as well.
     *
     * @param integer $id Root page id to start from.
     * @param integer $depth Depth of tree, 0=only id-page, 1= on sublevel, 99 = infinite
     * @param integer $scheduledTime Unix Time when the URL is timed to be visited when put in queue
     * @param integer $reqMinute Number of requests per minute (creates the interleave between requests)
     * @param boolean $submitCrawlUrls If set, submits the URLs to queue in database (real crawling)
     * @param boolean $downloadCrawlUrls If set (and submitcrawlUrls is false) will fill $downloadUrls with entries)
     * @param array $incomingProcInstructions Array of processing instructions
     * @param array $configurationSelection Array of configuration keys
     * @return string Concatenated output of drawURLs_addRowsForPage() for all pages
     */
    public function getPageTreeAndUrls(
        $id,
        $depth,
        $scheduledTime,
        $reqMinute,
        $submitCrawlUrls,
        $downloadCrawlUrls,
        array $incomingProcInstructions,
        array $configurationSelection
    ) {
        global $LANG;
        if (!is_object($LANG)) {
            $LANG = GeneralUtility::makeInstance('language');
            $LANG->init(0);
        }
        $this->scheduledTime = $scheduledTime;
        $this->reqMinute = $reqMinute;
        $this->submitCrawlUrls = $submitCrawlUrls;
        $this->downloadCrawlUrls = $downloadCrawlUrls;
        $this->incomingProcInstructions = $incomingProcInstructions;
        $this->incomingConfigurationSelection = $configurationSelection;

        $this->duplicateTrack = [];
        $this->downloadUrls = [];

        // Drawing tree:
        /* @var PageTreeView $tree */
        $tree = GeneralUtility::makeInstance(PageTreeView::class);
        $perms_clause = $GLOBALS['BE_USER']->getPagePermsClause(1);
        $tree->init('AND ' . $perms_clause);

        $pageInfo = BackendUtility::readPageAccess($id, $perms_clause);
        if (is_array($pageInfo)) {
            // Set root row:
            $tree->tree[] = [
                'row' => $pageInfo,
                'HTML' => IconUtility::getIconForRecord('pages', $pageInfo)
            ];
        }

        // Get branch beneath:
        if ($depth) {
            $tree->getTree($id, $depth, '');
        }

        // Traverse page tree:
        $code = '';

        foreach ($tree->tree as $data) {
            $this->MP = false;

            // recognize mount points
            if ($data['row']['doktype'] == 7) {
                // Cast the uid like the other queries in this class do, instead of
                // concatenating the raw value into the SQL string.
                $mountpage = $this->db->exec_SELECTgetRows('*', 'pages', 'uid = ' . intval($data['row']['uid']));

                // fetch mounted pages
                $this->MP = $mountpage[0]['mount_pid'] . '-' . $data['row']['uid'];

                $mountTree = GeneralUtility::makeInstance(PageTreeView::class);
                $mountTree->init('AND ' . $perms_clause);
                $mountTree->getTree($mountpage[0]['mount_pid'], $depth, '');

                foreach ($mountTree->tree as $mountData) {
                    $code .= $this->drawURLs_addRowsForPage(
                        $mountData['row'],
                        $mountData['HTML'] . BackendUtility::getRecordTitle('pages', $mountData['row'], true)
                    );
                }

                // replace page when mount_pid_ol is enabled
                if ($mountpage[0]['mount_pid_ol']) {
                    $data['row']['uid'] = $mountpage[0]['mount_pid'];
                } else {
                    // if the mount_pid_ol is not set the MP must not be used for the mountpoint page
                    $this->MP = false;
                }
            }

            $code .= $this->drawURLs_addRowsForPage(
                $data['row'],
                $data['HTML'] . BackendUtility::getRecordTitle('pages', $data['row'], true)
            );
        }

        return $code;
    }
1829
1830
    /**
     * Expands exclude string
     *
     * The exclude string is a comma separated list of entries of the form
     * "<pid>" (page only) or "<pid>+<depth>" (page plus subpages down to the
     * given depth). An entry with a "+" but an empty depth is expanded with
     * depth 99. Results are cached in static variables per exclude string.
     *
     * @param string $excludeString Exclude string
     * @return array Unique list of page ids covered by the exclude string
     */
    public function expandExcludeString($excludeString)
    {
        // internal static caches;
        static $expandedExcludeStringCache = [];
        static $treeCache = [];

        $cacheKey = (string)$excludeString;

        // isset() instead of empty(): an expansion that yielded no pids is a valid
        // cached result as well and does not need to be recomputed.
        if (isset($expandedExcludeStringCache[$cacheKey])) {
            return $expandedExcludeStringCache[$cacheKey];
        }

        $pidList = [];

        if (!empty($excludeString)) {
            /** @var PageTreeView $tree */
            $tree = GeneralUtility::makeInstance(PageTreeView::class);
            $tree->init('AND ' . $this->backendUser->getPagePermsClause(1));

            $excludeParts = GeneralUtility::trimExplode(',', $excludeString);

            foreach ($excludeParts as $excludePart) {
                // Entries without "+" consist of one part only; pad so that
                // $depth is always defined.
                list($pid, $depth) = array_pad(GeneralUtility::trimExplode('+', $excludePart), 2, '');

                // default is "page only" = "depth=0"
                // NOTE(review): an explicit "+0" is empty() too and therefore expands
                // to depth 99 - behavior kept as is, confirm whether this is intended.
                if (empty($depth)) {
                    $depth = (stristr($excludePart, '+')) ? 99 : 0;
                }

                $pidList[] = $pid;

                if ($depth > 0) {
                    if (!isset($treeCache[$pid][$depth])) {
                        $tree->reset();
                        $tree->getTree($pid, $depth);
                        $treeCache[$pid][$depth] = $tree->tree;
                    }

                    foreach ($treeCache[$pid][$depth] as $data) {
                        $pidList[] = $data['row']['uid'];
                    }
                }
            }
        }

        $expandedExcludeStringCache[$cacheKey] = array_unique($pidList);

        return $expandedExcludeStringCache[$cacheKey];
    }
1881
1882
    /**
1883
     * Create the rows for display of the page tree
1884
     * For each page a number of rows are shown displaying GET variable configuration
1885
     *
1886
     * @param    array        Page row
1887
     * @param    string        Page icon and title for row
1888
     * @return    string        HTML <tr> content (one or more)
1889
     */
1890
    public function drawURLs_addRowsForPage(array $pageRow, $pageTitleAndIcon)
1891
    {
1892
        $skipMessage = '';
1893
1894
        // Get list of configurations
1895
        $configurations = $this->getUrlsForPageRow($pageRow, $skipMessage);
1896
1897
        if (count($this->incomingConfigurationSelection) > 0) {
1898
            // remove configuration that does not match the current selection
1899
            foreach ($configurations as $confKey => $confArray) {
1900
                if (!in_array($confKey, $this->incomingConfigurationSelection)) {
1901
                    unset($configurations[$confKey]);
1902
                }
1903
            }
1904
        }
1905
1906
        // Traverse parameter combinations:
1907
        $c = 0;
1908
        $content = '';
1909
        if (count($configurations)) {
1910
            foreach ($configurations as $confKey => $confArray) {
1911
1912
                    // Title column:
1913
                if (!$c) {
1914
                    $titleClm = '<td rowspan="' . count($configurations) . '">' . $pageTitleAndIcon . '</td>';
1915
                } else {
1916
                    $titleClm = '';
1917
                }
1918
1919
                if (!in_array($pageRow['uid'], $this->expandExcludeString($confArray['subCfg']['exclude']))) {
1920
1921
                        // URL list:
1922
                    $urlList = $this->urlListFromUrlArray(
1923
                        $confArray,
1924
                        $pageRow,
1925
                        $this->scheduledTime,
1926
                        $this->reqMinute,
1927
                        $this->submitCrawlUrls,
1928
                        $this->downloadCrawlUrls,
1929
                        $this->duplicateTrack,
1930
                        $this->downloadUrls,
1931
                        $this->incomingProcInstructions // if empty the urls won't be filtered by processing instructions
1932
                    );
1933
1934
                    // Expanded parameters:
1935
                    $paramExpanded = '';
1936
                    $calcAccu = [];
1937
                    $calcRes = 1;
1938
                    foreach ($confArray['paramExpanded'] as $gVar => $gVal) {
1939
                        $paramExpanded .= '
1940
                            <tr>
1941
                                <td class="bgColor4-20">' . htmlspecialchars('&' . $gVar . '=') . '<br/>' .
1942
                                                '(' . count($gVal) . ')' .
1943
                                                '</td>
1944
                                <td class="bgColor4" nowrap="nowrap">' . nl2br(htmlspecialchars(implode(chr(10), $gVal))) . '</td>
1945
                            </tr>
1946
                        ';
1947
                        $calcRes *= count($gVal);
1948
                        $calcAccu[] = count($gVal);
1949
                    }
1950
                    $paramExpanded = '<table class="lrPadding c-list param-expanded">' . $paramExpanded . '</table>';
1951
                    $paramExpanded .= 'Comb: ' . implode('*', $calcAccu) . '=' . $calcRes;
1952
1953
                    // Options
1954
                    $optionValues = '';
1955
                    if ($confArray['subCfg']['userGroups']) {
1956
                        $optionValues .= 'User Groups: ' . $confArray['subCfg']['userGroups'] . '<br/>';
1957
                    }
1958
                    if ($confArray['subCfg']['baseUrl']) {
1959
                        $optionValues .= 'Base Url: ' . $confArray['subCfg']['baseUrl'] . '<br/>';
1960
                    }
1961
                    if ($confArray['subCfg']['procInstrFilter']) {
1962
                        $optionValues .= 'ProcInstr: ' . $confArray['subCfg']['procInstrFilter'] . '<br/>';
1963
                    }
1964
1965
                    // Compile row:
1966
                    $content .= '
1967
                        <tr class="bgColor' . ($c % 2 ? '-20' : '-10') . '">
1968
                            ' . $titleClm . '
1969
                            <td>' . htmlspecialchars($confKey) . '</td>
1970
                            <td>' . nl2br(htmlspecialchars(rawurldecode(trim(str_replace('&', chr(10) . '&', GeneralUtility::implodeArrayForUrl('', $confArray['paramParsed'])))))) . '</td>
1971
                            <td>' . $paramExpanded . '</td>
1972
                            <td nowrap="nowrap">' . $urlList . '</td>
1973
                            <td nowrap="nowrap">' . $optionValues . '</td>
1974
                            <td nowrap="nowrap">' . DebugUtility::viewArray($confArray['subCfg']['procInstrParams.']) . '</td>
1975
                        </tr>';
1976
                } else {
1977
                    $content .= '<tr class="bgColor' . ($c % 2 ? '-20' : '-10') . '">
1978
                            ' . $titleClm . '
1979
                            <td>' . htmlspecialchars($confKey) . '</td>
1980
                            <td colspan="5"><em>No entries</em> (Page is excluded in this configuration)</td>
1981
                        </tr>';
1982
                }
1983
1984
                $c++;
1985
            }
1986
        } else {
1987
            $message = !empty($skipMessage) ? ' (' . $skipMessage . ')' : '';
1988
1989
            // Compile row:
1990
            $content .= '
1991
                <tr class="bgColor-20" style="border-bottom: 1px solid black;">
1992
                    <td>' . $pageTitleAndIcon . '</td>
1993
                    <td colspan="6"><em>No entries</em>' . $message . '</td>
1994
                </tr>';
1995
        }
1996
1997
        return $content;
1998
    }
1999
2000
    /**
     * Counts the queue entries that are due but have neither been executed nor
     * been assigned to a process.
     *
     * An entry is counted when exec_time and process_scheduled are both 0 and its
     * scheduled timestamp is not later than the current time.
     *
     * @return int Number of matching rows in tx_crawler_queue; 0 when no result row could be fetched
     */
    public function getUnprocessedItemsCount()
    {
        $res = $this->db->exec_SELECTquery(
            'count(*) as num',
            'tx_crawler_queue',
            'exec_time=0 AND process_scheduled=0 AND scheduled<=' . $this->getCurrentTime()
        );

        $count = $this->db->sql_fetch_assoc($res);

        // Cast explicitly so callers really get the documented int, also when no row was fetched
        return is_array($count) ? (int)$count['num'] : 0;
    }
2014
2015
    /*****************************
2016
     *
2017
     * CLI functions
2018
     *
2019
     *****************************/
2020
2021
    /**
     * Main function for running from Command Line PHP script (cron job)
     * See ext/crawler/cli/crawler_cli.phpsh for details
     *
     * Prints the help text and terminates the script when -h or --help is given.
     * Otherwise, unless the crawler is disabled or no process could be acquired,
     * one batch is processed through CLI_run(), empty process records are removed
     * and the own process is released again.
     *
     * @return int CLI_STATUS_* flags combined with bitwise OR
     */
    public function CLI_main()
    {
        $this->setAccessMode('cli');
        $result = self::CLI_STATUS_NOTHING_PROCCESSED;
        $cliObj = GeneralUtility::makeInstance(CrawlerCommandLineController::class);

        if (isset($cliObj->cli_args['-h']) || isset($cliObj->cli_args['--help'])) {
            $cliObj->cli_validateArgs();
            $cliObj->cli_help();
            exit;
        }

        if (!$this->getDisabled() && $this->CLI_checkAndAcquireNewProcess($this->CLI_buildProcessId())) {
            // Command line values win over the extension settings
            $countInARun = $cliObj->cli_argValue('--countInARun') ? intval($cliObj->cli_argValue('--countInARun')) : $this->extensionSettings['countInARun'];
            // Seconds
            $sleepAfterFinish = $cliObj->cli_argValue('--sleepAfterFinish') ? intval($cliObj->cli_argValue('--sleepAfterFinish')) : $this->extensionSettings['sleepAfterFinish'];
            // Milliseconds
            $sleepTime = $cliObj->cli_argValue('--sleepTime') ? intval($cliObj->cli_argValue('--sleepTime')) : $this->extensionSettings['sleepTime'];

            try {
                // Run process:
                $result = $this->CLI_run($countInARun, $sleepTime, $sleepAfterFinish);
            } catch (\Exception $e) {
                $this->CLI_debug(get_class($e) . ': ' . $e->getMessage());
                $result = self::CLI_STATUS_ABORTED;
            }

            // Cleanup
            $this->db->exec_DELETEquery('tx_crawler_process', 'assigned_items_count = 0');

            //TODO can't we do that in a clean way?
            $this->CLI_releaseProcesses($this->CLI_buildProcessId());

            // Query the remaining items once so that the debug output and the status flag agree
            $remainingItems = $this->getUnprocessedItemsCount();
            $this->CLI_debug("Unprocessed Items remaining:" . $remainingItems . " (" . $this->CLI_buildProcessId() . ")");
            $result |= ($remainingItems > 0 ? self::CLI_STATUS_REMAIN : self::CLI_STATUS_NOTHING_PROCCESSED);
        } else {
            $result |= self::CLI_STATUS_ABORTED;
        }

        return $result;
    }
2068
2069
    /**
     * Function executed by crawler_im.php cli script.
     *
     * Reads the start page id and the options -o, -d, -n, -proc and -conf from the
     * command line and hands them to getPageTreeAndUrls(). Depending on -o the
     * result is then reported as follows:
     * - "url":   the collected download URLs are printed
     * - "exec":  the registered queue entries are requested right away and the outcome is printed
     * - "queue": the URLs that were put into the queue are printed
     * - other:   the URLs found are printed together with a hint to use -o
     *
     * Prints the help text and terminates the script when no page id argument is given.
     *
     * @return void
     */
    public function CLI_main_im()
    {
        $this->setAccessMode('cli_im');

        $cliObj = GeneralUtility::makeInstance(QueueCommandLineController::class);

        // Force user to admin state and set workspace to "Live":
        $this->backendUser->user['admin'] = 1;
        $this->backendUser->setWorkspace(0);

        // Print help
        if (!isset($cliObj->cli_args['_DEFAULT'][1])) {
            $cliObj->cli_validateArgs();
            $cliObj->cli_help();
            exit;
        }

        $cliObj->cli_validateArgs();

        // Read the output mode once; every decision below depends on the same value
        $mode = $cliObj->cli_argValue('-o');
        $submitToQueue = ($mode === 'queue' || $mode === 'exec');

        if ($mode === 'exec') {
            $this->registerQueueEntriesInternallyOnly = true;
        }

        if (isset($cliObj->cli_args['_DEFAULT'][2])) {
            // Crawler is called over TYPO3 BE
            $pageId = MathUtility::forceIntegerInRange($cliObj->cli_args['_DEFAULT'][2], 0);
        } else {
            // Crawler is called over cli
            $pageId = MathUtility::forceIntegerInRange($cliObj->cli_args['_DEFAULT'][1], 0);
        }

        $configurationKeys = $this->getConfigurationKeys($cliObj);

        // Fallback for the case that getConfigurationKeys() does not deliver an array
        if (!is_array($configurationKeys)) {
            $configurations = $this->getUrlsForPageId($pageId);
            if (is_array($configurations)) {
                $configurationKeys = array_keys($configurations);
            } else {
                $configurationKeys = [];
            }
        }

        if ($submitToQueue) {
            $reason = new Reason();
            $reason->setReason(Reason::REASON_GUI_SUBMIT);
            $reason->setDetailText('The cli script of the crawler added to the queue');
            EventDispatcher::getInstance()->post(
                'invokeQueueChange',
                $this->setID,
                ['reason' => $reason]
            );
        }

        if ($this->extensionSettings['cleanUpOldQueueEntries']) {
            $this->cleanUpOldQueueEntries();
        }

        $this->setID = (int) GeneralUtility::md5int(microtime());
        $this->getPageTreeAndUrls(
            $pageId,
            MathUtility::forceIntegerInRange($cliObj->cli_argValue('-d'), 0, 99),
            $this->getCurrentTime(),
            MathUtility::forceIntegerInRange($cliObj->cli_isArg('-n') ? $cliObj->cli_argValue('-n') : 30, 1, 1000),
            $submitToQueue,
            $mode === 'url',
            GeneralUtility::trimExplode(',', $cliObj->cli_argValue('-proc'), true),
            $configurationKeys
        );

        if ($mode === 'url') {
            $cliObj->cli_echo(implode(chr(10), $this->downloadUrls) . chr(10), true);
        } elseif ($mode === 'exec') {
            $cliObj->cli_echo("Executing " . count($this->urlList) . " requests right away:\n\n");
            $cliObj->cli_echo(implode(chr(10), $this->urlList) . chr(10));
            $cliObj->cli_echo("\nProcessing:\n");

            foreach ($this->queueEntries as $queueRec) {
                // unserialize() yields false for broken data, so verify the structure before using it
                $p = unserialize($queueRec['parameters']);
                $url = (is_array($p) && isset($p['url'])) ? $p['url'] : '';
                $procInstructions = (is_array($p) && isset($p['procInstructions']) && is_array($p['procInstructions']))
                    ? $p['procInstructions']
                    : [];
                $cliObj->cli_echo($url . ' (' . implode(',', $procInstructions) . ') => ');

                $result = $this->readUrlFromArray($queueRec);

                // NOTE(review): the content comes back from readUrlFromArray(); if it can contain data
                // from a response that is not fully trusted, unserialize() is unsafe here - confirm.
                $requestResult = unserialize($result['content']);
                if (is_array($requestResult)) {
                    $resLog = (isset($requestResult['log']) && is_array($requestResult['log']))
                        ? chr(10) . chr(9) . chr(9) . implode(chr(10) . chr(9) . chr(9), $requestResult['log'])
                        : '';
                    $cliObj->cli_echo('OK: ' . $resLog . chr(10));
                } else {
                    $cliObj->cli_echo('Error checking Crawler Result: ' . substr(preg_replace('/\s+/', ' ', strip_tags($result['content'])), 0, 30000) . '...' . chr(10));
                }
            }
        } elseif ($mode === 'queue') {
            $cliObj->cli_echo("Putting " . count($this->urlList) . " entries in queue:\n\n");
            $cliObj->cli_echo(implode(chr(10), $this->urlList) . chr(10));
        } else {
            $cliObj->cli_echo(count($this->urlList) . " entries found for processing. (Use -o to decide action):\n\n", true);
            $cliObj->cli_echo(implode(chr(10), $this->urlList) . chr(10), true);
        }
    }
2172
2173
    /**
     * CLI entry point for flushing entries, selected by page id and the -o mode.
     *
     * With -o "all" every entry is passed to getLogEntriesForPageId() for flushing,
     * with "finished" or "pending" only entries of that kind. Any other mode
     * prints the help text. A page id of 0 requests a full flush.
     * Prints the help text and terminates the script when no page id argument is given.
     *
     * @return bool false when the mode is not supported or getLogEntriesForPageId() returned false
     */
    public function CLI_main_flush()
    {
        $this->setAccessMode('cli_flush');
        $cliObj = GeneralUtility::makeInstance(FlushCommandLineController::class);

        // Act as admin in the "Live" workspace
        $this->backendUser->user['admin'] = 1;
        $this->backendUser->setWorkspace(0);

        // Without a page id there is nothing to do but to show the help
        if (!isset($cliObj->cli_args['_DEFAULT'][1])) {
            $cliObj->cli_validateArgs();
            $cliObj->cli_help();
            exit;
        }

        $cliObj->cli_validateArgs();
        $pageId = MathUtility::forceIntegerInRange($cliObj->cli_args['_DEFAULT'][1], 0);
        $fullFlush = ($pageId == 0);

        $mode = $cliObj->cli_argValue('-o');

        if ($mode == 'all') {
            $flushed = $this->getLogEntriesForPageId($pageId, '', true, $fullFlush);
        } elseif ($mode == 'finished' || $mode == 'pending') {
            $flushed = $this->getLogEntriesForPageId($pageId, $mode, true, $fullFlush);
        } else {
            $cliObj->cli_validateArgs();
            $cliObj->cli_help();
            $flushed = false;
        }

        return $flushed !== false;
    }
2216
2217
    /**
     * Obtains configuration keys from the CLI arguments
     *
     * The value of the -conf argument is split at commas and each part is trimmed.
     *
     * @param  QueueCommandLineController $cliObj    Command line object
     * @return array                        List of keys, empty array if -conf is missing or blank
     */
    protected function getConfigurationKeys(QueueCommandLineController &$cliObj)
    {
        $rawKeys = trim($cliObj->cli_argValue('-conf'));

        if ($rawKeys == '') {
            return [];
        }

        return GeneralUtility::trimExplode(',', $rawKeys);
    }
2228
2229
    /**
     * Running the functionality of the CLI (crawling URLs from queue)
     *
     * Runs the CLI hooks, purges executed queue entries older than the
     * "purgeQueueDays" setting, reserves up to $countInARun due queue entries for
     * the current process and reads them one by one.
     *
     * @param int $countInARun      Maximum number of queue entries selected for this run
     * @param int $sleepTime        Pause after each processed entry; the value is passed to usleep()
     * @param int $sleepAfterFinish Pause after the whole batch; the value is passed to sleep()
     * @return int CLI_STATUS_* flags combined with bitwise OR
     */
    public function CLI_run($countInARun, $sleepTime, $sleepAfterFinish)
    {
        $result = 0;
        $counter = 0;

        // First, run hooks:
        $this->CLI_runHooks();

        // Clean up the queue
        if (intval($this->extensionSettings['purgeQueueDays']) > 0) {
            $purgeDate = $this->getCurrentTime() - 24 * 60 * 60 * intval($this->extensionSettings['purgeQueueDays']);
            $del = $this->db->exec_DELETEquery(
                'tx_crawler_queue',
                'exec_time!=0 AND exec_time<' . $purgeDate
            );
            if (false == $del) {
                GeneralUtility::devLog('Records could not be deleted.', 'crawler', LogLevel::INFO);
            }
        }

        // Select entries:
        //TODO Shouldn't this reside within the transaction?
        $rows = $this->db->exec_SELECTgetRows(
            'qid,scheduled',
            'tx_crawler_queue',
            'exec_time=0
                AND process_scheduled= 0
                AND scheduled<=' . $this->getCurrentTime(),
            '',
            'scheduled, qid',
            intval($countInARun)
        );

        // The result is not guaranteed to be an array, so check the type before counting and iterating
        if (is_array($rows) && count($rows) > 0) {
            $quidList = [];

            foreach ($rows as $r) {
                // Cast to int because the ids are placed into the SQL condition below
                $quidList[] = (int)$r['qid'];
            }

            $processId = $this->CLI_buildProcessId();

            //reserve queue entries for process
            $this->db->sql_query('BEGIN');
            //TODO make sure we're not taking assigned queue-entires
            $this->db->exec_UPDATEquery(
                'tx_crawler_queue',
                'qid IN (' . implode(',', $quidList) . ')',
                [
                    'process_scheduled' => intval($this->getCurrentTime()),
                    'process_id' => $processId
                ]
            );

            //save the number of assigned queue entries to determine how many have been processed later
            $numberOfAffectedRows = $this->db->sql_affected_rows();
            $this->db->exec_UPDATEquery(
                'tx_crawler_process',
                "process_id = '" . $processId . "'",
                [
                    'assigned_items_count' => intval($numberOfAffectedRows)
                ]
            );

            // Only keep the reservation when every selected entry could be assigned to this process
            if ($numberOfAffectedRows == count($quidList)) {
                $this->db->sql_query('COMMIT');
            } else {
                $this->db->sql_query('ROLLBACK');
                $this->CLI_debug("Nothing processed due to multi-process collision (" . $this->CLI_buildProcessId() . ")");
                return ($result | self::CLI_STATUS_ABORTED);
            }

            foreach ($rows as $r) {
                $result |= $this->readUrl($r['qid']);

                $counter++;
                usleep(intval($sleepTime)); // Just to relax the system

                // if during the start and the current read url the cli has been disable we need to return from the function
                // mark the process NOT as ended.
                if ($this->getDisabled()) {
                    return ($result | self::CLI_STATUS_ABORTED);
                }

                if (!$this->CLI_checkIfProcessIsActive($this->CLI_buildProcessId())) {
                    $this->CLI_debug("conflict / timeout (" . $this->CLI_buildProcessId() . ")");

                    //TODO might need an additional returncode
                    $result |= self::CLI_STATUS_ABORTED;
                    break; //possible timeout
                }
            }

            sleep(intval($sleepAfterFinish));

            $msg = 'Rows: ' . $counter;
            $this->CLI_debug($msg . " (" . $this->CLI_buildProcessId() . ")");
        } else {
            $this->CLI_debug("Nothing within queue which needs to be processed (" . $this->CLI_buildProcessId() . ")");
        }

        if ($counter > 0) {
            $result |= self::CLI_STATUS_PROCESSED;
        }

        return $result;
    }
2344
2345
    /**
     * Activate hooks
     *
     * Creates an object for every reference registered in
     * $GLOBALS['TYPO3_CONF_VARS']['EXTCONF']['crawler']['cli_hooks'] and calls its
     * crawler_init() method with this controller as argument.
     *
     * @return void
     */
    public function CLI_runHooks()
    {
        // isset() first, so that a missing registration does not raise an "undefined index" notice
        if (!isset($GLOBALS['TYPO3_CONF_VARS']['EXTCONF']['crawler']['cli_hooks'])
            || !is_array($GLOBALS['TYPO3_CONF_VARS']['EXTCONF']['crawler']['cli_hooks'])
        ) {
            return;
        }

        foreach ($GLOBALS['TYPO3_CONF_VARS']['EXTCONF']['crawler']['cli_hooks'] as $objRef) {
            // Plain assignment: objects are handles, a by-reference assignment of a call result is not needed
            $hookObj = GeneralUtility::getUserObj($objRef);
            if (is_object($hookObj)) {
                $hookObj->crawler_init($this);
            }
        }
    }
2362
2363
    /**
     * Try to acquire a new process with the given id
     * also performs some auto-cleanup for orphan processes
     * @todo preemption might not be the most elegant way to clean up
     *
     * Active processes whose ttl lies before the current time count as orphans and
     * are released; all others count against the "processLimit" setting.
     *
     * @param string $id identification string for the process
     * @return boolean true if a process record was created, false if the limit is reached or no system process id is available
     */
    public function CLI_checkAndAcquireNewProcess($id)
    {
        $systemProcessId = getmypid();
        if ($systemProcessId < 1) {
            return false;
        }

        $activeCount = 0;
        $expiredProcessIds = [];

        $this->db->sql_query('BEGIN');

        $res = $this->db->exec_SELECTquery(
            'process_id,ttl',
            'tx_crawler_process',
            'active=1 AND deleted=0'
        );

        $currentTime = $this->getCurrentTime();

        // Split the active processes into expired ones and those still running
        while ($row = $this->db->sql_fetch_assoc($res)) {
            if ($row['ttl'] < $currentTime) {
                $expiredProcessIds[] = $row['process_id'];
                continue;
            }
            $activeCount++;
        }

        // if there are less than allowed active processes then add a new one
        $acquired = $activeCount < intval($this->extensionSettings['processLimit']);

        if ($acquired) {
            $this->CLI_debug("add process " . $this->CLI_buildProcessId() . " (" . ($activeCount + 1) . "/" . intval($this->extensionSettings['processLimit']) . ")");

            // create new process record
            $processRecord = [
                'process_id' => $id,
                'active' => '1',
                'ttl' => ($currentTime + intval($this->extensionSettings['processMaxRunTime'])),
                'system_process_id' => $systemProcessId
            ];
            $this->db->exec_INSERTquery('tx_crawler_process', $processRecord);
        } else {
            $this->CLI_debug("Processlimit reached (" . ($activeCount) . "/" . intval($this->extensionSettings['processLimit']) . ")");
        }

        $this->CLI_releaseProcesses($expiredProcessIds, true); // maybe this should be somehow included into the current lock
        $this->CLI_deleteProcessesMarkedDeleted();

        $this->db->sql_query('COMMIT');

        return $acquired;
    }
2427
2428
    /**
     * Release a process and the required resources
     *
     * @param  mixed    $releaseIds   string with a single process-id or array with multiple process-ids
     * @param  boolean  $withinLock   show whether the DB-actions are included within an existing lock
     * @return boolean  false if there was nothing to release, true otherwise
     */
    public function CLI_releaseProcesses($releaseIds, $withinLock = false)
    {
        if (!is_array($releaseIds)) {
            $releaseIds = [$releaseIds];
        }

        // Explicit emptiness check; "!count($releaseIds) > 0" only worked by operator-precedence accident
        if (count($releaseIds) === 0) {
            return false;   //nothing to release
        }

        // Build the quoted id list once so that both statements below use the same quoting
        // NOTE(review): the ids are not escaped here - presumably they are always generated process ids; confirm.
        $quotedReleaseIds = '\'' . implode('\',\'', $releaseIds) . '\'';

        if (!$withinLock) {
            $this->db->sql_query('BEGIN');
        }

        // some kind of 2nd chance algo - this way you need at least 2 processes to have a real cleanup
        // this ensures that a single process can't mess up the entire process table

        // unassign all queue entries that still belong to a process which is neither active nor deleted
        $this->db->exec_UPDATEquery(
            'tx_crawler_queue',
            'process_id IN (SELECT process_id FROM tx_crawler_process WHERE active=0 AND deleted=0)',
            [
                'process_scheduled' => 0,
                'process_id' => ''
            ]
        );
        // mark all processes as deleted which have no "waiting" queue-entries and which are not active
        $this->db->exec_UPDATEquery(
            'tx_crawler_process',
            'active=0 AND deleted=0
            AND NOT EXISTS (
                SELECT * FROM tx_crawler_queue
                WHERE tx_crawler_queue.process_id = tx_crawler_process.process_id
                AND tx_crawler_queue.exec_time = 0
            )',
            [
                'deleted' => '1',
                'system_process_id' => 0
            ]
        );
        // mark all requested processes as non-active
        $this->db->exec_UPDATEquery(
            'tx_crawler_process',
            'process_id IN (' . $quotedReleaseIds . ') AND deleted=0',
            [
                'active' => '0'
            ]
        );
        // unassign the not yet executed queue entries of the requested processes
        $this->db->exec_UPDATEquery(
            'tx_crawler_queue',
            'exec_time=0 AND process_id IN (' . $quotedReleaseIds . ')',
            [
                'process_scheduled' => 0,
                'process_id' => ''
            ]
        );

        if (!$withinLock) {
            $this->db->sql_query('COMMIT');
        }

        return true;
    }
2497
2498
    /**
     * Removes every record from tx_crawler_process that is flagged with deleted = 1.
     *
     * @return void
     */
    public function CLI_deleteProcessesMarkedDeleted()
    {
        $table = 'tx_crawler_process';
        $where = 'deleted = 1';

        $this->db->exec_DELETEquery($table, $where);
    }
2507
2508
    /**
     * Check if there are still resources left for the process with the given id
     * Used to determine timeouts and to ensure a proper cleanup if there's a timeout
     *
     * Only a single SELECT is issued, therefore no transaction is opened here.
     *
     * @param  string $pid identification string for the process
     * @return boolean determines if the process is still active / has resources
     */
    public function CLI_checkIfProcessIsActive($pid)
    {
        $ret = false;

        // Fetch the not-deleted record of the process, if any
        $res = $this->db->exec_SELECTquery(
            'process_id,active,ttl',
            'tx_crawler_process',
            'process_id = \'' . $pid . '\'  AND deleted=0',
            '',
            'ttl',
            '0,1'
        );
        if ($row = $this->db->sql_fetch_assoc($res)) {
            $ret = intval($row['active']) == 1;
        }

        return $ret;
    }
2536
2537
    /**
     * Returns the id of the current process.
     *
     * The id is generated on first use by passing the current microtime to
     * GeneralUtility::shortMD5() and is kept in $this->processID afterwards.
     *
     * @return string  the ID
     */
    public function CLI_buildProcessId()
    {
        if ($this->processID) {
            return $this->processID;
        }

        $this->processID = GeneralUtility::shortMD5($this->microtime(true));

        return $this->processID;
    }
2549
2550
    /**
     * Thin wrapper around PHP's microtime().
     *
     * @param bool $get_as_float Passed through to microtime() unchanged
     *
     * @return mixed Whatever microtime() returns for the given flag
     */
    protected function microtime($get_as_float = false)
    {
        $currentMicrotime = microtime($get_as_float);

        return $currentMicrotime;
    }
2559
2560
    /**
     * Writes a message followed by a line break to the output and flushes it.
     * Nothing happens unless the "processDebug" extension setting is a non-zero integer.
     *
     * @param  string $msg  the message
     */
    public function CLI_debug($msg)
    {
        $debugEnabled = intval($this->extensionSettings['processDebug']);
        if (!$debugEnabled) {
            return;
        }

        echo $msg . "\n";
        flush();
    }
2572
2573
    /**
     * Get URL content by making direct request to TYPO3.
     *
     * Builds a shell command from the configured PHP binary ("phpPath" setting),
     * the crawler's cli/bootstrap.php, the frontend base path, the URL and the
     * base64 encoded, serialized request headers, executes it and logs the URL
     * together with the elapsed time.
     *
     * @param  string $url          Page URL
     * @param  int    $crawlerId    Crawler-ID
     * @return array  Empty array if the URL cannot be parsed, otherwise the keys "request", "headers" (always empty) and "content"
     */
    protected function sendDirectRequest($url, $crawlerId)
    {
        $urlParts = parse_url($url);
        if (!is_array($urlParts)) {
            return [];
        }

        $requestHeaders = $this->buildRequestHeaderArray($urlParts, $crawlerId);

        // Every argument is escaped on its own before the command line is assembled
        $cmd = implode(' ', [
            escapeshellcmd($this->extensionSettings['phpPath']),
            escapeshellarg(ExtensionManagementUtility::extPath('crawler') . 'cli/bootstrap.php'),
            escapeshellarg($this->getFrontendBasePath()),
            escapeshellarg($url),
            escapeshellarg(base64_encode(serialize($requestHeaders)))
        ]);

        $startTime = microtime(true);
        $content = $this->executeShellCommand($cmd);
        $this->log($url . ' ' . (microtime(true) - $startTime));

        return [
            'request' => implode("\r\n", $requestHeaders) . "\r\n\r\n",
            'headers' => '',
            'content' => $content
        ];
    }
2611
2612
    /**
     * Cleans up entries that stayed for too long in the queue. These are:
     * - processed entries older than the "cleanUpProcessedAge" setting (in days)
     * - entries scheduled longer ago than the "cleanUpScheduledAge" setting (in days)
     *
     * @return void
     */
    protected function cleanUpOldQueueEntries()
    {
        $secondsPerDay = 86400; // 24*60*60 Seconds in 24 hours
        $now = time();

        $processedThreshold = $now - $this->extensionSettings['cleanUpProcessedAge'] * $secondsPerDay;
        $scheduledThreshold = $now - $this->extensionSettings['cleanUpScheduledAge'] * $secondsPerDay;

        $this->flushQueue(
            '(exec_time<>0 AND exec_time<' . $processedThreshold . ') OR scheduled<=' . $scheduledThreshold
        );
    }
2628
2629
    /**
     * Initializes a TypoScript Frontend necessary for using TypoScript and TypoLink functions
     *
     * Stores a started NullTimeTracker in $GLOBALS['TT'] if no object is present
     * there, then creates the frontend controller in $GLOBALS['TSFE'] and runs its
     * initialization steps for the given page.
     *
     * @param int $id      Page id handed to the frontend controller and used for the root line
     * @param int $typeNum Type number handed to the frontend controller
     *
     * @return void
     */
    protected function initTSFE($id = 1, $typeNum = 0)
    {
        EidUtility::initTCA();

        if (!is_object($GLOBALS['TT'])) {
            $timeTracker = new NullTimeTracker();
            $GLOBALS['TT'] = $timeTracker;
            $timeTracker->start();
        }

        // Register the controller globally before any of its methods run
        $tsfe = GeneralUtility::makeInstance(TypoScriptFrontendController::class, $GLOBALS['TYPO3_CONF_VARS'], $id, $typeNum);
        $GLOBALS['TSFE'] = $tsfe;

        $tsfe->sys_page = GeneralUtility::makeInstance(PageRepository::class);
        $tsfe->sys_page->init(true);
        $tsfe->connectToDB();
        $tsfe->initFEuser();
        $tsfe->determineId();
        $tsfe->initTemplate();
        // sys_page is read from the controller again, it is not cached in a local variable
        $tsfe->rootLine = $tsfe->sys_page->getRootLine($id, '');
        $tsfe->getConfigArray();

        PageGenerator::pagegenInit();
    }
2656
2657
    /**
     * Returns a md5 hash generated from a serialized configuration array.
     *
     * The entries "paramExpanded" and "URLs" are left out of the hash.
     *
     * @param array $configuration
     *
     * @return string
     */
    protected function getConfigurationHash(array $configuration)
    {
        unset($configuration['paramExpanded'], $configuration['URLs']);

        $serializedConfiguration = serialize($configuration);

        return md5($serializedConfiguration);
    }
2669
}
2670