Completed
Push — TYPO3_8 ( 3ce6ed...1b9511 )
by
unknown
10:50
created

tx_crawler_lib::urlListFromUrlArray()   D

Complexity

Conditions 21
Paths 114

Size

Total Lines 118
Code Lines 67

Duplication

Lines 0
Ratio 0 %

Code Coverage

Tests 0
CRAP Score 462

Importance

Changes 0
Metric Value
cc 21
eloc 67
nc 114
nop 9
dl 0
loc 118
ccs 0
cts 50
cp 0
crap 462
rs 4.4991
c 0
b 0
f 0

How to fix   Long Method    Complexity    Many Parameters   

Long Method

Small methods make your code easier to understand, in particular if combined with a good name. Besides, if your method is small, finding a good name is usually much easier.

For example, if you find yourself adding comments to a method's body, this is usually a good sign to extract the commented part to a new method, and use the comment as a starting point when coming up with a good name for this new method.

Commonly applied refactorings include:

Many Parameters

Methods with many parameters are not only hard to understand, but their parameters also often become inconsistent when you need more, or different data.

There are several approaches to avoid long parameter lists:

1
<?php
2
/***************************************************************
3
 *  Copyright notice
4
 *
5
 *  (c) 2016 AOE GmbH <[email protected]>
6
 *
7
 *  All rights reserved
8
 *
9
 *  This script is part of the TYPO3 project. The TYPO3 project is
10
 *  free software; you can redistribute it and/or modify
11
 *  it under the terms of the GNU General Public License as published by
12
 *  the Free Software Foundation; either version 3 of the License, or
13
 *  (at your option) any later version.
14
 *
15
 *  The GNU General Public License can be found at
16
 *  http://www.gnu.org/copyleft/gpl.html.
17
 *
18
 *  This script is distributed in the hope that it will be useful,
19
 *  but WITHOUT ANY WARRANTY; without even the implied warranty of
20
 *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
21
 *  GNU General Public License for more details.
22
 *
23
 *  This copyright notice MUST APPEAR in all copies of the script!
24
 ***************************************************************/
25
26
use TYPO3\CMS\Core\Imaging\Icon;
27
use TYPO3\CMS\Core\Imaging\IconFactory;
28
29
/**
30
 * Class tx_crawler_lib
31
 */
32
class tx_crawler_lib {
33
34
    var $setID = 0;
35
    var $processID ='';
36
    var $max_CLI_exec_time = 3600;    // One hour is max stalled time for the CLI (If the process has had the status "start" for 3600 seconds it will be regarded stalled and a new process is started.
37
38
    var $duplicateTrack = array();
39
    var $downloadUrls = array();
40
41
    var $incomingProcInstructions = array();
42
    var $incomingConfigurationSelection = array();
43
44
45
    var $registerQueueEntriesInternallyOnly = array();
46
    var $queueEntries = array();
47
    var $urlList = array();
48
49
    var $debugMode=FALSE;
50
51
    var $extensionSettings=array();
52
53
    var $MP = false; // mount point
54
55
    protected $processFilename;
56
57
    /**
58
     * Holds the internal access mode; can be 'gui', 'cli' or 'cli_im'
59
     *
60
     * @var string
61
     */
62
    protected $accessMode;
63
64
    /**
65
     * @var \TYPO3\CMS\Core\Database\DatabaseConnection
66
     */
67
    private $db;
68
69
    /**
70
     * @var TYPO3\CMS\Core\Authentication\BackendUserAuthentication
71
     */
72
    private $backendUser;
73
74
    const CLI_STATUS_NOTHING_PROCCESSED = 0;
75
    const CLI_STATUS_REMAIN = 1;    //queue not empty
76
    const CLI_STATUS_PROCESSED = 2;    //(some) queue items where processed
77
    const CLI_STATUS_ABORTED = 4;    //instance didn't finish
78
    const CLI_STATUS_POLLABLE_PROCESSED = 8;
79
80
    /**
     * Returns the access mode currently stored on this instance.
     *
     * @return string Access mode as last passed to setAccessMode() ('gui', 'cli' or 'cli_im')
     */
    public function getAccessMode()
    {
        return $this->accessMode;
    }
88
89
    /**
     * Stores the access mode for this instance.
     *
     * @param string $accessMode Access mode to remember ('gui', 'cli' or 'cli_im')
     * @return void
     */
    public function setAccessMode($accessMode)
    {
        $this->accessMode = $accessMode;
    }
95
96
    /**
     * Switches the "disabled" marker on or off.
     *
     * The marker is the file at $this->processFilename: disabling writes an
     * empty file there, enabling removes that file if it is present.
     *
     * @param  bool $disabled (optional, defaults to true)
     * @return void
     */
    public function setDisabled($disabled = true)
    {
        if ($disabled) {
            // Create (or truncate) the marker file
            \TYPO3\CMS\Core\Utility\GeneralUtility::writeFile($this->processFilename, '');
            return;
        }

        // Re-enable: drop the marker file, but only when it actually exists
        if (is_file($this->processFilename)) {
            unlink($this->processFilename);
        }
    }
111
112
    /**
     * Reports whether the "disabled" marker file exists.
     *
     * @return bool true if the file at $this->processFilename exists (disabled), false otherwise
     */
    public function getDisabled()
    {
        return is_file($this->processFilename);
    }
124
125
    /**
     * Overrides the path of the file used as "disabled" marker.
     *
     * @param string $filenameWithPath Path (including file name) of the marker file
     *
     * @return void
     */
    public function setProcessFilename($filenameWithPath)
    {
        $this->processFilename = $filenameWithPath;
    }
134
135
    /**
     * Returns the path of the file used as "disabled" marker.
     *
     * @return string Path (including file name) of the marker file
     */
    public function getProcessFilename()
    {
        return $this->processFilename;
    }
142
143
144
145
    /************************************
146
     *
147
     * Getting URLs based on Page TSconfig
148
     *
149
     ************************************/
150
151 23
    /**
     * Wires up the globals this class works with and loads the extension settings.
     *
     * Reads the serialized extension configuration from
     * $GLOBALS['TYPO3_CONF_VARS']['EXT']['extConf']['crawler'], falls back to an
     * empty settings array when it is missing or not an array, and then
     * normalises 'countInARun' (default 100) and 'processLimit' (1..99, default 1).
     */
    public function __construct() {
        $this->db = $GLOBALS['TYPO3_DB'];
        $this->backendUser = $GLOBALS['BE_USER'];
        $this->processFilename = PATH_site.'typo3temp/tx_crawler.proc';

        // The configuration entry does not exist until the extension has been
        // configured once, so only unserialize when it is really there.
        $settings = array();
        if (isset($GLOBALS['TYPO3_CONF_VARS']['EXT']['extConf']['crawler'])) {
            $unserialized = unserialize($GLOBALS['TYPO3_CONF_VARS']['EXT']['extConf']['crawler']);
            if (is_array($unserialized)) {
                $settings = $unserialized;
            }
        }

        // read ext_em_conf_template settings and set
        $this->setExtensionSettings($settings);

        // set defaults (missing keys are treated like 0, which selects the default):
        $countInARun = isset($this->extensionSettings['countInARun']) ? $this->extensionSettings['countInARun'] : 0;
        if (\TYPO3\CMS\Core\Utility\MathUtility::convertToPositiveInteger($countInARun) == 0) {
            $this->extensionSettings['countInARun'] = 100;
        }

        $processLimit = isset($this->extensionSettings['processLimit']) ? $this->extensionSettings['processLimit'] : 0;
        $this->extensionSettings['processLimit'] = \TYPO3\CMS\Core\Utility\MathUtility::forceIntegerInRange($processLimit,1,99,1);
    }
170
171
    /**
     * Replaces the extension settings held by this instance.
     *
     * The array is the unserialized counterpart of
     * $TYPO3_CONF_VARS['EXT']['extConf']['crawler'].
     *
     * @param array $extensionSettings Settings to use from now on
     * @return void
     */
    public function setExtensionSettings(array $extensionSettings)
    {
        $this->extensionSettings = $extensionSettings;
    }
180
181
    /**
     * Check if the given page should be crawled
     *
     * The checks run in this order and the first one that matches wins:
     * hidden page (unless 'crawlHiddenPages' is enabled), built-in doktype
     * restriction, doktypes excluded via
     * $GLOBALS['TYPO3_CONF_VARS']['EXTCONF']['crawler']['excludeDoktype'],
     * and finally the 'pageVeto' hooks.
     *
     * @param array $pageRow Page record; 'doktype' is read unconditionally, 'hidden' only if present
     * @return false|string false if the page should be crawled (not excluded), otherwise the skip message
     * @author Fabrizio Branca
     */
    public function checkIfPageShouldBeSkipped(array $pageRow) {

            // if page is hidden (optional setting / column, hence empty() instead of a bare read)
        if (empty($this->extensionSettings['crawlHiddenPages']) && !empty($pageRow['hidden'])) {
            return 'Because page is hidden';
        }

            // doktypes 3 and 4 as well as everything from 199 upwards are never crawled
        if (\TYPO3\CMS\Core\Utility\GeneralUtility::inList('3,4', $pageRow['doktype']) || $pageRow['doktype']>=199)    {
            return 'Because doktype is not allowed';
        }

            // doktype lists registered by other extensions; the registry is optional
        if (isset($GLOBALS['TYPO3_CONF_VARS']['EXTCONF']['crawler']['excludeDoktype'])
            && is_array($GLOBALS['TYPO3_CONF_VARS']['EXTCONF']['crawler']['excludeDoktype'])
        ) {
            foreach ($GLOBALS['TYPO3_CONF_VARS']['EXTCONF']['crawler']['excludeDoktype'] as $key => $doktypeList) {
                if (\TYPO3\CMS\Core\Utility\GeneralUtility::inList($doktypeList, $pageRow['doktype'])) {
                    return 'Doktype was excluded by "'.$key.'"';
                }
            }
        }

            // veto hook; the registry is optional
        if (isset($GLOBALS['TYPO3_CONF_VARS']['EXTCONF']['crawler']['pageVeto'])
            && is_array($GLOBALS['TYPO3_CONF_VARS']['EXTCONF']['crawler']['pageVeto'])
        ) {
            foreach ($GLOBALS['TYPO3_CONF_VARS']['EXTCONF']['crawler']['pageVeto'] as $key => $func) {
                $params = array(
                    'pageRow' => $pageRow
                );
                // expects "false" if page is ok and "true" or a skipMessage if this page should _not_ be crawled
                $veto = \TYPO3\CMS\Core\Utility\GeneralUtility::callUserFunction($func, $params, $this);
                if ($veto !== false) {
                    // no need to execute other hooks if a previous one returned a veto
                    return is_string($veto) ? $veto : 'Veto from hook "'.htmlspecialchars($key).'"';
                }
            }
        }

        return false;
    }
245
246
    /**
     * Wrapper method for getUrlsForPageId()
     * It returns an array of configurations and no urls!
     *
     * The page is first run through checkIfPageShouldBeSkipped(); when it is
     * skipped, an empty array is returned and the reason is written to
     * $skipMessage. Otherwise $skipMessage is reset to an empty string.
     *
     * @param  array  $pageRow       Page record with at least dok-type and uid columns.
     * @param  string $skipMessage   Passed by reference; receives the skip reason or ''
     * @return array                 Result (see getUrlsForPageId())
     * @see getUrlsForPageId()
     */
    public function getUrlsForPageRow(array $pageRow, &$skipMessage = '') {
        $message = $this->checkIfPageShouldBeSkipped($pageRow);

        if ($message !== false) {
            $skipMessage = $message;
            return array();
        }

        // url_scheme arrives as a string when the row comes straight from the
        // database, so cast before the strict comparison; 2 forces https.
        $forceSsl = isset($pageRow['url_scheme']) && (int)$pageRow['url_scheme'] === 2;
        $res = $this->getUrlsForPageId($pageRow['uid'], $forceSsl);
        $skipMessage = '';

        return $res;
    }
269
270
    /**
     * Tells whether tx_crawler_queue holds no pending entry (exec_time=0)
     * for the given page and configuration hash.
     * If there is none, callers can skip a more detailed per-URL check.
     *
     * @param  int    $uid               Page id the queue entries belong to
     * @param  string $configurationHash Hash identifying the configuration
     * @return boolean                   TRUE when no matching unprocessed entry exists
     */
    protected function noUnprocessedQueueEntriesForPageWithConfigurationHashExist($uid, $configurationHash) {
        $quotedHash = $this->db->fullQuoteStr($configurationHash, 'tx_crawler_queue');
        $whereClause = "page_id=" . intval($uid) . " AND configuration_hash=" . $quotedHash . " AND exec_time=0";

        $queryResult = $this->db->exec_SELECTquery('count(*) as anz', 'tx_crawler_queue', $whereClause);
        $countRow = $this->db->sql_fetch_assoc($queryResult);

        return ($countRow['anz'] == 0);
    }
286
287
    /**
     * Creates a list of URLs from input array (and submits them to queue if asked for)
     * See Web > Info module script + "indexed_search"'s crawler hook-client using this!
     *
     * @param    array      $vv                        Information about URLs from pageRow to crawl ('subCfg' and 'URLs' are read).
     * @param    array      $pageRow                   Page row ('uid' is read)
     * @param    integer    $scheduledTime             Unix time to schedule indexing to, typically time()
     * @param    integer    $reqMinute                 Number of requests per minute (creates the interleave between requests); 0 means no interleave
     * @param    boolean    $submitCrawlUrls           If set, submits the URLs to queue
     * @param    boolean    $downloadCrawlUrls         If set (and $submitCrawlUrls is false) will fill $downloadUrls with entries
     * @param    array      $duplicateTrack            Passed by reference; contains a key per url to make sure duplicates are not crawled
     * @param    array      $downloadUrls              Passed by reference; filled with URLs for download if flag is set.
     * @param    array      $incomingProcInstructions  Array of processing instructions
     * @return   string     List of URLs (meant for display in backend module); '' when no URL passed the filter
     */
    public function urlListFromUrlArray(
        array $vv,
        array $pageRow,
        $scheduledTime,
        $reqMinute,
        $submitCrawlUrls,
        $downloadCrawlUrls,
        array &$duplicateTrack,
        array &$downloadUrls,
        array $incomingProcInstructions
    ) {
        // Defined up front so that every path below returns a string and
        // $urlObj is a known variable wherever it is referenced.
        $urlList = '';
        $urlObj = null;

        // realurl support (thanks to Ingo Renner); evaluated once, used again inside the loop
        $useRealUrl = \TYPO3\CMS\Core\Utility\ExtensionManagementUtility::isLoaded('realurl') && !empty($vv['subCfg']['realurl']);

        if ($useRealUrl) {

            /** @var tx_realurl $urlObj */
            $urlObj = \TYPO3\CMS\Core\Utility\GeneralUtility::makeInstance('tx_realurl');

            if (!empty($vv['subCfg']['baseUrl'])) {
                $urlParts = parse_url($vv['subCfg']['baseUrl']);
                $host = strtolower($urlParts['host']);
                $urlObj->host = $host;

                // First pass, finding configuration OR pointer string:
                $urlObj->extConf = isset($GLOBALS['TYPO3_CONF_VARS']['EXTCONF']['realurl'][$urlObj->host]) ? $GLOBALS['TYPO3_CONF_VARS']['EXTCONF']['realurl'][$urlObj->host] : $GLOBALS['TYPO3_CONF_VARS']['EXTCONF']['realurl']['_DEFAULT'];

                // If it turned out to be a string pointer, then look up the real config:
                if (is_string($urlObj->extConf)) {
                    $urlObj->extConf = is_array($GLOBALS['TYPO3_CONF_VARS']['EXTCONF']['realurl'][$urlObj->extConf]) ? $GLOBALS['TYPO3_CONF_VARS']['EXTCONF']['realurl'][$urlObj->extConf] : $GLOBALS['TYPO3_CONF_VARS']['EXTCONF']['realurl']['_DEFAULT'];
                }
            }

            if (!$GLOBALS['TSFE']->sys_page) {
                $GLOBALS['TSFE']->sys_page = \TYPO3\CMS\Core\Utility\GeneralUtility::makeInstance('TYPO3\CMS\Frontend\Page\PageRepository');
            }
            if (!$GLOBALS['TSFE']->csConvObj) {
                $GLOBALS['TSFE']->csConvObj = \TYPO3\CMS\Core\Utility\GeneralUtility::makeInstance('TYPO3\CMS\Core\Charset\CharsetConverter');
            }
            if (!$GLOBALS['TSFE']->tmpl->rootLine[0]['uid']) {
                $GLOBALS['TSFE']->tmpl->rootLine[0]['uid'] = $urlObj->extConf['pagePath']['rootpage_id'];
            }
        }

        if (!isset($vv['URLs']) || !is_array($vv['URLs'])) {
            return 'ERROR - no URL generated';
        }

        $configurationHash = md5(serialize($vv));
        $skipInnerCheck = $this->noUnprocessedQueueEntriesForPageWithConfigurationHashExist($pageRow['uid'], $configurationHash);

        // Seconds between two requests; a rate of 0 would divide by zero, so it means "no interleave".
        $secondsPerRequest = $reqMinute ? 60 / $reqMinute : 0;

        foreach ($vv['URLs'] as $urlQuery) {

            if (!$this->drawURLs_PIfilter($vv['subCfg']['procInstrFilter'], $incomingProcInstructions)) {
                continue;
            }

            // Calculate cHash:
            if (!empty($vv['subCfg']['cHash'])) {
                /* @var $cacheHash \TYPO3\CMS\Frontend\Page\CacheHashCalculator */
                $cacheHash = \TYPO3\CMS\Core\Utility\GeneralUtility::makeInstance('TYPO3\CMS\Frontend\Page\CacheHashCalculator');
                $urlQuery .= '&cHash=' . $cacheHash->generateForParameters($urlQuery);
            }

            // Create key by which to determine unique-ness:
            $uKey = $urlQuery.'|'.$vv['subCfg']['userGroups'].'|'.$vv['subCfg']['baseUrl'].'|'.$vv['subCfg']['procInstrFilter'];

            // realurl support (thanks to Ingo Renner)
            $urlQuery = 'index.php' . $urlQuery;
            if ($useRealUrl) {
                $params = array(
                    'LD' => array(
                        'totalURL' => $urlQuery
                    ),
                    'TCEmainHook' => true
                );
                $urlObj->encodeSpURL($params);
                $urlQuery = $params['LD']['totalURL'];
            }

            // Scheduled time, rounded down to the full minute:
            $schTime = $scheduledTime + round(count($duplicateTrack) * $secondsPerRequest);
            $schTime = floor($schTime/60)*60;

            // NOTE(review): $urlList is assigned (not appended) per URL, so only the
            // entry of the last processed URL is returned — kept as-is; confirm intent.
            if (isset($duplicateTrack[$uKey])) {

                // if the url key is registered just display it and do not resubmit it
                $urlList = '<em><span class="typo3-dimmed">'.htmlspecialchars($urlQuery).'</span></em><br/>';

            } else {

                $urlList = '['.date('d.m.y H:i', $schTime).'] '.htmlspecialchars($urlQuery);
                $this->urlList[] = '['.date('d.m.y H:i', $schTime).'] '.$urlQuery;

                $theUrl = ($vv['subCfg']['baseUrl'] ? $vv['subCfg']['baseUrl'] : \TYPO3\CMS\Core\Utility\GeneralUtility::getIndpEnv('TYPO3_SITE_URL')) . $urlQuery;

                // Submit for crawling!
                // NOTE(review): addUrl() receives the unshifted $scheduledTime, not $schTime — confirm intent.
                if ($submitCrawlUrls) {
                    $added = $this->addUrl(
                        $pageRow['uid'],
                        $theUrl,
                        $vv['subCfg'],
                        $scheduledTime,
                        $configurationHash,
                        $skipInnerCheck
                    );
                    if ($added === false) {
                        $urlList .= ' (Url already existed)';
                    }
                } elseif ($downloadCrawlUrls) {
                    $downloadUrls[$theUrl] = $theUrl;
                }

                $urlList .= '<br />';
            }
            $duplicateTrack[$uKey] = TRUE;
        }

        return $urlList;
    }
421
422
    /**
     * Returns true if input processing instruction is among registered ones.
     *
     * An empty $incomingProcInstructions array means "no restriction" and
     * therefore always yields TRUE.
     *
     * @param  string $piString                     Comma-separated list of PIs to test against
     * @param  array  $incomingProcInstructions     Processing instructions
     * @return boolean                              TRUE if found (or no instructions given), FALSE otherwise
     */
    public function drawURLs_PIfilter($piString, array $incomingProcInstructions) {
        if (empty($incomingProcInstructions)) {
            return TRUE;
        }

        foreach($incomingProcInstructions as $pi) {
            if (\TYPO3\CMS\Core\Utility\GeneralUtility::inList($piString, $pi)) {
                return TRUE;
            }
        }

        // Nothing matched: return an explicit boolean instead of falling off the end (null).
        return FALSE;
    }
440
441
442
    /**
     * Fetches the page TSconfig for a page and lets registered hooks alter it.
     *
     * When $this->MP is set, the TSconfig is read for the id found after the
     * first '-' in $this->MP instead of for $id. Hooks registered in
     * $GLOBALS['TYPO3_CONF_VARS']['EXTCONF']['crawler']['getPageTSconfigForId']
     * receive 'pageId' (always the original $id) and 'pageTSConfig' (by reference).
     *
     * @param  integer $id  Page ID
     * @return array        Page TSconfig as returned by BackendUtility::getPagesTSconfig(), after hook processing
     */
    public function getPageTSconfigForId($id) {
        $tsConfigPageId = $id;
        if ($this->MP) {
            // mount point set: use the second '-'-separated segment of MP
            list(, $tsConfigPageId) = explode('-', $this->MP);
        }
        $pageTSconfig = \TYPO3\CMS\Backend\Utility\BackendUtility::getPagesTSconfig($tsConfigPageId);

        // Call a hook to alter configuration
        if (is_array($GLOBALS['TYPO3_CONF_VARS']['EXTCONF']['crawler']['getPageTSconfigForId'])) {
            $hookParameters = array(
                'pageId' => $id,
                'pageTSConfig' => &$pageTSconfig
            );
            foreach ($GLOBALS['TYPO3_CONF_VARS']['EXTCONF']['crawler']['getPageTSconfigForId'] as $hookFunction) {
                \TYPO3\CMS\Core\Utility\GeneralUtility::callUserFunction($hookFunction, $hookParameters, $this);
            }
        }

        return $pageTSconfig;
    }
463
464
    /**
465
     * This method returns an array of configurations.
466
     * And no urls!
467
     *
468
     * @param integer $id  Page ID
469
     * @param bool $forceSsl Use https
470
     * @return array        Configurations from pages and configuration records
471
     */
472
    protected function getUrlsForPageId($id, $forceSsl = false) {
473
474
        /**
475
         * Get configuration from tsConfig
476
         */
477
478
        // Get page TSconfig for page ID:
479
        $pageTSconfig = $this->getPageTSconfigForId($id);
480
481
        $res = array();
482
483
        if (is_array($pageTSconfig) && is_array($pageTSconfig['tx_crawler.']['crawlerCfg.']))    {
484
            $crawlerCfg = $pageTSconfig['tx_crawler.']['crawlerCfg.'];
485
486
            if (is_array($crawlerCfg['paramSets.']))    {
487
                foreach($crawlerCfg['paramSets.'] as $key => $values)    {
488
                    if (!is_array($values))    {
489
490
                        // Sub configuration for a single configuration string:
491
                        $subCfg = (array)$crawlerCfg['paramSets.'][$key.'.'];
492
                        $subCfg['key'] = $key;
493
494
                        if (strcmp($subCfg['procInstrFilter'],''))    {
495
                            $subCfg['procInstrFilter'] = implode(',',\TYPO3\CMS\Core\Utility\GeneralUtility::trimExplode(',',$subCfg['procInstrFilter']));
496
                        }
497
                        $pidOnlyList = implode(',',\TYPO3\CMS\Core\Utility\GeneralUtility::trimExplode(',',$subCfg['pidsOnly'],1));
0 ignored issues
show
Documentation introduced by
1 is of type integer, but the function expects a boolean.

It seems like the type of the argument is not accepted by the function/method which you are calling.

In some cases, in particular if PHP’s automatic type-juggling kicks in this might be fine. In other cases, however this might be a bug.

We suggest to add an explicit type cast like in the following example:

function acceptsInteger($int) { }

$x = '123'; // string "123"

// Instead of
acceptsInteger($x);

// we recommend to use
acceptsInteger((integer) $x);
Loading history...
498
499
                            // process configuration if it is not page-specific or if the specific page is the current page:
500
                        if (!strcmp($subCfg['pidsOnly'],'') || \TYPO3\CMS\Core\Utility\GeneralUtility::inList($pidOnlyList,$id))    {
501
502
                                // add trailing slash if not present
503
                            if (!empty($subCfg['baseUrl']) && substr($subCfg['baseUrl'], -1) != '/') {
504
                                $subCfg['baseUrl'] .= '/';
505
                            }
506
507
                                // Explode, process etc.:
508
                            $res[$key] = array();
509
                            $res[$key]['subCfg'] = $subCfg;
510
                            $res[$key]['paramParsed'] = $this->parseParams($values);
511
                            $res[$key]['paramExpanded'] = $this->expandParameters($res[$key]['paramParsed'],$id);
512
                            $res[$key]['origin'] = 'pagets';
513
514
                                // recognize MP value
515
                            if(!$this->MP){
516
                                $res[$key]['URLs'] = $this->compileUrls($res[$key]['paramExpanded'],array('?id='.$id));
517
                            } else {
518
                                $res[$key]['URLs'] = $this->compileUrls($res[$key]['paramExpanded'],array('?id='.$id.'&MP='.$this->MP));
519
                            }
520
                        }
521
                    }
522
                }
523
524
            }
525
        }
526
527
        /**
528
         * Get configuration from tx_crawler_configuration records
529
         */
530
531
            // get records along the rootline
532
        $rootLine = \TYPO3\CMS\Backend\Utility\BackendUtility::BEgetRootLine($id);
533
534
        foreach ($rootLine as $page) {
535
            $configurationRecordsForCurrentPage = \TYPO3\CMS\Backend\Utility\BackendUtility::getRecordsByField(
536
                'tx_crawler_configuration',
537
                'pid',
538
                intval($page['uid']),
539
                \TYPO3\CMS\Backend\Utility\BackendUtility::BEenableFields('tx_crawler_configuration') . \TYPO3\CMS\Backend\Utility\BackendUtility::deleteClause('tx_crawler_configuration')
540
            );
541
542
            if (is_array($configurationRecordsForCurrentPage)) {
543
                foreach ($configurationRecordsForCurrentPage as $configurationRecord) {
544
545
                        // check access to the configuration record
546
                    if (empty($configurationRecord['begroups']) || $GLOBALS['BE_USER']->isAdmin() || $this->hasGroupAccess($GLOBALS['BE_USER']->user['usergroup_cached_list'], $configurationRecord['begroups'])) {
547
548
                        $pidOnlyList = implode(',',\TYPO3\CMS\Core\Utility\GeneralUtility::trimExplode(',',$configurationRecord['pidsonly'],1));
0 ignored issues
show
Documentation introduced by
1 is of type integer, but the function expects a boolean.

It seems like the type of the argument is not accepted by the function/method which you are calling.

In some cases, in particular if PHP’s automatic type-juggling kicks in this might be fine. In other cases, however this might be a bug.

We suggest to add an explicit type cast like in the following example:

function acceptsInteger($int) { }

$x = '123'; // string "123"

// Instead of
acceptsInteger($x);

// we recommend to use
acceptsInteger((integer) $x);
Loading history...
549
550
                            // process configuration if it is not page-specific or if the specific page is the current page:
551
                        if (!strcmp($configurationRecord['pidsonly'],'') || \TYPO3\CMS\Core\Utility\GeneralUtility::inList($pidOnlyList,$id)) {
552
                            $key = $configurationRecord['name'];
553
554
                                // don't overwrite previously defined paramSets
555
                            if (!isset($res[$key])) {
556
557
                                    /* @var $TSparserObject \TYPO3\CMS\Core\TypoScript\Parser\TypoScriptParser */
558
                                $TSparserObject = \TYPO3\CMS\Core\Utility\GeneralUtility::makeInstance('TYPO3\CMS\Core\TypoScript\Parser\TypoScriptParser');
559
                                $TSparserObject->parse($configurationRecord['processing_instruction_parameters_ts']);
560
561
                                $subCfg = array(
562
                                    'procInstrFilter' => $configurationRecord['processing_instruction_filter'],
563
                                    'procInstrParams.' => $TSparserObject->setup,
564
                                    'baseUrl' => $this->getBaseUrlForConfigurationRecord(
565
                                        $configurationRecord['base_url'],
566
                                        $configurationRecord['sys_domain_base_url'],
567
                                        $forceSsl
568
                                    ),
569
                                    'realurl' => $configurationRecord['realurl'],
570
                                    'cHash' => $configurationRecord['chash'],
571
                                    'userGroups' => $configurationRecord['fegroups'],
572
                                    'exclude' => $configurationRecord['exclude'],
573
                                    'key' => $key,
574
                                );
575
576
                                    // add trailing slash if not present
577
                                if (!empty($subCfg['baseUrl']) && substr($subCfg['baseUrl'], -1) != '/') {
578
                                    $subCfg['baseUrl'] .= '/';
579
                                }
580
                                if (!in_array($id, $this->expandExcludeString($subCfg['exclude']))) {
581
                                    $res[$key] = array();
582
                                    $res[$key]['subCfg'] = $subCfg;
583
                                    $res[$key]['paramParsed'] = $this->parseParams($configurationRecord['configuration']);
584
                                    $res[$key]['paramExpanded'] = $this->expandParameters($res[$key]['paramParsed'], $id);
585
                                    $res[$key]['URLs'] = $this->compileUrls($res[$key]['paramExpanded'], array('?id=' . $id));
586
                                    $res[$key]['origin'] = 'tx_crawler_configuration_'.$configurationRecord['uid'];
587
                                }
588
                            }
589
                        }
590
                    }
591
                }
592
            }
593
        }
594
595
        if (is_array($GLOBALS['TYPO3_CONF_VARS']['EXTCONF']['crawler']['processUrls']))    {
596
            foreach($GLOBALS['TYPO3_CONF_VARS']['EXTCONF']['crawler']['processUrls'] as $func)    {
597
                $params = array(
598
                    'res' => &$res,
599
                );
600
                \TYPO3\CMS\Core\Utility\GeneralUtility::callUserFunction($func, $params, $this);
601
            }
602
        }
603
604
        return $res;
605
    }
606
607
    /**
608
     * Checks if a domain record exists and returns the base URL built from that record. If not, the given baseUrl string is used.
609
     *
610
     * @param string   $baseUrl
611
     * @param integer  $sysDomainUid
612
     * @param bool     $ssl
613
     * @return string
614
     */
615
    protected function getBaseUrlForConfigurationRecord($baseUrl, $sysDomainUid, $ssl = false) {
        $sysDomainUid = intval($sysDomainUid);

        // Without a referenced domain record the configured base URL is used unchanged.
        if ($sysDomainUid <= 0) {
            return $baseUrl;
        }

        $res = $this->db->exec_SELECTquery(
            '*',
            'sys_domain',
            'uid = ' . $sysDomainUid .
            \TYPO3\CMS\Backend\Utility\BackendUtility::BEenableFields('sys_domain') .
            \TYPO3\CMS\Backend\Utility\BackendUtility::deleteClause('sys_domain')
        );

        // A failed query must not be fetched from; fall back to the configured base URL.
        if (!$res) {
            return $baseUrl;
        }

        $row = $this->db->sql_fetch_assoc($res);
        // Release the result handle, as the other queries in this class do.
        $this->db->sql_free_result($res);

        // $row is not an array when no enabled, non-deleted domain record matched.
        if (is_array($row) && (string)$row['domainName'] !== '') {
            // Only a strict boolean FALSE selects plain http; any other value selects https.
            $urlScheme = ($ssl === false) ? 'http' : 'https';
            return $urlScheme . '://' . $row['domainName'];
        }

        return $baseUrl;
    }
634
635
    /**
     * Collects the names of the crawler configurations that apply to a page branch.
     *
     * Names are gathered from two sources: the array-valued entries of
     * "tx_crawler.crawlerCfg.paramSets." in the page TSconfig of $rootid, and the
     * "name" field of tx_crawler_configuration records stored on a page of the
     * rootline of $rootid or on a page of the tree read below $rootid.
     *
     * @param  integer $rootid  Uid of the page the branch starts at
     * @param  integer $depth   Depth handed to PageTreeView::getTree() when reading the tree below $rootid
     * @return array            List of configuration names (not de-duplicated)
     */
    function getConfigurationsForBranch($rootid, $depth) {
        $configurationsForBranch = array();

        // 1) Parameter sets defined in page TSconfig.
        $pageTSconfig = $this->getPageTSconfigForId($rootid);
        if (isset($pageTSconfig['tx_crawler.']['crawlerCfg.']['paramSets.'])
            && is_array($pageTSconfig['tx_crawler.']['crawlerCfg.']['paramSets.'])
        ) {
            foreach ($pageTSconfig['tx_crawler.']['crawlerCfg.']['paramSets.'] as $key => $value) {
                // Only array-valued entries are taken into account.
                if (!is_array($value)) {
                    continue;
                }
                // Strip the trailing dot of the TypoScript key to get the plain set name.
                $configurationsForBranch[] = substr($key, -1) == '.' ? substr($key, 0, -1) : $key;
            }
        }

        // 2) Configuration records: collect the uids of the rootline pages and of the tree pages.
        $pids = array();
        $rootLine = \TYPO3\CMS\Backend\Utility\BackendUtility::BEgetRootLine($rootid);
        foreach ($rootLine as $node) {
            $pids[] = intval($node['uid']);
        }

        /** @var $tree \TYPO3\CMS\Backend\Tree\View\PageTreeView */
        $tree = \TYPO3\CMS\Core\Utility\GeneralUtility::makeInstance('TYPO3\CMS\Backend\Tree\View\PageTreeView');
        $perms_clause = $GLOBALS['BE_USER']->getPagePermsClause(1);
        $tree->init('AND ' . $perms_clause);
        $tree->getTree($rootid, $depth, '');
        foreach ($tree->tree as $node) {
            $pids[] = intval($node['row']['uid']);
        }

        // "pid IN ()" is not valid SQL, so skip the lookup when no page was found.
        $pids = array_unique($pids);
        if (empty($pids)) {
            return $configurationsForBranch;
        }

        $res = $this->db->exec_SELECTquery(
            '*',
            'tx_crawler_configuration',
            'pid IN (' . implode(',', $pids) . ') ' .
            \TYPO3\CMS\Backend\Utility\BackendUtility::BEenableFields('tx_crawler_configuration') .
            \TYPO3\CMS\Backend\Utility\BackendUtility::deleteClause('tx_crawler_configuration') . ' ' .
            \TYPO3\CMS\Backend\Utility\BackendUtility::versioningPlaceholderClause('tx_crawler_configuration') . ' '
        );

        // Only fetch from (and free) a result that was actually produced.
        if ($res) {
            while ($row = $this->db->sql_fetch_assoc($res)) {
                $configurationsForBranch[] = $row['name'];
            }
            $this->db->sql_free_result($res);
        }

        return $configurationsForBranch;
    }
680
681
    /**
682
     * Check if a user has access to an item
683
     * (e.g. get the group list of the current logged in user from $GLOBALS['TSFE']->gr_list)
684
     *
685
     * @see \TYPO3\CMS\Frontend\Page\PageRepository::getMultipleGroupsWhereClause()
686
     * @param  string $groupList    Comma-separated list of (fe_)group UIDs from a user
687
     * @param  string $accessList   Comma-separated list of (fe_)group UIDs of the item to access
688
     * @return bool                 TRUE if at least one of the users group UIDs is in the access list or the access list is empty
689
     * @author Fabrizio Branca <[email protected]>
690
     * @since 2009-01-19
691
     */
692 3
    function hasGroupAccess($groupList, $accessList) {
        // An item without any access restriction is available to everybody.
        if (empty($accessList)) {
            return true;
        }

        $userGroupUids = \TYPO3\CMS\Core\Utility\GeneralUtility::intExplode(',', $groupList);

        // Access is granted as soon as one of the user's groups appears in the access list.
        $accessGranted = false;
        foreach ($userGroupUids as $userGroupUid) {
            if (\TYPO3\CMS\Core\Utility\GeneralUtility::inList($accessList, $userGroupUid)) {
                $accessGranted = true;
                break;
            }
        }

        return $accessGranted;
    }
703
704
    /**
705
     * Parse GET vars of input Query into array with key=>value pairs
706
     *
707
     * @param  string  $inputQuery  Input query string
708
     * @return array                Keys are Get var names, values are the values of the GET vars.
709
     */
710 3
    function parseParams($inputQuery) {
        $paramKeyValues = array();

        // Split the query string into its "name=value" pairs.
        foreach (explode('&', $inputQuery) as $paramAndValue) {
            // Pad to two elements so a parameter without "=" (e.g. "&flag") gets an
            // empty value instead of triggering an undefined-offset notice.
            list($name, $value) = array_pad(explode('=', $paramAndValue, 2), 2, '');

            // Pairs with an empty name (e.g. from "&&" or "=x") are dropped.
            if (strlen($name) > 0) {
                $paramKeyValues[rawurldecode($name)] = rawurldecode($value);
            }
        }

        return $paramKeyValues;
    }
724
725
    /**
726
     * Will expand the parameters configuration to individual values. This follows a certain syntax of the value of each parameter.
727
     * Syntax of values:
728
     * - Basically: If the value is wrapped in [...] it will be expanded according to the following syntax, otherwise the value is taken literally
729
     * - Configuration is split by "|" and the parts are processed individually and finally added together
730
     * - For each configuration part:
731
     *         - "[int]-[int]" = Integer range, will be expanded to all values in between, values included, starting from low to high (max. 1000). Example "1-34" or "-40--30"
732
     *         - "_TABLE:[TCA table name];[_PID:[optional page id, default is current page]];[_ENABLELANG:1]" = Look up of table records from PID, filtering out deleted records. Example "_TABLE:tt_content; _PID:123"
733
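     *        _ENABLELANG:1 picks only original records without their language overlays
     *        Further optional keys: _FIELD (column whose values are used, default "uid"), _PIDFIELD (column compared with the page id, default "pid"),
     *        _WHERE (additional SQL appended to the WHERE clause), _ADDTABLE (string appended to the table name in the FROM part)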
734
     *         - Default: Literal value
735
     *
736
     * @param    array        Array with key (GET var name) and values (value of GET var which is configuration for expansion)
737
     * @param    integer        Current page ID
738
     * @return    array        Array with key (GET var name) with the value being an array of all possible values for that key.
739
     */
740
    function expandParameters($paramArray, $pid)    {
        global $TCA;

        // Traverse parameter names:
        foreach ($paramArray as $p => $v) {
            $v = trim($v);

            // A value that is not wrapped in [...] is literal and becomes the only value of its set.
            if (substr($v, 0, 1) !== '[' || substr($v, -1) !== ']') {
                $paramArray[$p] = array($v);
                continue;
            }

            // Strip the brackets and start with an empty value set for this parameter.
            $v = substr($v, 1, -1);
            $paramArray[$p] = array();

            // Explode parts and traverse them:
            foreach (explode('|', $v) as $pV) {

                if (preg_match('/^(-?[0-9]+)\s*-\s*(-?[0-9]+)$/', trim($pV), $reg)) {
                    // Integer range (e.g. "1-34" or "-40--30", read: minus 40 to minus 30).
                    // Both bounds are cast to int so that every generated value has the same type.
                    $rangeStart = (int)$reg[1];
                    $rangeEnd = (int)$reg[2];

                    // Swap if first is larger than last:
                    if ($rangeStart > $rangeEnd) {
                        $swap = $rangeEnd;
                        $rangeEnd = $rangeStart;
                        $rangeStart = $swap;
                    }

                    // Traverse range, add values; at most 1000 values are generated per range.
                    $runAwayBrake = 1000;
                    for ($a = $rangeStart; $a <= $rangeEnd && $runAwayBrake > 0; $a++, $runAwayBrake--) {
                        $paramArray[$p][] = $a;
                    }
                } elseif (substr(trim($pV), 0, 7) == '_TABLE:') {

                    // Parse the ";"-separated "key:value" sub-parameters.
                    $subpartParams = array();
                    foreach (\TYPO3\CMS\Core\Utility\GeneralUtility::trimExplode(';', $pV) as $spV) {
                        // Pad with NULL so a sub-part without ":" stays "not set" for the isset() checks below.
                        list($pKey, $pVal) = array_pad(\TYPO3\CMS\Core\Utility\GeneralUtility::trimExplode(':', $spV), 2, null);
                        $subpartParams[$pKey] = $pVal;
                    }

                    $table = isset($subpartParams['_TABLE']) ? $subpartParams['_TABLE'] : '';

                    // Only tables known to TCA are looked up:
                    if (isset($TCA[$table])) {
                        $lookUpPid = isset($subpartParams['_PID']) ? intval($subpartParams['_PID']) : $pid;
                        $pidField = isset($subpartParams['_PIDFIELD']) ? trim($subpartParams['_PIDFIELD']) : 'pid';
                        // NOTE(review): _WHERE and _ADDTABLE are concatenated into the SQL statement unescaped.
                        // Presumably they can only be set by trusted backend users via the crawler configuration - confirm.
                        $where = isset($subpartParams['_WHERE']) ? $subpartParams['_WHERE'] : '';
                        $addTable = isset($subpartParams['_ADDTABLE']) ? $subpartParams['_ADDTABLE'] : '';

                        $fieldName = !empty($subpartParams['_FIELD']) ? $subpartParams['_FIELD'] : 'uid';

                        // The selected field must be "uid" or a column configured in TCA.
                        if ($fieldName === 'uid' || !empty($TCA[$table]['columns'][$fieldName])) {

                            $andWhereLanguage = '';
                            $transOrigPointerField = isset($TCA[$table]['ctrl']['transOrigPointerField'])
                                ? $TCA[$table]['ctrl']['transOrigPointerField']
                                : '';

                            // _ENABLELANG restricts the lookup to records whose translation-origin pointer is <= 0.
                            if (!empty($subpartParams['_ENABLELANG']) && $transOrigPointerField) {
                                $andWhereLanguage = ' AND ' . $this->db->quoteStr($transOrigPointerField, $table) . ' <= 0 ';
                            }

                            $where = $this->db->quoteStr($pidField, $table) . '=' . intval($lookUpPid) . ' ' .
                                $andWhereLanguage . $where;

                            // Rows are indexed by $fieldName, so the array keys are the wanted values.
                            $rows = $this->db->exec_SELECTgetRows(
                                $fieldName,
                                $table . $addTable,
                                $where . \TYPO3\CMS\Backend\Utility\BackendUtility::deleteClause($table),
                                '',
                                '',
                                '',
                                $fieldName
                            );

                            if (is_array($rows)) {
                                $paramArray[$p] = array_merge($paramArray[$p], array_keys($rows));
                            }
                        }
                    }
                } else {
                    // Just add value:
                    $paramArray[$p][] = $pV;
                }

                // Hook for processing own expandParameters place holder
                if (isset($GLOBALS['TYPO3_CONF_VARS']['SC_OPTIONS']['crawler/class.tx_crawler_lib.php']['expandParameters'])
                    && is_array($GLOBALS['TYPO3_CONF_VARS']['SC_OPTIONS']['crawler/class.tx_crawler_lib.php']['expandParameters'])
                ) {
                    $_params = array(
                        // Objects are passed by handle, a reference to $this is not needed.
                        'pObj' => $this,
                        // Passed by reference so that hooks can add values to the set.
                        'paramArray' => &$paramArray,
                        'currentKey' => $p,
                        'currentValue' => $pV,
                        'pid' => $pid
                    );
                    foreach ($GLOBALS['TYPO3_CONF_VARS']['SC_OPTIONS']['crawler/class.tx_crawler_lib.php']['expandParameters'] as $_funcRef) {
                        \TYPO3\CMS\Core\Utility\GeneralUtility::callUserFunction($_funcRef, $_params, $this);
                    }
                }
            }

            // Make unique set of values and sort the parameter array by key:
            $paramArray[$p] = array_unique($paramArray[$p]);
            ksort($paramArray);
        }

        return $paramArray;
    }
850
851
    /**
852
     * Compiling URLs from parameter array (output of expandParameters())
853
     * The number of URLs will be the multiplication of the number of parameter values for each key
854
     *
855
     * @param  array  $paramArray   Output of expandParameters(): Array with keys (GET var names) and for each an array of values
856
     * @param  array  $urls         URLs accumulated in this array (for recursion)
857
     * @return array                URLs accumulated; once more than 'maxCompileUrls' URLs have been compiled, adding further values of the current value set to a URL is cut short
858
     */
859 3
    public function compileUrls($paramArray, $urls = array()) {

        // Nothing left to combine (or nothing to combine with): hand back what was accumulated.
        if (!count($paramArray) || !is_array($urls)) {
            return $urls;
        }

        // The limit does not change while compiling, so it is resolved once instead of on
        // every inner iteration. A missing setting falls back to the default of 10000.
        $maxCompileUrls = \TYPO3\CMS\Core\Utility\MathUtility::forceIntegerInRange(
            isset($this->extensionSettings['maxCompileUrls']) ? $this->extensionSettings['maxCompileUrls'] : 0,
            1,
            1000000000,
            10000
        );

        // Shift the first parameter (name and value set) off the stack:
        reset($paramArray);
        $varName = key($paramArray);
        $valueSet = array_shift($paramArray);

        // Combine every URL collected so far with every value of the set:
        $newUrls = array();
        foreach ($urls as $url) {
            foreach ($valueSet as $val) {
                // An empty value leaves the URL untouched instead of appending "&name=".
                $newUrls[] = $url . (strcmp($val, '') ? '&' . rawurlencode($varName) . '=' . rawurlencode($val) : '');

                // NOTE(review): this only leaves the inner loop, so every remaining URL of the
                // outer loop still contributes one more entry. Confirm whether "break 2" was intended.
                if (count($newUrls) > $maxCompileUrls) {
                    break;
                }
            }
        }

        // Continue with the remaining parameters:
        return $this->compileUrls($paramArray, $newUrls);
    }
884
885
    /************************************
886
     *
887
     * Crawler log
888
     *
889
     ************************************/
890
891
    /**
892
     * Return array of records from crawler queue for input page ID
893
     *
894
     * @param  integer $id              Page ID for which to look up log entries.
895
     * @param  string  $filter          Filter: "all" => all entries, "pending" => all that is not yet run, "finished" => all complete ones
896
     * @param  boolean $doFlush         If TRUE, the matching entries are DELETED(!) instead of returned!
897
     * @param  boolean $doFullFlush
898
     * @param  integer $itemsPerPage    Limit the amount of entries per page default is 10
899
     * @return array
900
     */
901
    public function getLogEntriesForPageId($id, $filter = '', $doFlush = FALSE, $doFullFlush = FALSE, $itemsPerPage = 10) {
        // FIXME: Write Unit tests for Filters
        // Translate the filter keyword into an additional condition on exec_time.
        if ($filter == 'pending') {
            $addWhere = ' AND exec_time=0';
        } elseif ($filter == 'finished') {
            $addWhere = ' AND exec_time>0';
        } else {
            $addWhere = '';
        }

        $pageCondition = 'page_id=' . intval($id);

        // FIXME: Write unit test that ensures that the right records are deleted.
        if ($doFlush) {
            // A full flush is not limited to the page, but still honours the filter.
            $flushScope = $doFullFlush ? '1=1' : $pageCondition;
            $this->flushQueue($flushScope . $addWhere);
            return array();
        }

        // An empty limit returns all entries.
        $limit = intval($itemsPerPage) > 0 ? intval($itemsPerPage) : '';

        return $this->db->exec_SELECTgetRows(
            '*',
            'tx_crawler_queue',
            $pageCondition . $addWhere,
            '',
            'scheduled DESC',
            $limit
        );
    }
926
927
    /**
928
     * Return array of records from crawler queue for input set ID
929
     *
930
     * @param    integer        Set ID for which to look up log entries.
931
     * @param    string        Filter: "all" => all entries, "pending" => all that is not yet run, "finished" => all complete ones
932
     * @param    boolean        If TRUE, the matching entries are DELETED(!) instead of returned!
933
     * @param    integer        Limit the amount of entries per page; default is 10
934
     * @return    array
935
     */
936
    public function getLogEntriesForSetId($set_id,$filter='',$doFlush=FALSE, $doFullFlush=FALSE, $itemsPerPage=10)    {
        // FIXME: Write Unit tests for Filters
        // Translate the filter keyword into an additional condition on exec_time.
        if ($filter == 'pending') {
            $addWhere = ' AND exec_time=0';
        } elseif ($filter == 'finished') {
            $addWhere = ' AND exec_time>0';
        } else {
            $addWhere = '';
        }

        $setCondition = 'set_id=' . intval($set_id) . $addWhere;

        // FIXME: Write unit test that ensures that the right records are deleted.
        if ($doFlush) {
            // NOTE(review): a full flush passes an empty condition and thereby ignores $filter,
            // whereas getLogEntriesForPageId() keeps the filter on a full flush - confirm which is intended.
            $flushCondition = $doFullFlush ? '' : $setCondition;
            $this->flushQueue($flushCondition);
            return array();
        }

        // An empty limit returns all entries.
        $limit = intval($itemsPerPage) > 0 ? intval($itemsPerPage) : '';

        return $this->db->exec_SELECTgetRows(
            '*',
            'tx_crawler_queue',
            $setCondition,
            '',
            'scheduled DESC',
            $limit
        );
    }
960
961
    /**
962
     * Removes queue entries
963
     *
964
     * @param string $where    SQL WHERE clause selecting the entries which should be removed; an empty string removes all entries
965
     * @return void
966
     */
967
    protected function flushQueue($where='') {

        // An empty condition means "all entries".
        $realWhere = strlen($where) > 0 ? $where : '1=1';

        // Observers are told, per set, which entries are about to be removed.
        if (tx_crawler_domain_events_dispatcher::getInstance()->hasObserver('queueEntryFlush')) {
            $groups = $this->db->exec_SELECTgetRows('DISTINCT set_id', 'tx_crawler_queue', $realWhere);

            // The lookup does not return an array when the query fails; nothing to announce then.
            if (is_array($groups)) {
                foreach ($groups as $group) {
                    // The caller's condition is wrapped in parentheses so that the added set
                    // restriction applies to all of it; set_id is cast like everywhere else in this class.
                    $entries = $this->db->exec_SELECTgetRows(
                        'uid, set_id',
                        'tx_crawler_queue',
                        '(' . $realWhere . ') AND set_id=' . intval($group['set_id'])
                    );
                    tx_crawler_domain_events_dispatcher::getInstance()->post('queueEntryFlush', $group['set_id'], $entries);
                }
            }
        }

        $this->db->exec_DELETEquery('tx_crawler_queue', $realWhere);
    }
980
981
    /**
982
     * Adding call back entries to log (called from hooks typically, see indexed search class "class.crawler.php")
983
     *
984
     * @param    integer        Set ID
985
     * @param    array        Parameters to pass to call back function
986
     * @param    string        Call back object reference, eg. 'EXT:indexed_search/class.crawler.php:&tx_indexedsearch_crawler'
987
     * @param    integer        Page ID to attach it to
988
     * @param    integer        Time at which to activate
989
     * @return    void
990
     */
991
    function addQueueEntry_callBack($setId,$params,$callBack,$page_id=0,$schedule=0) {

        // Anything but an array is replaced by an empty parameter set.
        $callbackParameters = is_array($params) ? $params : array();
        $callbackParameters['_CALLBACKOBJ'] = $callBack;

        // Without an explicit schedule the entry is due at the current time.
        $scheduledTime = intval($schedule);
        if (!$scheduledTime) {
            $scheduledTime = $this->getCurrentTime();
        }

        // Store the entry as not yet executed (exec_time = 0) and without result data.
        $this->db->exec_INSERTquery(
            'tx_crawler_queue',
            array(
                'page_id' => intval($page_id),
                'parameters' => serialize($callbackParameters),
                'scheduled' => $scheduledTime,
                'exec_time' => 0,
                'set_id' => intval($setId),
                'result_data' => '',
            )
        );
    }
1008
1009
1010
1011
1012
1013
1014
1015
1016
1017
1018
1019
    /************************************
1020
     *
1021
     * URL setting
1022
     *
1023
     ************************************/
1024
1025
    /**
1026
     * Setting a URL for crawling:
1027
     *
1028
     * @param    integer        Page ID
1029
     * @param    string        Complete URL
1030
     * @param    array        Sub configuration array (from TS config)
1031
     * @param    integer        Scheduled-time
1032
     * @param     string        (optional) configuration hash
1033
     * @param     bool        (optional) skip inner duplication check
1034
     * @return    bool        true if the url was added, false if it already existed
1035
     */
1036
    function addUrl (
0 ignored issues
show
Best Practice introduced by
It is generally recommended to explicitly declare the visibility for methods.

Adding explicit visibility (private, protected, or public) is generally recommend to communicate to other developers how, and from where this method is intended to be used.

Loading history...
1037
        $id,
1038
        $url,
1039
        array $subCfg,
1040
        $tstamp,
1041
        $configurationHash='',
1042
        $skipInnerDuplicationCheck=false
1043
    ) {
1044
1045
        $urlAdded = false;
1046
1047
            // Creating parameters:
1048
        $parameters = array(
1049
            'url' => $url
1050
        );
1051
1052
            // fe user group simulation:
1053
        $uGs = implode(',',array_unique(\TYPO3\CMS\Core\Utility\GeneralUtility::intExplode(',',$subCfg['userGroups'],1)));
0 ignored issues
show
Documentation introduced by
1 is of type integer, but the function expects a boolean.

It seems like the type of the argument is not accepted by the function/method which you are calling.

In some cases, in particular if PHP’s automatic type-juggling kicks in this might be fine. In other cases, however this might be a bug.

We suggest to add an explicit type cast like in the following example:

function acceptsInteger($int) { }

$x = '123'; // string "123"

// Instead of
acceptsInteger($x);

// we recommend to use
acceptsInteger((integer) $x);
Loading history...
1054
        if ($uGs)    {
1055
            $parameters['feUserGroupList'] = $uGs;
1056
        }
1057
1058
            // Setting processing instructions
1059
        $parameters['procInstructions'] = \TYPO3\CMS\Core\Utility\GeneralUtility::trimExplode(',',$subCfg['procInstrFilter']);
1060
        if (is_array($subCfg['procInstrParams.']))    {
1061
            $parameters['procInstrParams'] = $subCfg['procInstrParams.'];
1062
        }
1063
1064
1065
            // Compile value array:
1066
        $parameters_serialized = serialize($parameters);
1067
        $fieldArray = array(
1068
            'page_id' => intval($id),
1069
            'parameters' => $parameters_serialized,
1070
            'parameters_hash' => \TYPO3\CMS\Core\Utility\GeneralUtility::shortMD5($parameters_serialized),
1071
            'configuration_hash' => $configurationHash,
1072
            'scheduled' => $tstamp,
1073
            'exec_time' => 0,
1074
            'set_id' => intval($this->setID),
1075
            'result_data' => '',
1076
            'configuration' => $subCfg['key'],
1077
        );
1078
1079
        if ($this->registerQueueEntriesInternallyOnly)    {
0 ignored issues
show
Bug Best Practice introduced by
The expression $this->registerQueueEntriesInternallyOnly of type array is implicitly converted to a boolean; are you sure this is intended? If so, consider using ! empty($expr) instead to make it clear that you intend to check for an array without elements.

This check marks implicit conversions of arrays to boolean values in a comparison. While in PHP an empty array is considered to be equal (but not identical) to false, this is not always apparent.

Consider making the comparison explicit by using empty(..) or ! empty(...) instead.

Loading history...
1080
                //the entries will only be registered and not stored to the database
1081
            $this->queueEntries[] = $fieldArray;
1082
        } else {
1083
1084
            if(!$skipInnerDuplicationCheck){
1085
                    // check if there is already an equal entry
1086
                $rows = $this->getDuplicateRowsIfExist($tstamp,$fieldArray);
1087
            }
1088
1089
            if (count($rows) == 0) {
1090
                $this->db->exec_INSERTquery('tx_crawler_queue', $fieldArray);
1091
                $uid = $this->db->sql_insert_id();
1092
                $rows[] = $uid;
0 ignored issues
show
Bug introduced by
The variable $rows does not seem to be defined for all execution paths leading up to this point.

If you define a variable conditionally, it can happen that it is not defined for all execution paths.

Let’s take a look at an example:

function myFunction($a) {
    switch ($a) {
        case 'foo':
            $x = 1;
            break;

        case 'bar':
            $x = 2;
            break;
    }

    // $x is potentially undefined here.
    echo $x;
}

In the above example, the variable $x is defined if you pass “foo” or “bar” as argument for $a. However, since the switch statement has no default case statement, if you pass any other value, the variable $x would be undefined.

Available Fixes

  1. Check for existence of the variable explicitly:

    function myFunction($a) {
        switch ($a) {
            case 'foo':
                $x = 1;
                break;
    
            case 'bar':
                $x = 2;
                break;
        }
    
        if (isset($x)) { // Make sure it's always set.
            echo $x;
        }
    }
    
  2. Define a default value for the variable:

    function myFunction($a) {
        $x = ''; // Set a default which gets overridden for certain paths.
        switch ($a) {
            case 'foo':
                $x = 1;
                break;
    
            case 'bar':
                $x = 2;
                break;
        }
    
        echo $x;
    }
    
  3. Add a value for the missing path:

    function myFunction($a) {
        switch ($a) {
            case 'foo':
                $x = 1;
                break;
    
            case 'bar':
                $x = 2;
                break;
    
            // We add support for the missing case.
            default:
                $x = '';
                break;
        }
    
        echo $x;
    }
    
Loading history...
1093
                $urlAdded = true;
1094
                tx_crawler_domain_events_dispatcher::getInstance()->post('urlAddedToQueue',$this->setID,array('uid' => $uid, 'fieldArray' => $fieldArray));
1095
            }else{
1096
                tx_crawler_domain_events_dispatcher::getInstance()->post('duplicateUrlInQueue',$this->setID,array('rows' => $rows, 'fieldArray' => $fieldArray));
1097
            }
1098
        }
1099
1100
        return $urlAdded;
1101
    }
1102
1103
    /**
     * This method determines duplicates for a queue entry with the same parameters and this timestamp.
     * If the timestamp is in the past, it will check if there is any unprocessed queue entry in the past.
     * If the timestamp is in the future it will check, if the queued entry has exactly the same timestamp
     *
     * Only entries without exec_time and without process_id are considered, and they have to
     * match the page_id and parameters_hash of the given field array.
     *
     * @param int $tstamp Scheduled time of the entry to be checked
     * @param array $fieldArray Queue record; the keys "page_id" and "parameters_hash" are read
     * @author Fabrizio Branca
     * @author Timo Schmidt
     * @return array List of qid values of the duplicates found (empty if there are none)
     */
    protected function getDuplicateRowsIfExist($tstamp,$fieldArray){
        $rows = array();

            // the timestamp ends up in the WHERE clause, so make sure it is a plain integer
        $tstamp = (int)$tstamp;
        $currentTime = $this->getCurrentTime();

        if ($tstamp <= $currentTime) {
                //if this entry is scheduled with "now"
            if ($this->extensionSettings['enableTimeslot']) {
                $timeBegin = $currentTime - 100;
                $timeEnd   = $currentTime + 100;
                $where     = ' ((scheduled BETWEEN '.$timeBegin.' AND '.$timeEnd.' ) OR scheduled <= '. $currentTime.') ';
            } else {
                $where = 'scheduled <= ' . $currentTime;
            }
        } else {
                //entry with a timestamp in the future need to have the same schedule time
            $where = 'scheduled = ' . $tstamp;
        }

        $result = $this->db->exec_SELECTgetRows(
            'qid',
            'tx_crawler_queue',
            $where.
            ' AND NOT exec_time' .
            ' AND NOT process_id '.
            ' AND page_id='.intval($fieldArray['page_id']).
            ' AND parameters_hash = ' . $this->db->fullQuoteStr($fieldArray['parameters_hash'], 'tx_crawler_queue')
        );

        if (is_array($result)) {
            foreach ($result as $value) {
                $rows[] = $value['qid'];
            }
        }

        return $rows;
    }
1154
1155
    /**
     * Provides the current system time as a unix timestamp.
     *
     * All time lookups of this class go through this method, so the clock
     * is read in exactly one place.
     *
     * @author Timo Schmidt <[email protected]>
     * @return int
     */
    public function getCurrentTime(){
        $now = time();

        return $now;
    }
1164
1165
1166
1167
    /************************************
1168
     *
1169
     * URL reading
1170
     *
1171
     ************************************/
1172
1173
    /**
     * Read URL for single queue entry
     *
     * Loads the queue record, marks it as being processed by setting exec_time, executes it
     * via readUrl_exec() and finally stores the serialized result in the record.
     *
     * @param integer $queueId
     * @param boolean $force If set, will process even if exec_time has been set!
     * @return integer 0, or self::CLI_STATUS_POLLABLE_PROCESSED if a registered "pollSuccess"
     *                 instruction reported success; 0 as well if no matching queue entry was found
     */
    public function readUrl($queueId, $force = FALSE) {
        $ret = 0;
        if ($this->debugMode) {
            \TYPO3\CMS\Core\Utility\GeneralUtility::devlog('crawler-readurl start ' . microtime(true), __FUNCTION__);
        }

        // Get entry:
        $queueRows = $this->db->exec_SELECTgetRows('*', 'tx_crawler_queue',
            'qid=' . intval($queueId) . ($force ? '' : ' AND exec_time=0 AND process_scheduled > 0'));
        $queueRec = is_array($queueRows) ? reset($queueRows) : FALSE;

        if (!is_array($queueRec)) {
            // Nothing to process; still return an integer as documented.
            return $ret;
        }

        $pageUidRootTypoScript = \AOE\Crawler\Utility\TypoScriptUtility::getPageUidForTypoScriptRootTemplateInRootLine((int)$queueRec['page_id']);
        $this->initTSFE((int)$pageUidRootTypoScript);

        \AOE\Crawler\Utility\SignalSlotUtility::emitSignal(
            __CLASS__,
            \AOE\Crawler\Utility\SignalSlotUtility::SIGNNAL_QUEUEITEM_PREPROCESS,
            array($queueId, &$queueRec)
        );

        // Set exec_time to lock record:
        $field_array = array('exec_time' => $this->getCurrentTime());

        if (isset($this->processID)) {
            //if multiprocessing is used we need to store the id of the process which has handled this entry
            $field_array['process_id_completed'] = $this->processID;
        }
        $this->db->exec_UPDATEquery('tx_crawler_queue', 'qid=' . intval($queueId), $field_array);

        $result = $this->readUrl_exec($queueRec);

        // readUrl_exec() may hand back a non-array value (e.g. the string "ERROR" or FALSE),
        // in which case there is no content to decode.
        // NOTE(review): for regular requests the content originates from the crawled response and is
        // passed to unserialize() - confirm that this data can be trusted.
        $resultData = (is_array($result) && isset($result['content'])) ? unserialize($result['content']) : FALSE;

        //atm there's no need to point to specific pollable extensions
        if (is_array($resultData) && is_array($GLOBALS['TYPO3_CONF_VARS']['EXTCONF']['crawler']['pollSuccess'])) {
            foreach ($GLOBALS['TYPO3_CONF_VARS']['EXTCONF']['crawler']['pollSuccess'] as $pollable) {
                // only check the success value if the instruction is running
                // it is important to name the pollSuccess key same as the procInstructions key
                if (is_array($resultData['parameters']['procInstructions'])
                    && in_array($pollable, $resultData['parameters']['procInstructions'])
                    && !empty($resultData['success'][$pollable])
                ) {
                    $ret |= self::CLI_STATUS_POLLABLE_PROCESSED;
                }
            }
        }

        // Set result in log which also denotes the end of the processing of this entry.
        $field_array = array('result_data' => serialize($result));

        \AOE\Crawler\Utility\SignalSlotUtility::emitSignal(
            __CLASS__,
            \AOE\Crawler\Utility\SignalSlotUtility::SIGNNAL_QUEUEITEM_POSTPROCESS,
            array($queueId, &$field_array)
        );

        $this->db->exec_UPDATEquery('tx_crawler_queue', 'qid=' . intval($queueId), $field_array);

        if ($this->debugMode) {
            \TYPO3\CMS\Core\Utility\GeneralUtility::devlog('crawler-readurl stop ' . microtime(true), __FUNCTION__);
        }

        return $ret;
    }
1247
1248
    /**
     * Read URL for not-yet-inserted log-entry
     *
     * The given record is inserted into tx_crawler_queue with exec_time set to the current time,
     * executed via readUrl_exec() and afterwards updated with the serialized result.
     *
     * @param    array        Queue field array
     * @return    mixed        Whatever readUrl_exec() returned for the record
     */
    function readUrlFromArray($field_array) {
            // Lock the record right away by giving it an exec_time, then store it:
        $field_array['exec_time'] = $this->getCurrentTime();
        $this->db->exec_INSERTquery('tx_crawler_queue', $field_array);

        $insertedQid = $this->db->sql_insert_id();
        $field_array['qid'] = $insertedQid;

        $crawlResult = $this->readUrl_exec($field_array);

            // Writing the result into the log marks the end of the processing of this entry.
        $this->db->exec_UPDATEquery(
            'tx_crawler_queue',
            'qid='.intval($insertedQid),
            array('result_data' => serialize($crawlResult))
        );

        return $crawlResult;
    }
1269
1270
    /**
     * Read URL for a queue record
     *
     * If the unserialized parameters contain a "_CALLBACKOBJ" entry, that object is instantiated
     * and its crawler_execute() method is called; otherwise the URL from the parameters is
     * requested as a regular frontend request.
     *
     * @param    array        Queue record; the fields "parameters", "qid" and "set_id" are read
     * @return    mixed        Result array ("content" plus request details), the string "ERROR" if the
     *                         parameters could not be decoded, or whatever requestUrl() returned
     */
    public function readUrl_exec($queueRec) {
            // Decode parameters:
        $parameters = unserialize($queueRec['parameters']);
        $result = 'ERROR';

        if (is_array($parameters)) {
            if (!empty($parameters['_CALLBACKOBJ'])) {    // Calling object:
                $objRef = $parameters['_CALLBACKOBJ'];
                    // no reference assignment here: only variables may be assigned by reference
                $callBackObj = \TYPO3\CMS\Core\Utility\GeneralUtility::getUserObj($objRef);
                if (is_object($callBackObj)) {
                    unset($parameters['_CALLBACKOBJ']);
                    $result = array('content' => serialize($callBackObj->crawler_execute($parameters,$this)));
                } else {
                    $result = array('content' => 'No object: '.$objRef);
                }
            } else {    // Regular FE request:

                    // Prepare:
                $crawlerId = $queueRec['qid'].':'.md5($queueRec['qid'].'|'.$queueRec['set_id'].'|'.$GLOBALS['TYPO3_CONF_VARS']['SYS']['encryptionKey']);

                    // Get result:
                $result = $this->requestUrl($parameters['url'],$crawlerId);

                tx_crawler_domain_events_dispatcher::getInstance()->post('urlCrawled',$queueRec['set_id'],array('url' => $parameters['url'], 'result' => $result));
            }
        }

        return $result;
    }
1305
1306
    /**
     * Gets the content of a URL.
     *
     * If "makeDirectRequests" is enabled in the extension settings the request is delegated to
     * sendDirectRequest(). Otherwise the URL is fetched through a socket connection (via the
     * configured curl proxy server, if any). With "follow30x" enabled, redirects are followed
     * until the recursion limit is used up.
     *
     * @param  string   $originalUrl    URL to read
     * @param  string   $crawlerId      Crawler ID string (qid + hash to verify)
     * @param  integer  $timeout        Timeout time
     * @param  integer  $recursion      Recursion limiter for 302 redirects
     * @return array|boolean            Array with content ("request", "headers", "content" and, for
     *                                  followed redirects, "parentRequest"); FALSE if the URL is invalid,
     *                                  the connection failed or the recursion limit was reached
     */
    public function requestUrl($originalUrl, $crawlerId, $timeout=2, $recursion=10) {

        if (!$recursion) {
            return false;
        }

            // Parse URL, checking for scheme:
        $url = parse_url($originalUrl);

        if ($url === FALSE) {
                // $url is FALSE here, so the original string has to be logged
            if (TYPO3_DLOG) \TYPO3\CMS\Core\Utility\GeneralUtility::devLog(sprintf('Could not parse_url() for string "%s"', $originalUrl), 'crawler', 4, array('crawlerId' => $crawlerId));
            return FALSE;
        }

        $scheme = isset($url['scheme']) ? $url['scheme'] : '';
        if (!in_array($scheme, array('','http','https'), true)) {
            if (TYPO3_DLOG) \TYPO3\CMS\Core\Utility\GeneralUtility::devLog(sprintf('Scheme does not match for url "%s"', $originalUrl), 'crawler', 4, array('crawlerId' => $crawlerId));
            return FALSE;
        }

            // direct request
        if ($this->extensionSettings['makeDirectRequests']) {
            return $this->sendDirectRequest($originalUrl, $crawlerId);
        }

        $reqHeaders = $this->buildRequestHeaderArray($url, $crawlerId);

            // thanks to Pierrick Caillon for adding proxy support
        $rurl = $url;

        if ($GLOBALS['TYPO3_CONF_VARS']['SYS']['curlUse'] && $GLOBALS['TYPO3_CONF_VARS']['SYS']['curlProxyServer']) {
            $rurl = parse_url($GLOBALS['TYPO3_CONF_VARS']['SYS']['curlProxyServer']);
                // a proxy expects the absolute URL in the request line
            $url['path'] = $scheme . '://' . $url['host']
                . ((isset($url['port']) && $url['port'] > 0) ? ':' . $url['port'] : '')
                . (isset($url['path']) ? $url['path'] : '');
            $reqHeaders = $this->buildRequestHeaderArray($url, $crawlerId);
        }

        $host = $rurl['host'];
        $hasPort = isset($rurl['port']) && $rurl['port'] > 0;

        if ($scheme == 'https') {
            $host = 'ssl://' . $host;
            $port = $hasPort ? $rurl['port'] : 443;
        } else {
            $port = $hasPort ? $rurl['port'] : 80;
        }

        $startTime = microtime(true);
        $fp = fsockopen($host, $port, $errno, $errstr, $timeout);

        if (!$fp) {
            if (TYPO3_DLOG) \TYPO3\CMS\Core\Utility\GeneralUtility::devLog(sprintf('Error while opening "%s"', $originalUrl), 'crawler', 4, array('crawlerId' => $crawlerId));
            return FALSE;
        }

            // Request message:
        $msg = implode("\r\n",$reqHeaders)."\r\n\r\n";
        fputs ($fp, $msg);

            // Read response:
        $d = $this->getHttpResponseFromStream($fp);
        fclose ($fp);

        $time = microtime(true) - $startTime;
        $this->log($originalUrl .' '.$time);

            // Implode content and headers:
        $result = array(
            'request' => $msg,
            'headers' => implode('', $d['headers']),
            'content' => implode('', (array)$d['content'])
        );

        if ($this->extensionSettings['follow30x']) {
            $newUrl = $this->getRequestUrlFrom302Header(
                $d['headers'],
                isset($url['user']) ? $url['user'] : '',
                isset($url['pass']) ? $url['pass'] : ''
            );

            if ($newUrl) {
                    // Follow the redirect exactly once per hop, keeping the timeout and
                    // consuming one level of the recursion limit.
                $newRequestUrl = $this->requestUrl($newUrl, $crawlerId, $timeout, $recursion - 1);

                if (is_array($newRequestUrl)) {
                    $result = array_merge(array('parentRequest'=>$result), $newRequestUrl);
                } else {
                    if (TYPO3_DLOG) \TYPO3\CMS\Core\Utility\GeneralUtility::devLog(sprintf('Error while opening "%s"', $newUrl), 'crawler', 4, array('crawlerId' => $crawlerId));
                    return FALSE;
                }
            }
        }

        return $result;
    }
1398
1399
    /**
     * Determines the base path of the website frontend, e.g. "/cms/" if the site
     * is reachable at http://mydomain.com/cms/index.php.
     *
     * Sources, in order of precedence: the extension setting "frontendBasePath",
     * the absRefPrefix of the current TSFE and - unless running in CLI mode -
     * the TYPO3_SITE_PATH of the environment. Falls back to "/".
     *
     * @return string Base path of the website frontend, always with leading and trailing slash
     */
    protected function getFrontendBasePath() {
        $basePath = '/';
        $isCliRequest = defined('TYPO3_REQUESTTYPE') && (TYPO3_REQUESTTYPE & TYPO3_REQUESTTYPE_CLI);

        if (!empty($this->extensionSettings['frontendBasePath'])) {
            // Explicitly configured in the extension settings
            $basePath = $this->extensionSettings['frontendBasePath'];
        } elseif (!empty($GLOBALS['TSFE']->absRefPrefix)) {
            // Nothing configured: use config.absRefPrefix
            $basePath = $GLOBALS['TSFE']->absRefPrefix;
        } elseif (!$isCliRequest) {
            // Outside of CLI mode the $_SERVER environment knows the path
            $basePath = \TYPO3\CMS\Core\Utility\GeneralUtility::getIndpEnv('TYPO3_SITE_PATH');
        }

        // Normalize to '/<pathSegments>/':
        if ($basePath != '/') {
            $withLeadingSlash = '/' . ltrim($basePath, '/');
            $basePath = rtrim($withLeadingSlash, '/') . '/';
        }

        return $basePath;
    }
1428
1429
    /**
     * Runs the given command through the shell and hands back what it printed.
     *
     * The command string is passed to shell_exec() unchanged.
     *
     * @param string $command Shell command to be executed
     * @return string Outputted result of the command execution
     */
    protected function executeShellCommand($command) {
        return shell_exec($command);
    }
1439
1440
    /**
     * Reads HTTP response from the given stream.
     *
     * Everything up to the first empty line is treated as headers (trimmed, one item per line),
     * everything after it as content (untouched, one item per line read).
     *
     * @param  resource $streamPointer  Pointer to connection stream.
     * @return array                    Associative array with the following items:
     *                                  headers <array> Response headers sent by server.
     *                                  content <array> Content, with each line as an array item.
     */
    protected function getHttpResponseFromStream($streamPointer) {
        $response = array('headers' => array(), 'content' => array());

        if (!is_resource($streamPointer)) {
            return $response;
        }

            // read headers; compare against FALSE explicitly so that a line
            // consisting of "0" is not mistaken for the end of the stream
        while (($line = fgets($streamPointer, 2048)) !== false) {
            $line = trim($line);
            if ($line === '') {
                break;
            }
            $response['headers'][] = $line;
        }

            // read content
        while (($line = fgets($streamPointer, 2048)) !== false) {
            $response['content'][] = $line;
        }

        return $response;
    }
1470
1471
    /**
     * Appends a line to the log file named in the extension setting "logFileName".
     * Does nothing if no log file is configured; write errors are suppressed.
     *
     * @param string $message Text that is written directly after the "Ymd His" timestamp
     */
    protected function log($message) {
        if (empty($this->extensionSettings['logFileName'])) {
            return;
        }

        $logLine = date('Ymd His') . $message . "\n";
        @file_put_contents($this->extensionSettings['logFileName'], $logLine, FILE_APPEND);
    }
1479
1480
    /**
     * Builds HTTP request headers.
     *
     * Creates the lines of a HTTP/1.0 GET request for the given URL. A backend user cookie is
     * added if the query string contains "ADMCMD_previewWS", and a basic authorization header
     * if the URL carries a user name.
     *
     * @param array $url URL parts as returned by parse_url(); "query", "user" and "pass" are optional
     * @param string $crawlerId Value of the X-T3crawler header
     *
     * @return array List of header lines (without line breaks)
     */
    protected function buildRequestHeaderArray(array $url, $crawlerId) {
            // parse_url() leaves out components that are not part of the URL
        $path  = isset($url['path']) ? $url['path'] : '';
        $query = isset($url['query']) ? (string)$url['query'] : '';
        $user  = isset($url['user']) ? $url['user'] : '';
        $pass  = isset($url['pass']) ? $url['pass'] : '';

        $reqHeaders = array();
        $reqHeaders[] = 'GET '.$path.($query ? '?'.$query : '').' HTTP/1.0';
        $reqHeaders[] = 'Host: '.$url['host'];
        if (stripos($query, 'ADMCMD_previewWS') !== false) {
            $reqHeaders[] = 'Cookie: $Version="1"; be_typo_user="1"; $Path=/';
        }
        $reqHeaders[] = 'Connection: close';
        if ($user != '') {
            $reqHeaders[] = 'Authorization: Basic '. base64_encode($user.':'.$pass);
        }
        $reqHeaders[] = 'X-T3crawler: '.$crawlerId;
        $reqHeaders[] = 'User-Agent: TYPO3 crawler';

        return $reqHeaders;
    }
1503
1504
    /**
     * Check if the submitted HTTP-Header contains a redirect location and built new crawler-url
     *
     * Only responses whose status line contains "301 Moved", "302 Found" or "302 Moved" are
     * considered. If a user name is given, the credentials are injected into the new URL.
     *
     * @param    array        HTTP Header lines, status line first
     * @param    string        HTTP Auth. User
     * @param    string        HTTP Auth. Password
     * @return    string|boolean    URL from redirection, FALSE if there is no (usable) redirect
     */
    protected function getRequestUrlFrom302Header($headers,$user='',$pass='') {
        if (!is_array($headers) || !isset($headers[0])) {
            return false;
        }

        $statusLine = $headers[0];
        if (!(stristr($statusLine,'301 Moved') || stristr($statusLine,'302 Found') || stristr($statusLine,'302 Moved'))) {
            return false;
        }

        $header = array();
        foreach ($headers as $hl) {
                // split at the first separator only, the value itself may contain ": "
            $tmp = explode(': ', $hl, 2);
            $name = trim($tmp[0]);
                // lines without a separator (e.g. the status line) have no value
            $header[$name] = isset($tmp[1]) ? trim($tmp[1]) : '';
            if ($name == 'Location') {
                break;
            }
        }

        if (!array_key_exists('Location', $header)) {
            return false;
        }

        if ($user == '') {
            return $header['Location'];
        }

            // Rebuild the target URL with the credentials of the original request
        $tmp = parse_url($header['Location']);
        if (!$tmp) {
            return false;
        }

        $newUrl = $tmp['scheme'] . '://' . $user . ':' . $pass . '@' . $tmp['host'];
        if (isset($tmp['port'])) {
                // keep a non-default port of the redirect target
            $newUrl .= ':' . $tmp['port'];
        }
        $newUrl .= isset($tmp['path']) ? $tmp['path'] : '';
        if (isset($tmp['query']) && $tmp['query'] != '') {
            $newUrl .= '?' . $tmp['query'];
        }

        return $newUrl;
    }
1532
1533
1534
1535
1536
1537
1538
1539
1540
    /**************************
1541
     *
1542
     * tslib_fe hooks:
1543
     *
1544
     **************************/
1545
1546
    /**
     * Initialization hook (called after database connection)
     * Takes the "HTTP_X_T3CRAWLER" header and looks up queue record and verifies if the session comes from the system (by comparing hashes)
     *
     * The header is expected in the form "<qid>:<hash>". On success the crawler state is stored
     * in the applicationData of $params['pObj']; otherwise the request is terminated.
     *
     * @param    array        Parameters from frontend
     * @param    object        TSFE object (reference under PHP5); not used
     * @return    void
     */
    public function fe_init(&$params, $ref) {

            // Authenticate crawler request:
        if (!isset($_SERVER['HTTP_X_T3CRAWLER'])) {
            return;
        }

            // The header is supplied by the client; pad so that a value without ":" yields an empty hash
        list($queueId, $hash) = array_pad(explode(':', (string)$_SERVER['HTTP_X_T3CRAWLER']), 2, '');

        $queueRows = $this->db->exec_SELECTgetRows('*','tx_crawler_queue','qid='.intval($queueId));
        $queueRec = is_array($queueRows) ? reset($queueRows) : FALSE;

        $isAuthenticated = FALSE;
        if (is_array($queueRec)) {
            $expectedHash = md5($queueRec['qid'].'|'.$queueRec['set_id'].'|'.$GLOBALS['TYPO3_CONF_VARS']['SYS']['encryptionKey']);
                // constant-time comparison, the hash acts as a secret
            $isAuthenticated = hash_equals($expectedHash, (string)$hash);
        }

            // If a crawler record was found and hash was matching, set it up:
        if ($isAuthenticated) {
            $params['pObj']->applicationData['tx_crawler']['running'] = TRUE;
            $params['pObj']->applicationData['tx_crawler']['parameters'] = unserialize($queueRec['parameters']);
            $params['pObj']->applicationData['tx_crawler']['log'] = array();
        } else {
            die('No crawler entry found!');
        }
    }
1571
1572
1573
1574
    /*****************************
1575
     *
1576
     * Compiling URLs to crawl - tools
1577
     *
1578
     *****************************/
1579
1580
    /**
1581
     * @param    integer        Root page id to start from.
1582
     * @param    integer        Depth of tree, 0=only id-page, 1= on sublevel, 99 = infinite
1583
     * @param    integer        Unix Time when the URL is timed to be visited when put in queue
1584
     * @param    integer        Number of requests per minute (creates the interleave between requests)
1585
     * @param    boolean        If set, submits the URLs to queue in database (real crawling)
1586
     * @param    boolean        If set (and submitcrawlUrls is false) will fill $downloadUrls with entries)
1587
     * @param    array        Array of processing instructions
1588
     * @param    array        Array of configuration keys
1589
     * @return    string        HTML code
1590
     */
1591
    function getPageTreeAndUrls(
0 ignored issues
show
Best Practice introduced by
It is generally recommended to explicitly declare the visibility for methods.

Adding explicit visibility (private, protected, or public) is generally recommend to communicate to other developers how, and from where this method is intended to be used.

Loading history...
1592
        $id,
1593
        $depth,
1594
        $scheduledTime,
1595
        $reqMinute,
1596
        $submitCrawlUrls,
1597
        $downloadCrawlUrls,
1598
        array $incomingProcInstructions,
1599
        array $configurationSelection
1600
    ) {
1601
1602
        global $BACK_PATH;
1603
        global $LANG;
1604
        if (!is_object($LANG)) {
1605
            $LANG = \TYPO3\CMS\Core\Utility\GeneralUtility::makeInstance('language');
1606
            $LANG->init(0);
1607
        }
1608
        $this->scheduledTime = $scheduledTime;
0 ignored issues
show
Bug introduced by
The property scheduledTime does not exist. Did you maybe forget to declare it?

In PHP it is possible to write to properties without declaring them. For example, the following is perfectly valid PHP code:

class MyClass { }

$x = new MyClass();
$x->foo = true;

Generally, it is good practice to explicitly declare properties to avoid accidental typos and provide IDE auto-completion:

class MyClass {
    public $foo;
}

$x = new MyClass();
$x->foo = true;
Loading history...
1609
        $this->reqMinute = $reqMinute;
0 ignored issues
show
Bug introduced by
The property reqMinute does not exist. Did you maybe forget to declare it?

In PHP it is possible to write to properties without declaring them. For example, the following is perfectly valid PHP code:

class MyClass { }

$x = new MyClass();
$x->foo = true;

Generally, it is a good practice to explictly declare properties to avoid accidental typos and provide IDE auto-completion:

class MyClass {
    public $foo;
}

$x = new MyClass();
$x->foo = true;
Loading history...
1610
        $this->submitCrawlUrls = $submitCrawlUrls;
0 ignored issues
show
Bug introduced by
The property submitCrawlUrls does not exist. Did you maybe forget to declare it?

In PHP it is possible to write to properties without declaring them. For example, the following is perfectly valid PHP code:

class MyClass { }

$x = new MyClass();
$x->foo = true;

Generally, it is a good practice to explictly declare properties to avoid accidental typos and provide IDE auto-completion:

class MyClass {
    public $foo;
}

$x = new MyClass();
$x->foo = true;
Loading history...
1611
        $this->downloadCrawlUrls = $downloadCrawlUrls;
0 ignored issues
show
Bug introduced by
The property downloadCrawlUrls does not exist. Did you maybe forget to declare it?

In PHP it is possible to write to properties without declaring them. For example, the following is perfectly valid PHP code:

class MyClass { }

$x = new MyClass();
$x->foo = true;

Generally, it is a good practice to explictly declare properties to avoid accidental typos and provide IDE auto-completion:

class MyClass {
    public $foo;
}

$x = new MyClass();
$x->foo = true;
Loading history...
1612
        $this->incomingProcInstructions = $incomingProcInstructions;
1613
        $this->incomingConfigurationSelection = $configurationSelection;
1614
1615
        $this->duplicateTrack = array();
1616
        $this->downloadUrls = array();
1617
1618
            // Drawing tree:
1619
            /* @var $tree \TYPO3\CMS\Backend\Tree\View\PageTreeView */
1620
        $tree = \TYPO3\CMS\Core\Utility\GeneralUtility::makeInstance('TYPO3\CMS\Backend\Tree\View\PageTreeView');
1621
        $perms_clause = $GLOBALS['BE_USER']->getPagePermsClause(1);
1622
        $tree->init('AND ' . $perms_clause);
1623
1624
        $pageinfo = \TYPO3\CMS\Backend\Utility\BackendUtility::readPageAccess($id, $perms_clause);
1625
        /** @var \TYPO3\CMS\Core\Imaging\IconFactory $iconFactory */
1626
        $iconFactory = \TYPO3\CMS\Core\Utility\GeneralUtility::makeInstance(IconFactory::class);
1627
1628
            // Set root row:
1629
        $tree->tree[] = [
1630
            'row' => $pageinfo,
1631
            'HTML' => $iconFactory->getIconForRecord('pages', $pageinfo, Icon::SIZE_SMALL)->render()
0 ignored issues
show
Security Bug introduced by
It seems like $pageinfo defined by \TYPO3\CMS\Backend\Utili...ess($id, $perms_clause) on line 1624 can also be of type false; however, TYPO3\CMS\Core\Imaging\I...ory::getIconForRecord() does only seem to accept array, did you maybe forget to handle an error condition?

This check looks for type mismatches where the missing type is false. This is usually indicative of an error condtion.

Consider the follow example

<?php

function getDate($date)
{
    if ($date !== null) {
        return new DateTime($date);
    }

    return false;
}

This function either returns a new DateTime object or false, if there was an error. This is a typical pattern in PHP programming to show that an error has occurred without raising an exception. The calling code should check for this returned false before passing on the value to another function or method that may not be able to handle a false.

Loading history...
1632
        ];
1633
1634
            // Get branch beneath:
1635
        if ($depth)    {
1636
            $tree->getTree($id, $depth, '');
1637
        }
1638
1639
            // Traverse page tree:
1640
        $code = '';
1641
1642
        foreach ($tree->tree as $data) {
1643
1644
            $this->MP = false;
1645
1646
                // recognize mount points
1647
            if($data['row']['doktype'] == 7){
1648
                $mountpage = $this->db->exec_SELECTgetRows('*', 'pages', 'uid = '.$data['row']['uid']);
1649
1650
                    // fetch mounted pages
1651
                $this->MP = $mountpage[0]['mount_pid'].'-'.$data['row']['uid'];
0 ignored issues
show
Documentation Bug introduced by
The property $MP was declared of type boolean, but $mountpage[0]['mount_pid...' . $data['row']['uid'] is of type string. Maybe add a type cast?

This check looks for assignments to scalar types that may be of the wrong type.

To ensure the code behaves as expected, it may be a good idea to add an explicit type cast.

$answer = 42;

$correct = false;

$correct = (bool) $answer;
Loading history...
1652
1653
                $mountTree = \TYPO3\CMS\Core\Utility\GeneralUtility::makeInstance('TYPO3\CMS\Backend\Tree\View\PageTreeView');
1654
                $mountTree->init('AND '.$perms_clause);
1655
                $mountTree->getTree($mountpage[0]['mount_pid'], $depth, '');
1656
1657
                foreach($mountTree->tree as $mountData)    {
1658
                    $code .= $this->drawURLs_addRowsForPage(
1659
                        $mountData['row'],
1660
                        $mountData['HTML'].\TYPO3\CMS\Backend\Utility\BackendUtility::getRecordTitle('pages',$mountData['row'],TRUE)
1661
                    );
1662
                }
1663
1664
                    // replace page when mount_pid_ol is enabled
1665
                if($mountpage[0]['mount_pid_ol']){
1666
                    $data['row']['uid'] = $mountpage[0]['mount_pid'];
1667
                } else {
1668
                        // if the mount_pid_ol is not set the MP must not be used for the mountpoint page
1669
                    $this->MP = false;
1670
                }
1671
            }
1672
1673
            $code .= $this->drawURLs_addRowsForPage(
1674
                $data['row'],
0 ignored issues
show
Security Bug introduced by
It seems like $data['row'] can also be of type false; however, tx_crawler_lib::drawURLs_addRowsForPage() does only seem to accept array, did you maybe forget to handle an error condition?
Loading history...
1675
                $data['HTML'] . \TYPO3\CMS\Backend\Utility\BackendUtility::getRecordTitle('pages', $data['row'], TRUE)
0 ignored issues
show
Security Bug introduced by
It seems like $data['row'] can also be of type false; however, TYPO3\CMS\Backend\Utilit...ility::getRecordTitle() does only seem to accept array, did you maybe forget to handle an error condition?
Loading history...
1676
            );
1677
        }
1678
1679
        return $code;
1680
    }
1681
1682
    /**
     * Expands an exclude string into the list of page ids it covers.
     *
     * The string is a comma-separated list of parts. Each part is either "pid"
     * (that page only), "pid+" (the page plus its subtree down to depth 99) or
     * "pid+depth" (the page plus its subtree down to the given depth).
     * Results are memoized per exclude string, and page trees per pid/depth,
     * in static caches shared by all calls.
     *
     * @param  string $excludeString    Exclude string
     * @return array                    Array of page ids.
     */
    public function expandExcludeString($excludeString) {
        // Internal static caches
        static $expandedExcludeStringCache = array();
        static $treeCache = array();

        $cacheKey = (string)$excludeString;

        // isset() rather than empty(): an empty result is a valid cached value too.
        if (!isset($expandedExcludeStringCache[$cacheKey])) {
            $pidList = array();

            if (!empty($excludeString)) {
                /* @var $tree \TYPO3\CMS\Backend\Tree\View\PageTreeView */
                $tree = \TYPO3\CMS\Core\Utility\GeneralUtility::makeInstance('TYPO3\CMS\Backend\Tree\View\PageTreeView');
                $tree->init('AND ' . $this->backendUser->getPagePermsClause(1));

                $excludeParts = \TYPO3\CMS\Core\Utility\GeneralUtility::trimExplode(',', $excludeString);

                foreach ($excludeParts as $excludePart) {
                    // A part without "+" has no second segment; read it defensively
                    // instead of destructuring a one-element array.
                    $segments = \TYPO3\CMS\Core\Utility\GeneralUtility::trimExplode('+', $excludePart);
                    $pid = $segments[0];
                    $depth = isset($segments[1]) ? $segments[1] : '';

                    // Default is "page only" (depth 0); a bare trailing "+" means depth 99.
                    if (empty($depth)) {
                        $depth = (strpos($excludePart, '+') !== false) ? 99 : 0;
                    }
                    $depth = (int)$depth;

                    $pidList[] = $pid;

                    if ($depth > 0) {
                        if (empty($treeCache[$pid][$depth])) {
                            $tree->reset();
                            $tree->getTree($pid, $depth);
                            $treeCache[$pid][$depth] = $tree->tree;
                        }

                        foreach ($treeCache[$pid][$depth] as $data) {
                            $pidList[] = $data['row']['uid'];
                        }
                    }
                }
            }

            $expandedExcludeStringCache[$cacheKey] = array_unique($pidList);
        }

        return $expandedExcludeStringCache[$cacheKey];
    }
1732
1733
    /**
     * Create the rows for display of the page tree
     * For each page a number of rows are shown displaying GET variable configuration
     *
     * Configurations that are not part of $this->incomingConfigurationSelection are
     * dropped when a selection was given. For every remaining configuration one
     * <tr> is produced; pages listed in the configuration's "exclude" setting get a
     * short "excluded" row instead of the URL details.
     *
     * @param    array        Page row
     * @param    string        Page icon and title for row
     * @return    string        HTML <tr> content (one or more)
     */
    public function drawURLs_addRowsForPage(array $pageRow, $pageTitleAndIcon)    {

        $skipMessage = '';

        // Get list of configurations; $skipMessage is passed along so that
        // getUrlsForPageRow() can report why a page yields no configurations.
        $configurations = $this->getUrlsForPageRow($pageRow, $skipMessage);

        if (count($this->incomingConfigurationSelection) > 0) {
            // Remove configurations that do not match the current selection.
            // The comparison is intentionally loose, as in the original code.
            foreach ($configurations as $confKey => $confArray) {
                if (!in_array($confKey, $this->incomingConfigurationSelection)) {
                    unset($configurations[$confKey]);
                }
            }
        }

        $content = '';

        if (!count($configurations)) {
            $message = !empty($skipMessage) ? ' ('.$skipMessage.')' : '';

            // Compile row:
            $content.= '
                <tr class="bgColor-20" style="border-bottom: 1px solid black;">
                    <td>'.$pageTitleAndIcon.'</td>
                    <td colspan="6"><em>No entries</em>'.$message.'</td>
                </tr>';

            return $content;
        }

        // Traverse parameter combinations; $c counts the rows emitted so far.
        $c = 0;
        foreach ($configurations as $confKey => $confArray) {

            // Title column: only the first row carries it, spanning all rows of the page.
            if (!$c) {
                $titleClm = '<td rowspan="'.count($configurations).'">'.$pageTitleAndIcon.'</td>';
            } else {
                $titleClm = '';
            }

            $excludeString = isset($confArray['subCfg']['exclude']) ? $confArray['subCfg']['exclude'] : '';

            if (!in_array($pageRow['uid'], $this->expandExcludeString($excludeString))) {

                // URL list:
                $urlList = $this->urlListFromUrlArray(
                    $confArray,
                    $pageRow,
                    $this->scheduledTime,
                    $this->reqMinute,
                    $this->submitCrawlUrls,
                    $this->downloadCrawlUrls,
                    $this->duplicateTrack,
                    $this->downloadUrls,
                    $this->incomingProcInstructions // if empty the urls won't be filtered by processing instructions
                );

                // Expanded parameters: one table row per GET variable, plus the
                // number of combinations (product of the value counts).
                $paramExpanded = '';
                $calcAccu = array();
                $calcRes = 1;
                foreach ($confArray['paramExpanded'] as $gVar => $gVal) {
                    $valueCount = count($gVal);
                    $paramExpanded.= '
                            <tr>
                                <td class="bgColor4-20">'.htmlspecialchars('&'.$gVar.'=').'<br/>'.
                                                '('.$valueCount.')'.
                                                '</td>
                                <td class="bgColor4" nowrap="nowrap">'.nl2br(htmlspecialchars(implode(chr(10),$gVal))).'</td>
                            </tr>
                        ';
                    $calcRes*= $valueCount;
                    $calcAccu[] = $valueCount;
                }
                $paramExpanded = '<table class="lrPadding c-list param-expanded">'.$paramExpanded.'</table>';
                $paramExpanded.= 'Comb: '.implode('*',$calcAccu).'='.$calcRes;

                // Options; the values come from configuration and are escaped for HTML output.
                $optionValues = '';
                if (!empty($confArray['subCfg']['userGroups'])) {
                    $optionValues.='User Groups: '.htmlspecialchars($confArray['subCfg']['userGroups']).'<br/>';
                }
                if (!empty($confArray['subCfg']['baseUrl'])) {
                    $optionValues.='Base Url: '.htmlspecialchars($confArray['subCfg']['baseUrl']).'<br/>';
                }
                if (!empty($confArray['subCfg']['procInstrFilter'])) {
                    $optionValues.='ProcInstr: '.htmlspecialchars($confArray['subCfg']['procInstrFilter']).'<br/>';
                }

                // Compile row:
                $content .= '
                        <tr class="bgColor' . ($c%2 ? '-20':'-10') . '">
                            ' . $titleClm . '
                            <td>' . htmlspecialchars($confKey) . '</td>
                            <td>' . nl2br(htmlspecialchars(rawurldecode(trim(str_replace('&', chr(10) . '&', \TYPO3\CMS\Core\Utility\GeneralUtility::implodeArrayForUrl('', $confArray['paramParsed'])))))) . '</td>
                            <td>'.$paramExpanded.'</td>
                            <td nowrap="nowrap">' . $urlList . '</td>
                            <td nowrap="nowrap">' . $optionValues . '</td>
                            <td nowrap="nowrap">' . \TYPO3\CMS\Core\Utility\DebugUtility::viewArray($confArray['subCfg']['procInstrParams.']) . '</td>
                        </tr>';
            } else {

                $content .= '<tr class="bgColor'.($c%2 ? '-20':'-10') . '">
                            '.$titleClm.'
                            <td>'.htmlspecialchars($confKey).'</td>
                            <td colspan="5"><em>No entries</em> (Page is excluded in this configuration)</td>
                        </tr>';

            }

            $c++;
        }

        return $content;
    }
1855
1856
    /**
     * Counts the queue entries that have not been executed, are not assigned to
     * a process and are scheduled at or before the current time.
     *
     * @return int Number of matching rows in tx_crawler_queue (0 if the query fails)
     */
    public function getUnprocessedItemsCount() {
        $res = $this->db->exec_SELECTquery(
            'count(*) as num',
            'tx_crawler_queue',
            'exec_time=0
                    AND process_scheduled= 0
                    AND scheduled<='.$this->getCurrentTime()
        );

        $count = $this->db->sql_fetch_assoc($res);

        // Release the result set once the single row has been read.
        if ($res) {
            $this->db->sql_free_result($res);
        }

        // The database returns the count as a string; callers are promised an int.
        return is_array($count) ? (int)$count['num'] : 0;
    }
1872
1873
1874
1875
1876
1877
1878
1879
1880
    /*****************************
1881
     *
1882
     * CLI functions
1883
     *
1884
     *****************************/
1885
1886
    /**
     * Main function for running from Command Line PHP script (cron job)
     * See ext/crawler/cli/crawler_cli.phpsh for details
     *
     * Prints the help text and exits when -h / --help is given. Otherwise it
     * runs one crawler process (unless the crawler is disabled or no new
     * process can be acquired), removes process records without assigned
     * items and releases the process again.
     *
     * @return    int Status value built from the self::CLI_STATUS_* constants (combined with bitwise OR)
     */
    public function CLI_main() {
        $this->setAccessMode('cli');
        $result = self::CLI_STATUS_NOTHING_PROCCESSED;
        $cliObj = \TYPO3\CMS\Core\Utility\GeneralUtility::makeInstance('tx_crawler_cli');

        if (isset($cliObj->cli_args['-h']) || isset($cliObj->cli_args['--help'])) {
            $cliObj->cli_validateArgs();
            $cliObj->cli_help();
            exit;
        }

        // Nothing to run when the crawler is disabled or no process slot could be acquired.
        if ($this->getDisabled() || !$this->CLI_checkAndAcquireNewProcess($this->CLI_buildProcessId())) {
            return $result | self::CLI_STATUS_ABORTED;
        }

        // Command line arguments take precedence over the extension settings.
        $countInARunArg = $cliObj->cli_argValue('--countInARun');
        $countInARun = $countInARunArg ? intval($countInARunArg) : $this->extensionSettings['countInARun'];

        // Seconds
        $sleepAfterFinishArg = $cliObj->cli_argValue('--sleepAfterFinish');
        $sleepAfterFinish = $sleepAfterFinishArg ? intval($sleepAfterFinishArg) : $this->extensionSettings['sleepAfterFinish'];

        // Milliseconds
        $sleepTimeArg = $cliObj->cli_argValue('--sleepTime');
        $sleepTime = $sleepTimeArg ? intval($sleepTimeArg) : $this->extensionSettings['sleepTime'];

        try {
            // Run process:
            $result = $this->CLI_run($countInARun, $sleepTime, $sleepAfterFinish);
        } catch (Exception $e) {
            // Any exception during the run is reported as an aborted run.
            $result = self::CLI_STATUS_ABORTED;
        }

        // Cleanup
        $this->db->exec_DELETEquery('tx_crawler_process', 'assigned_items_count = 0');

        //TODO can't we do that in a clean way?
        $this->CLI_releaseProcesses($this->CLI_buildProcessId());

        // Query the remaining items once so the log line and the status agree.
        $remainingItems = $this->getUnprocessedItemsCount();
        $this->CLI_debug("Unprocessed Items remaining:".$remainingItems." (".$this->CLI_buildProcessId().")");
        $result |= ( $remainingItems > 0 ? self::CLI_STATUS_REMAIN : self::CLI_STATUS_NOTHING_PROCCESSED );

        return $result;
    }
1931
1932
    /**
     * Function executed by crawler_im.php cli script.
     *
     * Collects the URLs for a page (and optionally its subtree) and, depending
     * on the -o argument, prints them ("url"), puts them into the queue
     * ("queue"), executes them right away ("exec") or just lists what was found
     * (any other value).
     *
     * @return    void
     */
    public function CLI_main_im()    {
        $this->setAccessMode('cli_im');

        $cliObj = \TYPO3\CMS\Core\Utility\GeneralUtility::makeInstance('tx_crawler_cli_im');

        // Force user to admin state and set workspace to "Live":
        $this->backendUser->user['admin'] = 1;
        $this->backendUser->setWorkspace(0);

        // Print help
        if (!isset($cliObj->cli_args['_DEFAULT'][1]))    {
            $cliObj->cli_validateArgs();
            $cliObj->cli_help();
            exit;
        }

        $cliObj->cli_validateArgs();

        // Output mode; read once after validation and reused below.
        $mode = $cliObj->cli_argValue('-o');
        $submitToQueue = ($mode === 'queue' || $mode === 'exec');

        if ($mode === 'exec')    {
            // Queue entries are only registered internally and processed further down.
            $this->registerQueueEntriesInternallyOnly = true;
        }

        if (isset($cliObj->cli_args['_DEFAULT'][2])) {
            // Crawler is called over TYPO3 BE
            $pageId = \TYPO3\CMS\Core\Utility\MathUtility::forceIntegerInRange($cliObj->cli_args['_DEFAULT'][2], 0);
        } else {
            // Crawler is called over cli
            $pageId = \TYPO3\CMS\Core\Utility\MathUtility::forceIntegerInRange($cliObj->cli_args['_DEFAULT'][1], 0);
        }

        $configurationKeys  = $this->getConfigurationKeys($cliObj);

        // Without explicitly given keys, fall back to all configurations of the page.
        if (!is_array($configurationKeys)) {
            $configurations = $this->getUrlsForPageId($pageId);
            $configurationKeys = is_array($configurations) ? array_keys($configurations) : array();
        }

        if ($submitToQueue) {
            $reason = new tx_crawler_domain_reason();
            $reason->setReason(tx_crawler_domain_reason::REASON_GUI_SUBMIT);
            $reason->setDetailText('The cli script of the crawler added to the queue');
            // NOTE(review): the event is posted with the setID that was current
            // before the new one is generated below — confirm this is intended.
            tx_crawler_domain_events_dispatcher::getInstance()->post(
                'invokeQueueChange',
                $this->setID,
                array(    'reason' => $reason )
            );
        }

        if ($this->extensionSettings['cleanUpOldQueueEntries']) {
            $this->cleanUpOldQueueEntries();
        }

        $this->setID = \TYPO3\CMS\Core\Utility\GeneralUtility::md5int(microtime());
        $this->getPageTreeAndUrls(
            $pageId,
            \TYPO3\CMS\Core\Utility\MathUtility::forceIntegerInRange($cliObj->cli_argValue('-d'),0,99),
            $this->getCurrentTime(),
            \TYPO3\CMS\Core\Utility\MathUtility::forceIntegerInRange($cliObj->cli_isArg('-n') ? $cliObj->cli_argValue('-n') : 30,1,1000),
            $submitToQueue,
            $mode === 'url',
            \TYPO3\CMS\Core\Utility\GeneralUtility::trimExplode(',',$cliObj->cli_argValue('-proc'),true),
            $configurationKeys
        );

        if ($mode === 'url') {
            $cliObj->cli_echo(implode(chr(10),$this->downloadUrls).chr(10),true);
        } elseif ($mode === 'exec')    {
            $cliObj->cli_echo("Executing ".count($this->urlList)." requests right away:\n\n");
            $cliObj->cli_echo(implode(chr(10),$this->urlList).chr(10));
            $cliObj->cli_echo("\nProcessing:\n");

            foreach ($this->queueEntries as $queueRec)    {
                // NOTE(review): unserialize() is applied to stored queue parameters and to
                // the fetched result content; make sure neither can carry untrusted data.
                $p = unserialize($queueRec['parameters']);
                $cliObj->cli_echo($p['url'].' ('.implode(',',$p['procInstructions']).') => ');

                $result = $this->readUrlFromArray($queueRec);

                $requestResult = unserialize($result['content']);
                if (is_array($requestResult))    {
                    $resLog = is_array($requestResult['log']) ?  chr(10).chr(9).chr(9).implode(chr(10).chr(9).chr(9),$requestResult['log']) : '';
                    $cliObj->cli_echo('OK: '.$resLog.chr(10));
                } else {
                    $cliObj->cli_echo('Error checking Crawler Result: '.substr(preg_replace('/\s+/',' ',strip_tags($result['content'])),0,30000).'...'.chr(10));
                }
            }
        } elseif ($mode === 'queue')    {
            $cliObj->cli_echo("Putting ".count($this->urlList)." entries in queue:\n\n");
            $cliObj->cli_echo(implode(chr(10),$this->urlList).chr(10));
        } else {
            $cliObj->cli_echo(count($this->urlList)." entries found for processing. (Use -o to decide action):\n\n",true);
            $cliObj->cli_echo(implode(chr(10),$this->urlList).chr(10),true);
        }
    }
2035
2036
    /**
     * Entry point of the queue flush CLI command.
     *
     * Reads the page id from the first positional CLI argument and the entry
     * filter from "-o" ("all", "finished" or "pending"), then delegates to
     * getLogEntriesForPageId(). A page id of 0 sets the full-flush flag handed
     * to that method. Without a page id the help text is printed and the
     * script terminates.
     *
     * @return bool FALSE if "-o" holds no supported mode or getLogEntriesForPageId() returned FALSE, TRUE otherwise
     */
    function CLI_main_flush() {
        $this->setAccessMode('cli_flush');
        $cli = \TYPO3\CMS\Core\Utility\GeneralUtility::makeInstance('tx_crawler_cli_flush');

            // The command runs with admin rights inside the "Live" workspace (id 0)
        $this->backendUser->user['admin'] = 1;
        $this->backendUser->setWorkspace(0);

            // No page id given: show the help and stop.
            // NOTE(review): the exit makes this method hard to test; kept because callers may rely on it.
        if (!isset($cli->cli_args['_DEFAULT'][1])) {
            $cli->cli_validateArgs();
            $cli->cli_help();
            exit;
        }

        $cli->cli_validateArgs();
        $pageId = \TYPO3\CMS\Core\Utility\MathUtility::forceIntegerInRange($cli->cli_args['_DEFAULT'][1],0);
        $flushEverything = ($pageId == 0);

            // Translate the requested mode into the filter expected by getLogEntriesForPageId()
        $mode = $cli->cli_argValue('-o');
        if ($mode == 'all') {
            $filter = '';
        } elseif ($mode == 'finished' || $mode == 'pending') {
            $filter = $mode;
        } else {
                // Unsupported mode: print usage information and report failure
            $cli->cli_validateArgs();
            $cli->cli_help();
            return false;
        }

            // NOTE(review): the third argument (true) presumably requests the actual flush - confirm against getLogEntriesForPageId()
        $flushResult = $this->getLogEntriesForPageId($pageId, $filter, true, $flushEverything);

        return $flushResult !== false;
    }
2078
2079
    /**
     * Reads the configuration keys passed through the "-conf" CLI argument.
     *
     * @param  tx_crawler_cli_im $cliObj    Command line object
     * @return array                        Trimmed parts of the comma separated "-conf" value; empty array if the value is empty
     */
    protected function getConfigurationKeys(tx_crawler_cli_im &$cliObj) {
        $rawKeys = trim($cliObj->cli_argValue('-conf'));

            // Nothing passed: there are no keys to split
        if ($rawKeys == '') {
            return array();
        }

        return \TYPO3\CMS\Core\Utility\GeneralUtility::trimExplode(',', $rawKeys);
    }
2089
2090
    /**
     * Running the functionality of the CLI (crawling URLs from queue)
     *
     * Steps: run the CLI hooks, purge old executed queue entries (only if the
     * "purgeQueueDays" setting is greater than 0), select the due entries,
     * reserve them for this process inside a transaction and finally hand
     * them one by one to readUrl().
     *
     * @param  int $countInARun       Maximum number of queue entries selected for this run
     * @param  int $sleepTime         Pause after each processed entry, passed to usleep() (microseconds)
     * @param  int $sleepAfterFinish  Pause after the run, passed to sleep() (seconds)
     * @return int                    Bitmask: the OR-ed readUrl() results plus self::CLI_STATUS_* flags
     */
    public function CLI_run($countInARun, $sleepTime, $sleepAfterFinish) {
        $result = 0;
        $counter = 0;

            // First, run hooks:
        $this->CLI_runHooks();

            // Clean up the queue: drop executed entries older than the configured number of days
        if (intval($this->extensionSettings['purgeQueueDays']) > 0) {
            $purgeDate = $this->getCurrentTime() - 24 * 60 * 60 * intval($this->extensionSettings['purgeQueueDays']);
            $this->db->exec_DELETEquery(
                'tx_crawler_queue',
                'exec_time!=0 AND exec_time<' . $purgeDate
            );
        }

            // Select entries:
            //TODO Shouldn't this reside within the transaction?
        $rows = $this->db->exec_SELECTgetRows(
            'qid,scheduled',
            'tx_crawler_queue',
            'exec_time=0
                AND process_scheduled= 0
                AND scheduled<='.$this->getCurrentTime(),
            '',
            'scheduled, qid',
            intval($countInARun)
        );

            // The query result may be NULL instead of an array; treat that like an empty queue
        if (!is_array($rows) || count($rows) === 0) {
            $this->CLI_debug("Nothing within queue which needs to be processed (".$this->CLI_buildProcessId().")");
            return $result;
        }

        $quidList = array();
        foreach ($rows as $r) {
            $quidList[] = $r['qid'];
        }

        $processId = $this->CLI_buildProcessId();

            // Reserve the queue entries for this process
        $this->db->sql_query('BEGIN');
            //TODO make sure we're not taking assigned queue-entries
        $this->db->exec_UPDATEquery(
            'tx_crawler_queue',
            'qid IN ('.implode(',',$quidList).')',
            array(
                'process_scheduled' => intval($this->getCurrentTime()),
                'process_id' => $processId
            )
        );

            // Save the number of assigned queue entries to determine how many have been processed later
        $numberOfAffectedRows = $this->db->sql_affected_rows();
        $this->db->exec_UPDATEquery(
            'tx_crawler_process',
            "process_id = '".$processId."'" ,
            array(
                'assigned_items_count' => intval($numberOfAffectedRows)
            )
        );

            // Fewer rows updated than selected: undo the reservation and abort this run
        if ($numberOfAffectedRows != count($quidList)) {
            $this->db->sql_query('ROLLBACK');
            $this->CLI_debug("Nothing processed due to multi-process collision (".$this->CLI_buildProcessId().")");
            return ( $result | self::CLI_STATUS_ABORTED );
        }
        $this->db->sql_query('COMMIT');

        foreach ($rows as $r) {
            $result |= $this->readUrl($r['qid']);

            $counter++;
            usleep(intval($sleepTime));    // Just to relax the system

                // If the crawler has been disabled since the run started, return right away
                // without marking the process as ended.
            if ($this->getDisabled()) {
                return ( $result | self::CLI_STATUS_ABORTED );
            }

            if (!$this->CLI_checkIfProcessIsActive($this->CLI_buildProcessId())) {
                $this->CLI_debug("conflict / timeout (".$this->CLI_buildProcessId().")");

                    //TODO might need an additional returncode
                $result |= self::CLI_STATUS_ABORTED;
                break;        //possible timeout
            }
        }

        sleep(intval($sleepAfterFinish));

        $msg = 'Rows: '.$counter;
        $this->CLI_debug($msg." (".$this->CLI_buildProcessId().")");

        if ($counter > 0) {
            $result |= self::CLI_STATUS_PROCESSED;
        }

        return $result;
    }
2204
2205
    /**
     * Activate hooks
     *
     * Instantiates every object reference registered in
     * $TYPO3_CONF_VARS['EXTCONF']['crawler']['cli_hooks'] and calls its
     * crawler_init() method with this crawler instance.
     *
     * @return    void
     */
    function CLI_runHooks()    {
            // No hooks registered (or the registry is not an array): nothing to do
        if (!isset($GLOBALS['TYPO3_CONF_VARS']['EXTCONF']['crawler']['cli_hooks'])
            || !is_array($GLOBALS['TYPO3_CONF_VARS']['EXTCONF']['crawler']['cli_hooks'])
        ) {
            return;
        }

        foreach ($GLOBALS['TYPO3_CONF_VARS']['EXTCONF']['crawler']['cli_hooks'] as $objRef) {
                // Plain assignment: objects are handles, and assigning a function result by reference raises a notice
            $hookObj = \TYPO3\CMS\Core\Utility\GeneralUtility::getUserObj($objRef);
            if (is_object($hookObj)) {
                $hookObj->crawler_init($this);
            }
        }
    }
2221
2222
    /**
     * Try to acquire a new process with the given id
     * also performs some auto-cleanup for orphan processes
     * @todo preemption might not be the most elegant way to clean up
     *
     * Active process records whose ttl lies before the current time count as
     * orphans and are released; all others count against the "processLimit"
     * setting. Everything happens between one BEGIN and one COMMIT.
     *
     * @param  string    $id  identification string for the process
     * @return boolean        TRUE if a process record was created, FALSE if the process limit is reached or no system pid is available
     */
    function CLI_checkAndAcquireNewProcess($id) {
        $systemProcessId = getmypid();
        if ($systemProcessId < 1) {
            return FALSE;
        }

        $activeCount = 0;
        $expiredIds = array();

        $this->db->sql_query('BEGIN');

        $res = $this->db->exec_SELECTquery(
            'process_id,ttl',
            'tx_crawler_process',
            'active=1 AND deleted=0'
        );

        $now = $this->getCurrentTime();

            // Sort the active processes into expired ones (orphans) and still running ones
        while ($process = $this->db->sql_fetch_assoc($res)) {
            if ($process['ttl'] < $now) {
                $expiredIds[] = $process['process_id'];
            } else {
                $activeCount++;
            }
        }

        $limit = intval($this->extensionSettings['processLimit']);
        $acquired = $activeCount < $limit;

        if ($acquired) {
                // Below the limit: register the new process record
            $this->CLI_debug("add ".$this->CLI_buildProcessId()." (".($activeCount+1)."/".$limit.")");

            $this->db->exec_INSERTquery(
                'tx_crawler_process',
                array(
                    'process_id' => $id,
                    'active'=>'1',
                    'ttl' => ($now + intval($this->extensionSettings['processMaxRunTime'])),
                    'system_process_id' => $systemProcessId
                )
            );
        } else {
            $this->CLI_debug("Processlimit reached (".($activeCount)."/".$limit.")");
        }

            // Clean up inside the transaction that is already open (second argument: within lock)
        $this->CLI_releaseProcesses($expiredIds, true); // maybe this should be somehow included into the current lock
        $this->CLI_deleteProcessesMarkedDeleted();

        $this->db->sql_query('COMMIT');

        return $acquired;
    }
2287
2288
    /**
     * Release a process and the required resources
     *
     * Works as a kind of "second chance" cleanup: the requested processes are
     * only marked inactive here. Inactive processes are flagged as deleted by
     * the statements that run before that, so a process released in this call
     * is flagged by a later call at the earliest.
     *
     * @param  mixed    $releaseIds   string with a single process-id or array with multiple process-ids
     * @param  boolean  $withinLock   TRUE if the caller already opened a transaction; no BEGIN/COMMIT is sent then
     * @return boolean                FALSE if there was nothing to release, TRUE otherwise
     */
    function CLI_releaseProcesses($releaseIds, $withinLock=false) {

        if (!is_array($releaseIds)) {
            $releaseIds = array($releaseIds);
        }

            // Explicit emptiness check; the former "!count(...) > 0" only worked through operator precedence
        if (count($releaseIds) === 0) {
            return false;   //nothing to release
        }

        if (!$withinLock) {
            $this->db->sql_query('BEGIN');
        }

            // 1. Detach all queue entries that are still assigned to inactive, not yet deleted processes
        $this->db->exec_UPDATEquery(
            'tx_crawler_queue',
            'process_id IN (SELECT process_id FROM tx_crawler_process WHERE active=0 AND deleted=0)',
            array(
                'process_scheduled' => 0,
                'process_id' => ''
            )
        );

            // 2. Flag inactive processes as deleted if no unexecuted queue entry (exec_time = 0) refers to them
        $this->db->exec_UPDATEquery(
            'tx_crawler_process',
            'active=0 AND deleted=0
            AND NOT EXISTS (
                SELECT * FROM tx_crawler_queue
                WHERE tx_crawler_queue.process_id = tx_crawler_process.process_id
                AND tx_crawler_queue.exec_time = 0
            )',
            array(
                'deleted'=>'1',
                'system_process_id' => 0
            )
        );

            // 3. Mark all requested processes as non-active
            // NOTE(review): the ids are put into the SQL unescaped - this is only safe for internally generated ids
        $this->db->exec_UPDATEquery(
            'tx_crawler_process',
            'process_id IN (\''.implode('\',\'',$releaseIds).'\') AND deleted=0',
            array(
                'active'=>'0'
            )
        );

            // 4. Detach the unexecuted queue entries of the requested processes
        $this->db->exec_UPDATEquery(
            'tx_crawler_queue',
            'exec_time=0 AND process_id IN ("'.implode('","',$releaseIds).'")',
            array(
                'process_scheduled'=>0,
                'process_id'=>''
            )
        );

        if (!$withinLock) {
            $this->db->sql_query('COMMIT');
        }

        return true;
    }
2353
2354
    /**
     * Removes every process record flagged as deleted (deleted = 1)
     * from the tx_crawler_process table.
     *
     * @return void
     */
    public function CLI_deleteProcessesMarkedDeleted() {
        $table = 'tx_crawler_process';
        $where = 'deleted = 1';

        $this->db->exec_DELETEquery($table, $where);
    }
2362
2363
    /**
     * Check if there are still resources left for the process with the given id
     * Used to determine timeouts and to ensure a proper cleanup if there's a timeout
     *
     * Looks up the first not-deleted process record with the given id
     * (ordered by ttl) and reports whether its "active" field equals 1.
     *
     * @param  string  $pid  identification string for the process
     * @return boolean       TRUE if such a record exists and is active, FALSE otherwise
     *
     * FIXME: Please remove Transaction, not needed as only a select query.
     */
    function CLI_checkIfProcessIsActive($pid) {
        $this->db->sql_query('BEGIN');

            // NOTE(review): $pid is put into the SQL unescaped - only safe for internally generated ids
        $res = $this->db->exec_SELECTquery(
            'process_id,active,ttl',
            'tx_crawler_process',
            'process_id = \''.$pid.'\'  AND deleted=0',
            '',
            'ttl',
            '0,1'
        );
        $process = $this->db->sql_fetch_assoc($res);

        $this->db->sql_query('COMMIT');

            // No matching record means the process is not active
        if (!$process) {
            return false;
        }

        return intval($process['active']) == 1;
    }
2389
2390
    /**
     * Returns the id of the current process.
     *
     * The id is generated on first use as the short MD5 hash of the current
     * microtime and stored in $this->processID; later calls return the stored
     * value.
     *
     * @return string  the ID
     */
    function CLI_buildProcessId() {
            // Already generated: reuse it
        if ($this->processID) {
            return $this->processID;
        }

        $this->processID = \TYPO3\CMS\Core\Utility\GeneralUtility::shortMD5($this->microtime(true));

        return $this->processID;
    }
2401
2402
    /**
     * Thin wrapper around PHP's microtime().
     *
     * @param bool $get_as_float  Passed through unchanged to microtime()
     *
     * @return mixed  Whatever microtime() returns for the given flag
     */
    protected function microtime($get_as_float = false )
    {
        $currentTime = \microtime($get_as_float);

        return $currentTime;
    }
2411
2412
    /**
     * Echoes a message followed by a line break and flushes the output.
     * Does nothing unless the "processDebug" extension setting is a non-zero integer.
     *
     * @param  string $msg  the message
     */
    function CLI_debug($msg) {
            // Debug output is switched off
        if (!intval($this->extensionSettings['processDebug'])) {
            return;
        }

        echo $msg."\n";
        flush();
    }
2422
2423
2424
2425
    /**
     * Get URL content by making direct request to TYPO3.
     *
     * Builds a shell command that runs the crawler's cli/bootstrap.php with
     * the configured PHP binary, passing the frontend base path, the URL and
     * the serialized, base64 encoded request headers as arguments. The output
     * of that command becomes the "content" of the result.
     *
     * @param  string $url          Page URL
     * @param  int    $crawlerId    Crawler-ID
     * @return array                Keys "request" (header lines joined by CRLF), "headers" (always empty) and "content"
     */
    protected function sendDirectRequest($url, $crawlerId) {
            // NOTE(review): parse_url() can return FALSE for malformed URLs; that case is passed on unhandled
        $urlParts = parse_url($url);
        $requestHeaders = $this->buildRequestHeaderArray($urlParts, $crawlerId);

            // Every argument is escaped individually before the command line is assembled
        $commandParts = array(
            escapeshellcmd($this->extensionSettings['phpPath']),
            escapeshellarg(\TYPO3\CMS\Core\Utility\ExtensionManagementUtility::extPath('crawler') . 'cli/bootstrap.php'),
            escapeshellarg($this->getFrontendBasePath()),
            escapeshellarg($url),
            escapeshellarg(base64_encode(serialize($requestHeaders))),
        );
        $command = implode(' ', $commandParts);

            // Run the request and log the URL together with the elapsed time
        $startTime = microtime(true);
        $content = $this->executeShellCommand($command);
        $this->log($url . (microtime(true) - $startTime));

        return array(
            'request' => implode("\r\n", $requestHeaders) . "\r\n\r\n",
            'headers' => '',
            'content' => $content
        );
    }
2457
2458
    /**
     * Cleans up entries that stayed for too long in the queue. These are:
     * - processed entries (exec_time set) older than "cleanUpProcessedAge" days
     * - entries scheduled at least "cleanUpScheduledAge" days ago
     *
     * Both ages are read from the extension settings. Earlier documentation
     * named 1.5 and 7 days; presumably those are the defaults - verify
     * against the extension configuration.
     *
     * @return void
     */
    protected function cleanUpOldQueueEntries() {
        $secondsPerDay = 86400; // 24*60*60 Seconds in 24 hours
        $processedAge = $this->extensionSettings['cleanUpProcessedAge'] * $secondsPerDay;
        $scheduledAge = $this->extensionSettings['cleanUpScheduledAge'] * $secondsPerDay;

        $now = time();
        $processedBefore = $now - $processedAge;
        $scheduledBefore = $now - $scheduledAge;

        $this->flushQueue(
            '(exec_time<>0 AND exec_time<' . $processedBefore . ') OR scheduled<=' . $scheduledBefore
        );
    }
2473
2474
    /**
     * Initializes a TypoScript Frontend necessary for using TypoScript and TypoLink functions
     *
     * Creates a TypoScriptFrontendController for the given page, stores it in
     * $GLOBALS['TSFE'] and runs its initialization steps in a fixed order.
     * A time tracker is placed in $GLOBALS['TT'] if none exists yet.
     *
     * @param int $id       Page id handed to the frontend controller and used for the rootline
     * @param int $typeNum  Type number handed to the frontend controller
     *
     * @return void
     */
    protected function initTSFE($id = 1, $typeNum = 0) {
        \TYPO3\CMS\Frontend\Utility\EidUtility::initTCA();

            // NOTE(review): NullTimeTracker and its start() are deprecated since TYPO3 v8 (removal announced for v9)
        if (!is_object($GLOBALS['TT'])) {
            $timeTracker = new \TYPO3\CMS\Core\TimeTracker\NullTimeTracker();
            $GLOBALS['TT'] = $timeTracker;
            $timeTracker->start();
        }

            // Register the controller globally before initializing it; the calls below must keep this order
        $tsfe = \TYPO3\CMS\Core\Utility\GeneralUtility::makeInstance('TYPO3\\CMS\\Frontend\\Controller\\TypoScriptFrontendController',  $GLOBALS['TYPO3_CONF_VARS'], $id, $typeNum);
        $GLOBALS['TSFE'] = $tsfe;

        $tsfe->sys_page = \TYPO3\CMS\Core\Utility\GeneralUtility::makeInstance('TYPO3\\CMS\\Frontend\\Page\\PageRepository');
        $tsfe->sys_page->init(TRUE);

        $tsfe->connectToDB();
        $tsfe->getPageAndRootline();
        $tsfe->initFEuser();
        $tsfe->initTemplate();
        $tsfe->rootLine = $tsfe->sys_page->getRootLine($id, '');
        $tsfe->getConfigArray();

        \TYPO3\CMS\Frontend\Page\PageGenerator::pagegenInit();
    }
2500
}
2501
2502 1
// Load a registered XCLASS extension of this file, if TYPO3 is running and one is configured
if (defined('TYPO3_MODE')) {
    if ($TYPO3_CONF_VARS[TYPO3_MODE]['XCLASS']['ext/crawler/class.tx_crawler_lib.php']) {
        include_once($TYPO3_CONF_VARS[TYPO3_MODE]['XCLASS']['ext/crawler/class.tx_crawler_lib.php']);
    }
}
2505