Completed
Push — issue/182 ( 3b84dc...2c0a23 )
by Tomas Norre
09:54
created

tx_crawler_lib::compileUrls()   C

Complexity

Conditions 7
Paths 2

Size

Total Lines 25
Code Lines 14

Duplication

Lines 0
Ratio 0 %

Code Coverage

Tests 14
CRAP Score 7

Importance

Changes 0
Metric Value
cc 7
eloc 14
nc 2
nop 2
dl 0
loc 25
ccs 14
cts 14
cp 1
crap 7
rs 6.7272
c 0
b 0
f 0
1
<?php
2
/***************************************************************
3
 *  Copyright notice
4
 *
5
 *  (c) 2016 AOE GmbH <[email protected]>
6
 *
7
 *  All rights reserved
8
 *
9
 *  This script is part of the TYPO3 project. The TYPO3 project is
10
 *  free software; you can redistribute it and/or modify
11
 *  it under the terms of the GNU General Public License as published by
12
 *  the Free Software Foundation; either version 3 of the License, or
13
 *  (at your option) any later version.
14
 *
15
 *  The GNU General Public License can be found at
16
 *  http://www.gnu.org/copyleft/gpl.html.
17
 *
18
 *  This script is distributed in the hope that it will be useful,
19
 *  but WITHOUT ANY WARRANTY; without even the implied warranty of
20
 *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
21
 *  GNU General Public License for more details.
22
 *
23
 *  This copyright notice MUST APPEAR in all copies of the script!
24
 ***************************************************************/
25
26
/**
27
 * Class tx_crawler_lib
28
 */
29
class tx_crawler_lib {
30
31
    var $setID = 0;
32
    var $processID ='';
33
    var $max_CLI_exec_time = 3600;    // One hour is max stalled time for the CLI (If the process has had the status "start" for 3600 seconds it will be regarded stalled and a new process is started.
34
35
    var $duplicateTrack = array();
36
    var $downloadUrls = array();
37
38
    var $incomingProcInstructions = array();
39
    var $incomingConfigurationSelection = array();
40
41
42
    var $registerQueueEntriesInternallyOnly = array();
43
    var $queueEntries = array();
44
    var $urlList = array();
45
46
    var $debugMode=FALSE;
47
48
    var $extensionSettings=array();
49
50
    var $MP = false; // mount point
51
52
    protected $processFilename;
53
54
    /**
55
     * Holds the internal access mode can be 'gui','cli' or 'cli_im'
56
     *
57
     * @var string
58
     */
59
    protected $accessMode;
60
61
    /**
62
     * @var \TYPO3\CMS\Core\Database\DatabaseConnection
63
     */
64
    private $db;
65
66
    /**
67
     * @var TYPO3\CMS\Core\Authentication\BackendUserAuthentication
68
     */
69
    private $backendUser;
70
71
    const CLI_STATUS_NOTHING_PROCCESSED = 0;
72
    const CLI_STATUS_REMAIN = 1;    //queue not empty
73
    const CLI_STATUS_PROCESSED = 2;    //(some) queue items where processed
74
    const CLI_STATUS_ABORTED = 4;    //instance didn't finish
75
    const CLI_STATUS_POLLABLE_PROCESSED = 8;
76
77
    /**
78
     * Returns the current accessMode, which can be 'gui', 'cli' or 'cli_im'
79
     *
80
     * @return string
81
     */
82 1
    public function getAccessMode()
    {
        // Plain accessor for the value stored through setAccessMode().
        $currentMode = $this->accessMode;

        return $currentMode;
    }
85
86
    /**
87
     * @param string $accessMode
88
     */
89 1
    public function setAccessMode($accessMode)
    {
        // Stored as given; no validation against 'gui', 'cli' or 'cli_im' happens here.
        $this->accessMode = $accessMode;
    }
92
93
    /**
94
     * Set disabled status to prevent processes from being processed
95
     *
96
     * @param  bool $disabled (optional, defaults to true)
97
     * @return void
98
     */
99 3
    public function setDisabled($disabled = true)
    {
        $markerFile = $this->processFilename;

        if (!$disabled) {
            // Re-enabling: remove the marker file, but only when it actually exists.
            if (is_file($markerFile)) {
                unlink($markerFile);
            }

            return;
        }

        // Disabling: an empty marker file is written; getDisabled() tests for its existence.
        \TYPO3\CMS\Core\Utility\GeneralUtility::writeFile($markerFile, '');
    }
108
109
    /**
110
     * Get disable status
111
     *
112
     * @return bool true if disabled
113
     */
114 3
    public function getDisabled()
    {
        // The existence of the marker file written by setDisabled() is the flag itself.
        return is_file($this->processFilename);
    }
121
122
    /**
123
     * @param string $filenameWithPath
124
     *
125
     * @return void
126
     */
127 4
    public function setProcessFilename($filenameWithPath) {
        // Overrides the marker-file location that the constructor initialised.
        $this->processFilename = $filenameWithPath;
    }
131
132
    /**
133
     * @return string
134
     */
135 1
    public function getProcessFilename() {
        // Path of the marker file used by setDisabled() / getDisabled().
        $markerFile = $this->processFilename;

        return $markerFile;
    }
139
140
141
142
    /************************************
143
     *
144
     * Getting URLs based on Page TSconfig
145
     *
146
     ************************************/
147
148 23
    public function __construct()
    {
        // Collaborators and paths are taken from the TYPO3 globals / constants.
        $this->db = $GLOBALS['TYPO3_DB'];
        $this->backendUser = $GLOBALS['BE_USER'];
        $this->processFilename = PATH_site . 'typo3temp/tx_crawler.proc';

        // The extension configuration is stored serialized; anything that does not
        // unserialize to an array is replaced by an empty settings array.
        $storedSettings = unserialize($GLOBALS['TYPO3_CONF_VARS']['EXT']['extConf']['crawler']);
        if (!is_array($storedSettings)) {
            $storedSettings = array();
        }

        // read ext_em_conf_template settings and set
        $this->setExtensionSettings($storedSettings);

        // set defaults: a countInARun that converts to 0 falls back to 100
        $configuredCount = \TYPO3\CMS\Core\Utility\MathUtility::convertToPositiveInteger(
            $this->extensionSettings['countInARun']
        );
        if ($configuredCount == 0) {
            $this->extensionSettings['countInARun'] = 100;
        }

        // processLimit is forced into the range 1..99 (default 1)
        $this->extensionSettings['processLimit'] = \TYPO3\CMS\Core\Utility\MathUtility::forceIntegerInRange(
            $this->extensionSettings['processLimit'],
            1,
            99,
            1
        );
    }
167
168
    /**
169
     * Sets the extensions settings (unserialized pendant of $TYPO3_CONF_VARS['EXT']['extConf']['crawler']).
170
     *
171
     * @param array $extensionSettings
172
     * @return void
173
     */
174 31
    public function setExtensionSettings(array $extensionSettings)
    {
        // Replaces the whole settings array; nothing is merged with previous values.
        $this->extensionSettings = $extensionSettings;
    }
177
178
    /**
179
     * Check if the given page should be crawled
180
     *
181
     * @param array $pageRow
182
     * @return false|string false if the page should be crawled (not excluded), true / skipMessage if it should be skipped
183
     * @author Fabrizio Branca <[email protected]>
184
     */
185 6
    public function checkIfPageShouldBeSkipped(array $pageRow)
    {
        // Each check returns its skip message as soon as it matches; reaching the
        // end of the method means the page may be crawled (false).

        // if page is hidden (and crawling hidden pages is not enabled)
        if (!$this->extensionSettings['crawlHiddenPages'] && $pageRow['hidden']) {
            return 'Because page is hidden';
        }

        // doktypes 3 and 4 and everything from 199 upwards are rejected
        if (\TYPO3\CMS\Core\Utility\GeneralUtility::inList('3,4', $pageRow['doktype']) || $pageRow['doktype'] >= 199) {
            return 'Because doktype is not allowed';
        }

        // doktype lists registered in EXTCONF; the registration key is reported in the message
        if (is_array($GLOBALS['TYPO3_CONF_VARS']['EXTCONF']['crawler']['excludeDoktype'])) {
            foreach ($GLOBALS['TYPO3_CONF_VARS']['EXTCONF']['crawler']['excludeDoktype'] as $key => $doktypeList) {
                if (\TYPO3\CMS\Core\Utility\GeneralUtility::inList($doktypeList, $pageRow['doktype'])) {
                    return 'Doktype was excluded by "' . $key . '"';
                }
            }
        }

        // veto hook
        if (is_array($GLOBALS['TYPO3_CONF_VARS']['EXTCONF']['crawler']['pageVeto'])) {
            foreach ($GLOBALS['TYPO3_CONF_VARS']['EXTCONF']['crawler']['pageVeto'] as $key => $func) {
                $params = array(
                    'pageRow' => $pageRow
                );
                // expects "false" if page is ok and "true" or a skipMessage if this page should _not_ be crawled
                $veto = \TYPO3\CMS\Core\Utility\GeneralUtility::callUserFunction($func, $params, $this);
                if ($veto === false) {
                    continue;
                }

                // no need to execute other hooks once one of them returned a veto
                if (is_string($veto)) {
                    return $veto;
                }

                return 'Veto from hook "' . htmlspecialchars($key) . '"';
            }
        }

        return false;
    }
242
243
    /**
244
     * Wrapper method for getUrlsForPageId()
245
     * It returns an array of configurations and no urls!
246
     *
247
     * @param  array  $pageRow       Page record with at least dok-type and uid columns.
248
     * @param  string $skipMessage
249
     * @return array                 Result (see getUrlsForPageId())
250
     * @see getUrlsForPageId()
251
     */
252 2
    public function getUrlsForPageRow(array $pageRow, &$skipMessage = '')
    {
        $skipReason = $this->checkIfPageShouldBeSkipped($pageRow);

        if ($skipReason !== false) {
            // Page is skipped: report the reason through the by-reference argument
            // and return no configurations.
            $skipMessage = $skipReason;

            return array();
        }

        // Page may be crawled: collect its configurations, then clear the message.
        $configurations = $this->getUrlsForPageId($pageRow['uid']);
        $skipMessage = '';

        return $configurations;
    }
265
266
    /**
267
     * This method is used to count if there are ANY unprocessed queue entries
268
     * of a given page_id and the configuration which matches a given hash.
269
     * If there are none, we can skip the inner detail check
270
     *
271
     * @param  int    $uid
272
     * @param  string $configurationHash
273
     * @return boolean
274
     */
275
    protected function noUnprocessedQueueEntriesForPageWithConfigurationHashExist($uid, $configurationHash)
    {
        // The hash is quoted by the database layer and the uid is cast to integer
        // before both are placed into the WHERE clause.
        $quotedHash = $this->db->fullQuoteStr($configurationHash, 'tx_crawler_queue');
        $whereClause = "page_id=" . intval($uid) . " AND configuration_hash=" . $quotedHash . " AND exec_time=0";

        // exec_time=0 restricts the count to entries that have not been executed yet.
        $result = $this->db->exec_SELECTquery('count(*) as anz', 'tx_crawler_queue', $whereClause);
        $countRow = $this->db->sql_fetch_assoc($result);

        return ($countRow['anz'] == 0);
    }
282
283
    /**
284
     * Creates a list of URLs from input array (and submits them to queue if asked for)
285
     * See Web > Info module script + "indexed_search"'s crawler hook-client using this!
286
     *
287
     * @param    array        Information about URLs from pageRow to crawl.
288
     * @param    array        Page row
289
     * @param    integer        Unix time to schedule indexing to, typically time()
290
     * @param    integer        Number of requests per minute (creates the interleave between requests)
291
     * @param    boolean        If set, submits the URLs to queue
292
     * @param    boolean        If set (and submitcrawlUrls is false) will fill $downloadUrls with entries)
293
     * @param    array        Array which is passed by reference and contains the an id per url to secure we will not crawl duplicates
294
     * @param    array        Array which will be filled with URLS for download if flag is set.
295
     * @param    array        Array of processing instructions
296
     * @return    string        List of URLs (meant for display in backend module)
297
     *
298
     */
299
    public function urlListFromUrlArray(
    array $vv,
    array $pageRow,
    $scheduledTime,
    $reqMinute,
    $submitCrawlUrls,
    $downloadCrawlUrls,
    array &$duplicateTrack,
    array &$downloadUrls,
    array $incomingProcInstructions) {

        // Parameters, in order:
        //   $vv                       configuration entry; reads $vv['URLs'] and $vv['subCfg']
        //   $pageRow                  page row; only 'uid' is read here
        //   $scheduledTime            unix time to schedule indexing to
        //   $reqMinute                requests per minute, used as divisor for the interleave
        //   $submitCrawlUrls          if set, each new URL is passed to addUrl()
        //   $downloadCrawlUrls        if set (and $submitCrawlUrls is not), URLs are collected in $downloadUrls
        //   $duplicateTrack           by reference; keys of URLs already seen
        //   $downloadUrls             by reference; filled with URLs for download
        //   $incomingProcInstructions processing instructions handed to drawURLs_PIfilter()

        // Start with an empty string so the return value is always defined, even when
        // $vv['URLs'] is an empty array or every URL is rejected by the
        // processing-instruction filter.
        $urlList = '';

        // Evaluated once and reused inside the loop, so $urlObj is only ever used on the
        // path that created it.
        $useRealUrl = \TYPO3\CMS\Core\Utility\ExtensionManagementUtility::isLoaded('realurl') && $vv['subCfg']['realurl'];
        $urlObj = null;

        // realurl support (thanks to Ingo Renner)
        if ($useRealUrl) {

            /** @var tx_realurl $urlObj */
            $urlObj = \TYPO3\CMS\Core\Utility\GeneralUtility::makeInstance('tx_realurl');

            if (!empty($vv['subCfg']['baseUrl'])) {
                $urlParts = parse_url($vv['subCfg']['baseUrl']);
                $host = strtolower($urlParts['host']);
                $urlObj->host = $host;

                // First pass, finding configuration OR pointer string:
                $urlObj->extConf = isset($GLOBALS['TYPO3_CONF_VARS']['EXTCONF']['realurl'][$urlObj->host]) ? $GLOBALS['TYPO3_CONF_VARS']['EXTCONF']['realurl'][$urlObj->host] : $GLOBALS['TYPO3_CONF_VARS']['EXTCONF']['realurl']['_DEFAULT'];

                // If it turned out to be a string pointer, then look up the real config:
                if (is_string($urlObj->extConf)) {
                    $urlObj->extConf = is_array($GLOBALS['TYPO3_CONF_VARS']['EXTCONF']['realurl'][$urlObj->extConf]) ? $GLOBALS['TYPO3_CONF_VARS']['EXTCONF']['realurl'][$urlObj->extConf] : $GLOBALS['TYPO3_CONF_VARS']['EXTCONF']['realurl']['_DEFAULT'];
                }

            }

            // Fill in the pieces of $GLOBALS['TSFE'] that are still missing.
            if (!$GLOBALS['TSFE']->sys_page) {
                $GLOBALS['TSFE']->sys_page = \TYPO3\CMS\Core\Utility\GeneralUtility::makeInstance('TYPO3\CMS\Frontend\Page\PageRepository');
            }
            if (!$GLOBALS['TSFE']->csConvObj) {
                $GLOBALS['TSFE']->csConvObj = \TYPO3\CMS\Core\Utility\GeneralUtility::makeInstance('TYPO3\CMS\Core\Charset\CharsetConverter');
            }
            if (!$GLOBALS['TSFE']->tmpl->rootLine[0]['uid']) {
                $GLOBALS['TSFE']->tmpl->rootLine[0]['uid'] = $urlObj->extConf['pagePath']['rootpage_id'];
            }
        }

        if (is_array($vv['URLs']))    {
            $configurationHash     =    md5(serialize($vv));
            $skipInnerCheck     =    $this->noUnprocessedQueueEntriesForPageWithConfigurationHashExist($pageRow['uid'],$configurationHash);

            foreach($vv['URLs'] as $urlQuery)    {

                if ($this->drawURLs_PIfilter($vv['subCfg']['procInstrFilter'], $incomingProcInstructions))    {

                    // Calculate cHash:
                    if ($vv['subCfg']['cHash'])    {
                        /* @var $cacheHash \TYPO3\CMS\Frontend\Page\CacheHashCalculator */
                        $cacheHash = \TYPO3\CMS\Core\Utility\GeneralUtility::makeInstance('TYPO3\CMS\Frontend\Page\CacheHashCalculator');
                        $urlQuery .= '&cHash=' . $cacheHash->generateForParameters($urlQuery);
                    }

                    // Create key by which to determine unique-ness:
                    $uKey = $urlQuery.'|'.$vv['subCfg']['userGroups'].'|'.$vv['subCfg']['baseUrl'].'|'.$vv['subCfg']['procInstrFilter'];

                    // realurl support (thanks to Ingo Renner)
                    $urlQuery = 'index.php' . $urlQuery;
                    if ($useRealUrl) {
                        $params = array(
                            'LD' => array(
                                'totalURL' => $urlQuery
                            ),
                            'TCEmainHook' => true
                        );
                        $urlObj->encodeSpURL($params);
                        $urlQuery = $params['LD']['totalURL'];
                    }

                    // Scheduled time, rounded down to a full minute.
                    // NOTE(review): a $reqMinute of 0 makes this a division by zero - confirm callers never pass 0.
                    $schTime = $scheduledTime + round(count($duplicateTrack)*(60/$reqMinute));
                    $schTime = floor($schTime/60)*60;

                    // NOTE(review): $urlList is assigned (not appended) in each iteration, so the
                    // returned string only describes the last processed URL - confirm this is intended.
                    if (isset($duplicateTrack[$uKey])) {

                        // if the url key is registered just display it and do not resubmit it
                        $urlList = '<em><span class="typo3-dimmed">'.htmlspecialchars($urlQuery).'</span></em><br/>';

                    } else {

                        $urlList = '['.date('d.m.y H:i', $schTime).'] '.htmlspecialchars($urlQuery);
                        $this->urlList[] = '['.date('d.m.y H:i', $schTime).'] '.$urlQuery;

                        $theUrl = ($vv['subCfg']['baseUrl'] ? $vv['subCfg']['baseUrl'] : \TYPO3\CMS\Core\Utility\GeneralUtility::getIndpEnv('TYPO3_SITE_URL')) . $urlQuery;

                        // Submit for crawling!
                        if ($submitCrawlUrls)    {
                            $added = $this->addUrl(
                            $pageRow['uid'],
                            $theUrl,
                            $vv['subCfg'],
                            $scheduledTime,
                            $configurationHash,
                            $skipInnerCheck
                            );
                            if ($added === false) {
                                $urlList .= ' (Url already existed)';
                            }
                        } elseif ($downloadCrawlUrls)    {
                            $downloadUrls[$theUrl] = $theUrl;
                        }

                        $urlList .= '<br />';
                    }
                    $duplicateTrack[$uKey] = TRUE;
                }
            }
        } else {
            $urlList = 'ERROR - no URL generated';
        }

        return $urlList;
    }
417
418
    /**
419
     * Returns true if input processing instruction is among registered ones.
420
     *
421
     * @param  string $piString                     PI to test
422
     * @param  array  $incomingProcInstructions     Processing instructions
423
     * @return boolean                              TRUE if found
424
     */
425 5
    public function drawURLs_PIfilter($piString, array $incomingProcInstructions) {
        // Without any incoming processing instructions nothing is filtered out.
        if (empty($incomingProcInstructions)) {
            return TRUE;
        }

        // $piString is treated as a comma list; one matching incoming instruction is enough.
        foreach($incomingProcInstructions as $pi) {
            if (\TYPO3\CMS\Core\Utility\GeneralUtility::inList($piString, $pi)) {
                return TRUE;
            }
        }

        // Return an explicit boolean instead of falling off the end (which yielded NULL),
        // so the result matches the documented boolean return type.
        return FALSE;
    }
436
437
438
    public function getPageTSconfigForId($id)
    {
        // With a mount point set, the TSconfig is read for the id found after the
        // '-' in $this->MP; otherwise for the page id itself.
        if ($this->MP) {
            $mountPointParts = explode('-', $this->MP);
            $mountPointId = $mountPointParts[1];
            $pageTSconfig = \TYPO3\CMS\Backend\Utility\BackendUtility::getPagesTSconfig($mountPointId);
        } else {
            $pageTSconfig = \TYPO3\CMS\Backend\Utility\BackendUtility::getPagesTSconfig($id);
        }

        // Call a hook to alter configuration; the TSconfig is handed over by reference
        // inside one parameter array shared by all registered functions.
        $registeredHooks = $GLOBALS['TYPO3_CONF_VARS']['EXTCONF']['crawler']['getPageTSconfigForId'];
        if (is_array($registeredHooks)) {
            $params = array(
                'pageId' => $id,
                'pageTSConfig' => &$pageTSconfig
            );
            foreach ($registeredHooks as $userFunc) {
                \TYPO3\CMS\Core\Utility\GeneralUtility::callUserFunction($userFunc, $params, $this);
            }
        }

        return $pageTSconfig;
    }
459
460
    /**
461
     * This methods returns an array of configurations.
462
     * And no urls!
463
     *
464
     * @param  integer $id  Page ID
465
     * @return array        Configurations from pages and configuration records
466
     */
467
    protected function getUrlsForPageId($id)    {
468
469
        /**
470
         * Get configuration from tsConfig
471
         */
472
473
        // Get page TSconfig for page ID:
474
        $pageTSconfig = $this->getPageTSconfigForId($id);
475
476
        $res = array();
477
478
        if (is_array($pageTSconfig) && is_array($pageTSconfig['tx_crawler.']['crawlerCfg.']))    {
479
            $crawlerCfg = $pageTSconfig['tx_crawler.']['crawlerCfg.'];
480
481
            if (is_array($crawlerCfg['paramSets.']))    {
482
                foreach($crawlerCfg['paramSets.'] as $key => $values)    {
483
                    if (!is_array($values))    {
484
485
                        // Sub configuration for a single configuration string:
486
                        $subCfg = (array)$crawlerCfg['paramSets.'][$key.'.'];
487
                        $subCfg['key'] = $key;
488
489
                        if (strcmp($subCfg['procInstrFilter'],''))    {
490
                            $subCfg['procInstrFilter'] = implode(',',\TYPO3\CMS\Core\Utility\GeneralUtility::trimExplode(',',$subCfg['procInstrFilter']));
491
                        }
492
                        $pidOnlyList = implode(',',\TYPO3\CMS\Core\Utility\GeneralUtility::trimExplode(',',$subCfg['pidsOnly'],1));
0 ignored issues
show
Documentation introduced by
1 is of type integer, but the function expects a boolean.

It seems like the type of the argument is not accepted by the function/method which you are calling.

In some cases, in particular if PHP’s automatic type-juggling kicks in this might be fine. In other cases, however this might be a bug.

We suggest to add an explicit type cast like in the following example:

function acceptsInteger($int) { }

$x = '123'; // string "123"

// Instead of
acceptsInteger($x);

// we recommend to use
acceptsInteger((integer) $x);
Loading history...
493
494
                            // process configuration if it is not page-specific or if the specific page is the current page:
495
                        if (!strcmp($subCfg['pidsOnly'],'') || \TYPO3\CMS\Core\Utility\GeneralUtility::inList($pidOnlyList,$id))    {
496
497
                                // add trailing slash if not present
498
                            if (!empty($subCfg['baseUrl']) && substr($subCfg['baseUrl'], -1) != '/') {
499
                                $subCfg['baseUrl'] .= '/';
500
                            }
501
502
                                // Explode, process etc.:
503
                            $res[$key] = array();
504
                            $res[$key]['subCfg'] = $subCfg;
505
                            $res[$key]['paramParsed'] = $this->parseParams($values);
506
                            $res[$key]['paramExpanded'] = $this->expandParameters($res[$key]['paramParsed'],$id);
507
                            $res[$key]['origin'] = 'pagets';
508
509
                                // recognize MP value
510
                            if(!$this->MP){
511
                                $res[$key]['URLs'] = $this->compileUrls($res[$key]['paramExpanded'],array('?id='.$id));
512
                            } else {
513
                                $res[$key]['URLs'] = $this->compileUrls($res[$key]['paramExpanded'],array('?id='.$id.'&MP='.$this->MP));
514
                            }
515
                        }
516
                    }
517
                }
518
519
            }
520
        }
521
522
        /**
523
         * Get configuration from tx_crawler_configuration records
524
         */
525
526
            // get records along the rootline
527
        $rootLine = \TYPO3\CMS\Backend\Utility\BackendUtility::BEgetRootLine($id);
528
529
        foreach ($rootLine as $page) {
530
            $configurationRecordsForCurrentPage = \TYPO3\CMS\Backend\Utility\BackendUtility::getRecordsByField(
531
                'tx_crawler_configuration',
532
                'pid',
533
                intval($page['uid']),
534
                \TYPO3\CMS\Backend\Utility\BackendUtility::BEenableFields('tx_crawler_configuration') . \TYPO3\CMS\Backend\Utility\BackendUtility::deleteClause('tx_crawler_configuration')
535
            );
536
537
            if (is_array($configurationRecordsForCurrentPage)) {
538
                foreach ($configurationRecordsForCurrentPage as $configurationRecord) {
539
540
                        // check access to the configuration record
541
                    if (empty($configurationRecord['begroups']) || $GLOBALS['BE_USER']->isAdmin() || $this->hasGroupAccess($GLOBALS['BE_USER']->user['usergroup_cached_list'], $configurationRecord['begroups'])) {
542
543
                        $pidOnlyList = implode(',',\TYPO3\CMS\Core\Utility\GeneralUtility::trimExplode(',',$configurationRecord['pidsonly'],1));
0 ignored issues
show
Documentation introduced by
1 is of type integer, but the function expects a boolean.

It seems like the type of the argument is not accepted by the function/method which you are calling.

In some cases, in particular if PHP’s automatic type-juggling kicks in this might be fine. In other cases, however this might be a bug.

We suggest to add an explicit type cast like in the following example:

function acceptsInteger($int) { }

$x = '123'; // string "123"

// Instead of
acceptsInteger($x);

// we recommend to use
acceptsInteger((integer) $x);
Loading history...
544
545
                            // process configuration if it is not page-specific or if the specific page is the current page:
546
                        if (!strcmp($configurationRecord['pidsonly'],'') || \TYPO3\CMS\Core\Utility\GeneralUtility::inList($pidOnlyList,$id)) {
547
                            $key = $configurationRecord['name'];
548
549
                                // don't overwrite previously defined paramSets
550
                            if (!isset($res[$key])) {
551
552
                                    /* @var $TSparserObject \TYPO3\CMS\Core\TypoScript\Parser\TypoScriptParser */
553
                                $TSparserObject = \TYPO3\CMS\Core\Utility\GeneralUtility::makeInstance('TYPO3\CMS\Core\TypoScript\Parser\TypoScriptParser');
554
                                $TSparserObject->parse($configurationRecord['processing_instruction_parameters_ts']);
555
556
                                $subCfg = array(
557
                                    'procInstrFilter' => $configurationRecord['processing_instruction_filter'],
558
                                    'procInstrParams.' => $TSparserObject->setup,
559
                                    'baseUrl' => $this->getBaseUrlForConfigurationRecord($configurationRecord['base_url'], $configurationRecord['sys_domain_base_url']),
560
                                    'realurl' => $configurationRecord['realurl'],
561
                                    'cHash' => $configurationRecord['chash'],
562
                                    'userGroups' => $configurationRecord['fegroups'],
563
                                    'exclude' => $configurationRecord['exclude'],
564
                                    'key' => $key,
565
                                );
566
567
                                    // add trailing slash if not present
568
                                if (!empty($subCfg['baseUrl']) && substr($subCfg['baseUrl'], -1) != '/') {
569
                                    $subCfg['baseUrl'] .= '/';
570
                                }
571
                                if (!in_array($id, $this->expandExcludeString($subCfg['exclude']))) {
572
                                    $res[$key] = array();
573
                                    $res[$key]['subCfg'] = $subCfg;
574
                                    $res[$key]['paramParsed'] = $this->parseParams($configurationRecord['configuration']);
575
                                    $res[$key]['paramExpanded'] = $this->expandParameters($res[$key]['paramParsed'], $id);
576
                                    $res[$key]['URLs'] = $this->compileUrls($res[$key]['paramExpanded'], array('?id=' . $id));
577
                                    $res[$key]['origin'] = 'tx_crawler_configuration_'.$configurationRecord['uid'];
578
                                }
579
                            }
580
                        }
581
                    }
582
                }
583
            }
584
        }
585
586
        if (is_array($GLOBALS['TYPO3_CONF_VARS']['EXTCONF']['crawler']['processUrls']))    {
587
            foreach($GLOBALS['TYPO3_CONF_VARS']['EXTCONF']['crawler']['processUrls'] as $func)    {
588
                $params = array(
589
                    'res' => &$res,
590
                );
591
                \TYPO3\CMS\Core\Utility\GeneralUtility::callUserFunction($func, $params, $this);
592
            }
593
        }
594
595
        return $res;
596
    }
597
598
    /**
     * Resolves the base URL to use for a crawler configuration record.
     *
     * If $sysDomainUid is greater than zero and a sys_domain record with that uid
     * (restricted by BEenableFields() and deleteClause()) has a non-empty domainName,
     * 'http://' followed by that domain name is returned. In every other case the
     * given $baseUrl is returned unchanged.
     *
     * @param  string   $baseUrl       Fallback base URL
     * @param  integer  $sysDomainUid  Uid of a sys_domain record; values <= 0 mean "none"
     * @return string
     */
    protected function getBaseUrlForConfigurationRecord($baseUrl, $sysDomainUid) {
        $sysDomainUid = intval($sysDomainUid);
        if ($sysDomainUid <= 0) {
            return $baseUrl;
        }

        $res = $this->db->exec_SELECTquery(
            '*',
            'sys_domain',
            'uid = '.$sysDomainUid .
            \TYPO3\CMS\Backend\Utility\BackendUtility::BEenableFields('sys_domain') .
            \TYPO3\CMS\Backend\Utility\BackendUtility::deleteClause('sys_domain')
        );

        $row = false;
        if ($res) {
            $row = $this->db->sql_fetch_assoc($res);
                // only one row is needed, so release the result set right away
            $this->db->sql_free_result($res);
        }

            // $row is not an array when the query failed or no record matched
        if (is_array($row) && isset($row['domainName']) && $row['domainName'] != '') {
            return 'http://'.$row['domainName'];
        }

        return $baseUrl;
    }
623
624
    /**
     * Collects the names of the crawler configurations found for a page branch.
     *
     * Names are gathered from two places:
     * - the keys of the "tx_crawler.crawlerCfg.paramSets." section of the page TSconfig
     *   of $rootid (only entries whose value is an array; a trailing dot is stripped)
     * - the "name" field of tx_crawler_configuration records whose pid is a page of the
     *   rootline of $rootid or a page of the tree read below $rootid with the given depth
     *
     * @param  integer $rootid  Uid of the page the branch starts at
     * @param  integer $depth   Depth passed to the page tree when reading sub pages
     * @return array            List of configuration names (duplicates are not removed)
     */
    function getConfigurationsForBranch($rootid, $depth) {

        $configurationsForBranch = array();

            // 1) parameter sets defined in page TSconfig
        $pageTSconfig = $this->getPageTSconfigForId($rootid);
        if (is_array($pageTSconfig)
            && isset($pageTSconfig['tx_crawler.']['crawlerCfg.']['paramSets.'])
            && is_array($pageTSconfig['tx_crawler.']['crawlerCfg.']['paramSets.'])
        ) {
            foreach ($pageTSconfig['tx_crawler.']['crawlerCfg.']['paramSets.'] as $key => $value) {
                if (!is_array($value)) {
                    continue;
                }
                $configurationsForBranch[] = substr($key, -1) == '.' ? substr($key, 0, -1) : $key;
            }
        }

            // 2) configuration records: collect the page ids of the rootline ...
        $pids = array();
        $rootLine = \TYPO3\CMS\Backend\Utility\BackendUtility::BEgetRootLine($rootid);
        foreach ($rootLine as $node) {
            $pids[] = intval($node['uid']);
        }

            // ... and of the sub tree the current backend user may see
        /* @var $tree \TYPO3\CMS\Backend\Tree\View\PageTreeView */
        $tree = \TYPO3\CMS\Core\Utility\GeneralUtility::makeInstance('TYPO3\CMS\Backend\Tree\View\PageTreeView');
        $perms_clause = $GLOBALS['BE_USER']->getPagePermsClause(1);
        $tree->init('AND ' . $perms_clause);
        $tree->getTree($rootid, $depth, '');
        foreach ($tree->tree as $node) {
            $pids[] = intval($node['row']['uid']);
        }

            // "pid IN ()" would be invalid SQL, so stop here if no page id was found
        $pids = array_unique($pids);
        if (empty($pids)) {
            return $configurationsForBranch;
        }

        $res = $this->db->exec_SELECTquery(
            '*',
            'tx_crawler_configuration',
            'pid IN ('.implode(',', $pids).') '.
            \TYPO3\CMS\Backend\Utility\BackendUtility::BEenableFields('tx_crawler_configuration') .
            \TYPO3\CMS\Backend\Utility\BackendUtility::deleteClause('tx_crawler_configuration').' '.
            \TYPO3\CMS\Backend\Utility\BackendUtility::versioningPlaceholderClause('tx_crawler_configuration').' '
        );

        while ($row = $this->db->sql_fetch_assoc($res)) {
            $configurationsForBranch[] = $row['name'];
        }
        $this->db->sql_free_result($res);

        return $configurationsForBranch;
    }
669
670
    /**
     * Tells whether one of a user's groups is allowed to access an item.
     * (e.g. get the group list of the current logged in user from $GLOBALS['TSFE']->gr_list)
     *
     * @see \TYPO3\CMS\Frontend\Page\PageRepository::getMultipleGroupsWhereClause()
     * @param  string $groupList    Comma-separated list of (fe_)group UIDs from a user
     * @param  string $accessList   Comma-separated list of (fe_)group UIDs of the item to access
     * @return bool                 TRUE if the access list is empty or contains at least one of the user's group UIDs
     * @author Fabrizio Branca <[email protected]>
     * @since 2009-01-19
     */
    function hasGroupAccess($groupList, $accessList) {
            // an item without access restriction is open to everybody
        if (empty($accessList)) {
            return true;
        }

        $accessGranted = false;
        $userGroupUids = \TYPO3\CMS\Core\Utility\GeneralUtility::intExplode(',', $groupList);
        foreach ($userGroupUids as $userGroupUid) {
                // the first matching group is sufficient
            if (\TYPO3\CMS\Core\Utility\GeneralUtility::inList($accessList, $userGroupUid)) {
                $accessGranted = true;
                break;
            }
        }

        return $accessGranted;
    }
692
693
    /**
     * Parse GET vars of input Query into array with key=>value pairs
     *
     * The query is split at "&", each part is split at its first "=" and both sides are
     * rawurldecode()d. Parts with an empty name are skipped; a part without "=" is kept
     * with an empty string as value. If a name occurs more than once the last one wins.
     *
     * @param  string  $inputQuery  Input query string
     * @return array                Keys are Get var names, values are the values of the GET vars.
     */
    function parseParams($inputQuery) {
        $paramKeyValues = array();

        foreach (explode('&', (string)$inputQuery) as $paramAndValue) {
                // pad to two elements so a bare "name" (no "=") yields an empty value
                // instead of an undefined offset
            list($p, $v) = array_pad(explode('=', $paramAndValue, 2), 2, '');
            if (strlen($p)) {
                $paramKeyValues[rawurldecode($p)] = rawurldecode($v);
            }
        }

        return $paramKeyValues;
    }
713
714
    /**
     * Will expand the parameters configuration to individual values. This follows a certain syntax of the value of each parameter.
     * Syntax of values:
     * - Basically: If the value is wrapped in [...] it will be expanded according to the following syntax, otherwise the value is taken literally
     * - Configuration is split by "|" and the parts are processed individually and finally added together
     * - For each configuration part:
     *         - "[int]-[int]" = Integer range, will be expanded to all values in between, values included, starting from low to high (max. 1000). Example "1-34" or "-40--30"
     *         - "_TABLE:[TCA table name];[_PID:[optional page id, default is current page]];[_ENABLELANG:1]" = Look up of table records from PID, filtering out deleted records. Example "_TABLE:tt_content; _PID:123"
     *        _ENABLELANG:1 picks only original records without their language overlays
     *        Further sub parts read by the code: _PIDFIELD (default "pid"), _FIELD (default "uid"), _WHERE and _ADDTABLE (both appended to the query as given)
     *         - Default: Literal value
     * After each part the hook $GLOBALS['TYPO3_CONF_VARS']['SC_OPTIONS']['crawler/class.tx_crawler_lib.php']['expandParameters'] is called.
     *
     * @param    array        $paramArray   Array with key (GET var name) and values (value of GET var which is configuration for expansion)
     * @param    integer      $pid          Current page ID
     * @return    array        Array with key (GET var name) with the value being an array of all possible values for that key.
     */
    function expandParameters($paramArray, $pid)    {
        global $TCA;

            // Traverse parameter names:
        foreach ($paramArray as $p => $v) {
            $v = trim($v);

                // Not encapsulated in square brackets: the literal value is the only value
            if (substr($v, 0, 1) !== '[' || substr($v, -1) !== ']') {
                $paramArray[$p] = array($v);
                continue;
            }

                // Take the configuration inside the brackets and reset the paramArray value as an array.
            $v = substr($v, 1, -1);
            $paramArray[$p] = array();

                // Explode parts and traverse them:
            foreach (explode('|', $v) as $pV) {

                    // Look for integer range: (fx. 1-34 or -40--30 // reads minus 40 to minus 30)
                if (preg_match('/^(-?[0-9]+)\s*-\s*(-?[0-9]+)$/', trim($pV), $reg)) {

                        // Swap if first is larger than last:
                    if ($reg[1] > $reg[2]) {
                        $temp = $reg[2];
                        $reg[2] = $reg[1];
                        $reg[1] = $temp;
                    }

                        // Traverse range, add values:
                    $runAwayBrake = 1000;    // Limit to size of range!
                    for ($a = $reg[1]; $a <= $reg[2]; $a++) {
                        $paramArray[$p][] = $a;
                        $runAwayBrake--;
                        if ($runAwayBrake <= 0) {
                            break;
                        }
                    }
                } elseif (substr(trim($pV), 0, 7) == '_TABLE:') {

                        // Parse "key:value" sub parts; a sub part without ":" gets a NULL value
                    $subpartParams = array();
                    foreach (\TYPO3\CMS\Core\Utility\GeneralUtility::trimExplode(';', $pV) as $spV) {
                        list($pKey, $pVal) = array_pad(\TYPO3\CMS\Core\Utility\GeneralUtility::trimExplode(':', $spV), 2, NULL);
                        $subpartParams[$pKey] = $pVal;
                    }
                    $table = isset($subpartParams['_TABLE']) ? $subpartParams['_TABLE'] : '';

                        // Table exists:
                    if (isset($TCA[$table])) {
                        $lookUpPid = isset($subpartParams['_PID']) ? intval($subpartParams['_PID']) : $pid;
                        $pidField = isset($subpartParams['_PIDFIELD']) ? trim($subpartParams['_PIDFIELD']) : 'pid';
                        $where = isset($subpartParams['_WHERE']) ? $subpartParams['_WHERE'] : '';
                        $addTable = isset($subpartParams['_ADDTABLE']) ? $subpartParams['_ADDTABLE'] : '';

                        $fieldName = !empty($subpartParams['_FIELD']) ? $subpartParams['_FIELD'] : 'uid';

                            // Only "uid" or a column configured in TCA may be selected
                        if ($fieldName === 'uid' || !empty($TCA[$table]['columns'][$fieldName])) {

                            $andWhereLanguage = '';
                            $transOrigPointerField = isset($TCA[$table]['ctrl']['transOrigPointerField'])
                                ? $TCA[$table]['ctrl']['transOrigPointerField']
                                : '';

                            if (!empty($subpartParams['_ENABLELANG']) && $transOrigPointerField) {
                                $andWhereLanguage = ' AND ' . $this->db->quoteStr($transOrigPointerField, $table) .' <= 0 ';
                            }

                            $where = $this->db->quoteStr($pidField, $table) .'='.intval($lookUpPid) . ' ' .
                                $andWhereLanguage . $where;

                                // Rows are indexed by $fieldName, so the array keys are the wanted values
                            $rows = $this->db->exec_SELECTgetRows(
                                $fieldName,
                                $table . $addTable,
                                $where . \TYPO3\CMS\Backend\Utility\BackendUtility::deleteClause($table),
                                '',
                                '',
                                '',
                                $fieldName
                            );

                            if (is_array($rows)) {
                                $paramArray[$p] = array_merge($paramArray[$p], array_keys($rows));
                            }
                        }
                    }
                } else {    // Just add value:
                    $paramArray[$p][] = $pV;
                }

                    // Hook for processing own expandParameters place holder
                if (isset($GLOBALS['TYPO3_CONF_VARS']['SC_OPTIONS']['crawler/class.tx_crawler_lib.php']['expandParameters'])
                    && is_array($GLOBALS['TYPO3_CONF_VARS']['SC_OPTIONS']['crawler/class.tx_crawler_lib.php']['expandParameters'])
                ) {
                    $_params = array(
                        'pObj' => &$this,
                        'paramArray' => &$paramArray,
                        'currentKey' => $p,
                        'currentValue' => $pV,
                        'pid' => $pid
                    );
                    foreach ($GLOBALS['TYPO3_CONF_VARS']['SC_OPTIONS']['crawler/class.tx_crawler_lib.php']['expandParameters'] as $key => $_funcRef) {
                        \TYPO3\CMS\Core\Utility\GeneralUtility::callUserFunction($_funcRef, $_params, $this);
                    }
                }
            }

                // Make unique set of values and sort array by key:
            $paramArray[$p] = array_unique($paramArray[$p]);
            ksort($paramArray);
        }

        return $paramArray;
    }
839
840
    /**
     * Compiling URLs from parameter array (output of expandParameters())
     * Without a limit the number of URLs is the product of the number of parameter values for each key.
     *
     * For every recursion level the first parameter is taken off $paramArray and each URL is
     * combined with each of its values ("&name=value", both raw-URL-encoded; an empty value
     * leaves the URL untouched). Once the list built on a level holds more than
     * 'maxCompileUrls' entries (extension setting, forced into 1..1000000000, default 10000)
     * the values loop for the current URL is left.
     *
     * NOTE(review): the break only leaves the inner loop, so every remaining base URL still
     * contributes one more URL. Confirm whether a hard limit was intended.
     *
     * @param  array  $paramArray   Output of expandParameters(): Array with keys (GET var names) and for each an array of values
     * @param  array  $urls         URLs accumulated in this array (for recursion)
     * @return array                URLs accumulated; $urls is returned unchanged if $paramArray is empty or $urls is not an array
     */
    public function compileUrls($paramArray, $urls = array()) {

            // nothing (left) to expand
        if (!count($paramArray) || !is_array($urls)) {
            return $urls;
        }

            // the limit does not change while looping, so resolve it once
        $maxCompileUrls = \TYPO3\CMS\Core\Utility\MathUtility::forceIntegerInRange($this->extensionSettings['maxCompileUrls'], 1, 1000000000, 10000);

            // shift first off stack:
        reset($paramArray);
        $varName = key($paramArray);
        $valueSet = array_shift($paramArray);

            // Traverse value set:
        $newUrls = array();
        foreach ($urls as $url) {
            foreach ($valueSet as $val) {
                $newUrls[] = $url.(strcmp($val,'') ? '&'.rawurlencode($varName).'='.rawurlencode($val) : '');

                if (count($newUrls) > $maxCompileUrls) {
                    break;
                }
            }
        }

            // continue with the remaining parameters
        return $this->compileUrls($paramArray, $newUrls);
    }
873
874
    /************************************
875
     *
876
     * Crawler log
877
     *
878
     ************************************/
879
880
    /**
     * Reads - or flushes - the crawler queue entries of a page
     *
     * @param  integer $id              Page ID for which to look up log entries.
     * @param  string  $filter          Filter: "pending" => entries with exec_time=0, "finished" => entries with exec_time>0, anything else => all entries
     * @param  boolean $doFlush         If TRUE, the matching entries are DELETED(!) instead of selected and an empty array is returned
     * @param  boolean $doFullFlush     If TRUE (together with $doFlush), entries of all pages are flushed; the filter still applies
     * @param  integer $itemsPerPage    Limit the amount of entries per page default is 10; values <= 0 disable the limit
     * @return array
     */
    public function getLogEntriesForPageId($id, $filter = '', $doFlush = FALSE, $doFullFlush = FALSE, $itemsPerPage = 10) {
        // FIXME: Write Unit tests for Filters
        if ($filter == 'pending') {
            $execTimeConstraint = ' AND exec_time=0';
        } elseif ($filter == 'finished') {
            $execTimeConstraint = ' AND exec_time>0';
        } else {
            $execTimeConstraint = '';
        }

        $pageConstraint = 'page_id=' . intval($id);

        // FIXME: Write unit test that ensures that the right records are deleted.
        if ($doFlush) {
            $flushScope = $doFullFlush ? '1=1' : $pageConstraint;
            $this->flushQueue($flushScope . $execTimeConstraint);
            return array();
        }

        $limit = intval($itemsPerPage) > 0 ? intval($itemsPerPage) : '';

        return $this->db->exec_SELECTgetRows(
            '*',
            'tx_crawler_queue',
            $pageConstraint . $execTimeConstraint,
            '',
            'scheduled DESC',
            $limit
        );
    }
915
916
    /**
     * Reads - or flushes - the crawler queue entries of a set
     *
     * @param    integer        $set_id         Set ID for which to look up log entries.
     * @param    string         $filter         Filter: "pending" => entries with exec_time=0, "finished" => entries with exec_time>0, anything else => all entries
     * @param    boolean        $doFlush        If TRUE, the matching entries are DELETED(!) instead of selected and an empty array is returned
     * @param    boolean        $doFullFlush    If TRUE (together with $doFlush), flushQueue() is called without any condition, i.e. the filter is NOT applied
     * @param    integer        $itemsPerPage   Limit the amount of entries per page default is 10; values <= 0 disable the limit
     * @return    array
     */
    public function getLogEntriesForSetId($set_id,$filter='',$doFlush=FALSE, $doFullFlush=FALSE, $itemsPerPage=10)    {
        // FIXME: Write Unit tests for Filters
        if ($filter == 'pending') {
            $execTimeConstraint = ' AND exec_time=0';
        } elseif ($filter == 'finished') {
            $execTimeConstraint = ' AND exec_time>0';
        } else {
            $execTimeConstraint = '';
        }

        $setConstraint = 'set_id='.intval($set_id).$execTimeConstraint;

        // FIXME: Write unit test that ensures that the right records are deleted.
        if ($doFlush) {
                // an empty condition makes flushQueue() remove every queue entry
            $flushCondition = $doFullFlush ? '' : $setConstraint;
            $this->flushQueue($flushCondition);
            return array();
        }

        $limit = intval($itemsPerPage) > 0 ? intval($itemsPerPage) : '';

        return $this->db->exec_SELECTgetRows(
            '*',
            'tx_crawler_queue',
            $setConstraint,
            '',
            'scheduled DESC',
            $limit
        );
    }
949
950
    /**
     * Removes queue entries
     *
     * If an observer is registered for the 'queueEntryFlush' event, one event is posted
     * per affected set_id before the deletion, carrying the uid/set_id rows of the entries
     * about to be removed.
     *
     * @param  string $where    SQL related filter for the entries which should be removed; an empty string removes all entries
     * @return void
     */
    protected function flushQueue($where='') {

        $realWhere = strlen($where)>0?$where:'1=1';

        if (tx_crawler_domain_events_dispatcher::getInstance()->hasObserver('queueEntryFlush')) {
            $groups = $this->db->exec_SELECTgetRows('DISTINCT set_id','tx_crawler_queue',$realWhere);

                // the result is not an array when the query failed
            if (is_array($groups)) {
                foreach ($groups as $group) {
                        // set_id is numeric: compare as integer instead of a double-quoted string,
                        // and keep the caller's condition in parentheses so the AND binds as intended
                    $entriesOfSet = $this->db->exec_SELECTgetRows(
                        'uid, set_id',
                        'tx_crawler_queue',
                        '('.$realWhere.') AND set_id='.intval($group['set_id'])
                    );
                    tx_crawler_domain_events_dispatcher::getInstance()->post('queueEntryFlush', $group['set_id'], $entriesOfSet);
                }
            }
        }

        $this->db->exec_DELETEquery('tx_crawler_queue', $realWhere);
    }
969
970
    /**
     * Adding call back entries to log (called from hooks typically, see indexed search class "class.crawler.php"
     *
     * The call back reference is stored in the parameters under the key '_CALLBACKOBJ'
     * and the serialized parameters are inserted as a new tx_crawler_queue row.
     *
     * @param    integer        $setId      Set ID
     * @param    array          $params     Parameters to pass to call back function (anything but an array is replaced by an empty array)
     * @param    string         $callBack   Call back object reference, eg. 'EXT:indexed_search/class.crawler.php:&tx_indexedsearch_crawler'
     * @param    integer        $page_id    Page ID to attach it to
     * @param    integer        $schedule   Time at which to activate; 0 means the current time
     * @return    void
     */
    function addQueueEntry_callBack($setId,$params,$callBack,$page_id=0,$schedule=0) {

        $callBackParameters = is_array($params) ? $params : array();
        $callBackParameters['_CALLBACKOBJ'] = $callBack;

        $pageUid = intval($page_id);
        $serializedParameters = serialize($callBackParameters);

            // fall back to "now" if no schedule time was given
        $scheduledTime = intval($schedule);
        if (!$scheduledTime) {
            $scheduledTime = $this->getCurrentTime();
        }

            // Compile value array and store it:
        $this->db->exec_INSERTquery(
            'tx_crawler_queue',
            array(
                'page_id' => $pageUid,
                'parameters' => $serializedParameters,
                'scheduled' => $scheduledTime,
                'exec_time' => 0,
                'set_id' => intval($setId),
                'result_data' => '',
            )
        );
    }
997
998
999
1000
1001
1002
1003
1004
1005
1006
1007
1008
    /************************************
1009
     *
1010
     * URL setting
1011
     *
1012
     ************************************/
1013
1014
    /**
1015
     * Setting a URL for crawling:
1016
     *
1017
     * @param    integer        Page ID
1018
     * @param    string        Complete URL
1019
     * @param    array        Sub configuration array (from TS config)
1020
     * @param    integer        Scheduled-time
1021
     * @param     string        (optional) configuration hash
1022
     * @param     bool        (optional) skip inner duplication check
1023
     * @return    bool        true if the url was added, false if it already existed
1024
     */
1025
    function addUrl (
0 ignored issues
show
Best Practice introduced by
It is generally recommended to explicitly declare the visibility for methods.

Adding explicit visibility (private, protected, or public) is generally recommend to communicate to other developers how, and from where this method is intended to be used.

Loading history...
1026
        $id,
1027
        $url,
1028
        array $subCfg,
1029
        $tstamp,
1030
        $configurationHash='',
1031
        $skipInnerDuplicationCheck=false
1032
    ) {
1033
1034
        $urlAdded = false;
1035
1036
            // Creating parameters:
1037
        $parameters = array(
1038
            'url' => $url
1039
        );
1040
1041
            // fe user group simulation:
1042
        $uGs = implode(',',array_unique(\TYPO3\CMS\Core\Utility\GeneralUtility::intExplode(',',$subCfg['userGroups'],1)));
0 ignored issues
show
Documentation introduced by
1 is of type integer, but the function expects a boolean.

It seems like the type of the argument is not accepted by the function/method which you are calling.

In some cases, in particular if PHP’s automatic type-juggling kicks in this might be fine. In other cases, however this might be a bug.

We suggest to add an explicit type cast like in the following example:

function acceptsInteger($int) { }

$x = '123'; // string "123"

// Instead of
acceptsInteger($x);

// we recommend to use
acceptsInteger((integer) $x);
Loading history...
1043
        if ($uGs)    {
1044
            $parameters['feUserGroupList'] = $uGs;
1045
        }
1046
1047
            // Setting processing instructions
1048
        $parameters['procInstructions'] = \TYPO3\CMS\Core\Utility\GeneralUtility::trimExplode(',',$subCfg['procInstrFilter']);
1049
        if (is_array($subCfg['procInstrParams.']))    {
1050
            $parameters['procInstrParams'] = $subCfg['procInstrParams.'];
1051
        }
1052
1053
1054
            // Compile value array:
1055
        $parameters_serialized = serialize($parameters);
1056
        $fieldArray = array(
1057
            'page_id' => intval($id),
1058
            'parameters' => $parameters_serialized,
1059
            'parameters_hash' => \TYPO3\CMS\Core\Utility\GeneralUtility::shortMD5($parameters_serialized),
1060
            'configuration_hash' => $configurationHash,
1061
            'scheduled' => $tstamp,
1062
            'exec_time' => 0,
1063
            'set_id' => intval($this->setID),
1064
            'result_data' => '',
1065
            'configuration' => $subCfg['key'],
1066
        );
1067
1068
        if ($this->registerQueueEntriesInternallyOnly)    {
0 ignored issues
show
Bug Best Practice introduced by
The expression $this->registerQueueEntriesInternallyOnly of type array is implicitly converted to a boolean; are you sure this is intended? If so, consider using ! empty($expr) instead to make it clear that you intend to check for an array without elements.

This check marks implicit conversions of arrays to boolean values in a comparison. While in PHP an empty array is considered to be equal (but not identical) to false, this is not always apparent.

Consider making the comparison explicit by using empty(..) or ! empty(...) instead.

Loading history...
1069
                //the entries will only be registered and not stored to the database
1070
            $this->queueEntries[] = $fieldArray;
1071
        } else {
1072
1073
            if(!$skipInnerDuplicationCheck){
1074
                    // check if there is already an equal entry
1075
                $rows = $this->getDuplicateRowsIfExist($tstamp,$fieldArray);
1076
            }
1077
1078
            if (count($rows) == 0) {
1079
                $this->db->exec_INSERTquery('tx_crawler_queue', $fieldArray);
1080
                $uid = $this->db->sql_insert_id();
1081
                $rows[] = $uid;
0 ignored issues
show
Bug introduced by
The variable $rows does not seem to be defined for all execution paths leading up to this point.

If you define a variable conditionally, it can happen that it is not defined for all execution paths.

Let’s take a look at an example:

function myFunction($a) {
    switch ($a) {
        case 'foo':
            $x = 1;
            break;

        case 'bar':
            $x = 2;
            break;
    }

    // $x is potentially undefined here.
    echo $x;
}

In the above example, the variable $x is defined if you pass “foo” or “bar” as argument for $a. However, since the switch statement has no default case statement, if you pass any other value, the variable $x would be undefined.

Available Fixes

  1. Check for existence of the variable explicitly:

    function myFunction($a) {
        switch ($a) {
            case 'foo':
                $x = 1;
                break;
    
            case 'bar':
                $x = 2;
                break;
        }
    
        if (isset($x)) { // Make sure it's always set.
            echo $x;
        }
    }
    
  2. Define a default value for the variable:

    function myFunction($a) {
        $x = ''; // Set a default which gets overridden for certain paths.
        switch ($a) {
            case 'foo':
                $x = 1;
                break;
    
            case 'bar':
                $x = 2;
                break;
        }
    
        echo $x;
    }
    
  3. Add a value for the missing path:

    function myFunction($a) {
        switch ($a) {
            case 'foo':
                $x = 1;
                break;
    
            case 'bar':
                $x = 2;
                break;
    
            // We add support for the missing case.
            default:
                $x = '';
                break;
        }
    
        echo $x;
    }
    
Loading history...
1082
                $urlAdded = true;
1083
                tx_crawler_domain_events_dispatcher::getInstance()->post('urlAddedToQueue',$this->setID,array('uid' => $uid, 'fieldArray' => $fieldArray));
1084
            }else{
1085
                tx_crawler_domain_events_dispatcher::getInstance()->post('duplicateUrlInQueue',$this->setID,array('rows' => $rows, 'fieldArray' => $fieldArray));
1086
            }
1087
        }
1088
1089
        return $urlAdded;
1090
    }
1091
1092
    /**
     * Determines duplicates of a queue entry: unprocessed rows for the same page and the same
     * parameters hash that are scheduled for a matching time.
     *
     * If the timestamp is now or in the past, every unprocessed entry scheduled up to now counts
     * as a duplicate (with the "enableTimeslot" setting also entries up to 100 seconds ahead).
     * If the timestamp is in the future, only entries with exactly the same schedule time count.
     *
     * @param int $tstamp Schedule time of the entry that is about to be queued
     * @param array $fieldArray Queue record; "page_id" and "parameters_hash" are read
     * @author Fabrizio Branca
     * @author Timo Schmidt
     * @return array List of qid values of the duplicate rows, empty if there are none
     */
    protected function getDuplicateRowsIfExist($tstamp,$fieldArray){
        $rows = array();

            // The timestamp ends up in the SQL string below, so make sure it is a plain integer
        $tstamp = (int)$tstamp;
        $currentTime = $this->getCurrentTime();

        if ($tstamp > $currentTime) {
                //entry with a timestamp in the future need to have the same schedule time
            $where = 'scheduled = ' . $tstamp;
        } elseif (!empty($this->extensionSettings['enableTimeslot'])) {
                //entry is scheduled with "now": accept a window around the current time
            $timeBegin     = $currentTime - 100;
            $timeEnd     = $currentTime + 100;
            $where         = ' ((scheduled BETWEEN '.$timeBegin.' AND '.$timeEnd.' ) OR scheduled <= '. $currentTime.') ';
        } else {
                //entry is scheduled with "now"
            $where = 'scheduled <= ' . $currentTime;
        }

        $result = $this->db->exec_SELECTgetRows(
            'qid',
            'tx_crawler_queue',
            $where.
            ' AND NOT exec_time' .
            ' AND NOT process_id '.
            ' AND page_id='.intval($fieldArray['page_id']).
            ' AND parameters_hash = ' . $this->db->fullQuoteStr($fieldArray['parameters_hash'], 'tx_crawler_queue')
        );

            // exec_SELECTgetRows() does not return an array on failure; treat that as "no duplicates"
        if (is_array($result)) {
            foreach ($result as $value) {
                $rows[] = $value['qid'];
            }
        }

        return $rows;
    }
1143
1144
    /**
     * Provides the current system time as a Unix timestamp.
     *
     * @author Timo Schmidt <[email protected]>
     * @return int Seconds since the Unix epoch, as reported by time()
     */
    public function getCurrentTime(){
        $now = time();

        return $now;
    }
1153
1154
1155
1156
    /************************************
1157
     *
1158
     * URL reading
1159
     *
1160
     ************************************/
1161
1162
    /**
     * Read URL for single queue entry
     *
     * Loads the queue record, writes exec_time to it (which locks the record), executes it via
     * readUrl_exec() and finally stores the serialized result in the result_data field.
     *
     * @param integer $queueId
     * @param boolean $force If set, will process even if exec_time has been set!
     * @return integer|null Status flags; CLI_STATUS_POLLABLE_PROCESSED is OR-ed in when a registered
     *                      pollable instruction reported success. NULL if no queue record matched.
     */
    public function readUrl($queueId, $force = FALSE) {
        $ret = 0;
        if ($this->debugMode) {
            \TYPO3\CMS\Core\Utility\GeneralUtility::devlog('crawler-readurl start ' . microtime(true), __FUNCTION__);
        }

        // Get entry (an empty or failed result yields no record instead of an undefined offset):
        $queueRows = $this->db->exec_SELECTgetRows('*', 'tx_crawler_queue',
            'qid=' . intval($queueId) . ($force ? '' : ' AND exec_time=0 AND process_scheduled > 0'));
        $queueRec = is_array($queueRows) ? reset($queueRows) : FALSE;

        if (!is_array($queueRec)) {
            return;
        }

        $pageUidRootTypoScript = \AOE\Crawler\Utility\TypoScriptUtility::getPageUidForTypoScriptRootTemplateInRootLine((int)$queueRec['page_id']);
        $this->initTSFE((int)$pageUidRootTypoScript);

        \AOE\Crawler\Utility\SignalSlotUtility::emitSignal(
            __CLASS__,
            \AOE\Crawler\Utility\SignalSlotUtility::SIGNNAL_QUEUEITEM_PREPROCESS,
            array($queueId, &$queueRec)
        );

        // Set exec_time to lock record:
        $field_array = array('exec_time' => $this->getCurrentTime());

        if (isset($this->processID)) {
            // if multiprocessing is used we need to store the id of the process which has handled this entry
            $field_array['process_id_completed'] = $this->processID;
        }
        $this->db->exec_UPDATEquery('tx_crawler_queue', 'qid=' . intval($queueId), $field_array);

        $result = $this->readUrl_exec($queueRec);

        // readUrl_exec() may return the string 'ERROR' or FALSE instead of an array,
        // in which case there is no content to decode.
        $resultData = is_array($result) && isset($result['content']) ? unserialize($result['content']) : FALSE;

        $pollSuccess = isset($GLOBALS['TYPO3_CONF_VARS']['EXTCONF']['crawler']['pollSuccess'])
            ? $GLOBALS['TYPO3_CONF_VARS']['EXTCONF']['crawler']['pollSuccess']
            : NULL;
        $procInstructions = (is_array($resultData) && isset($resultData['parameters']['procInstructions']))
            ? $resultData['parameters']['procInstructions']
            : NULL;

        // atm there's no need to point to specific pollable extensions
        if (is_array($pollSuccess) && is_array($procInstructions)) {
            foreach ($pollSuccess as $pollable) {
                // only check the success value if the instruction is running
                // it is important to name the pollSuccess key same as the procInstructions key
                if (in_array($pollable, $procInstructions) && !empty($resultData['success'][$pollable])) {
                    $ret |= self::CLI_STATUS_POLLABLE_PROCESSED;
                }
            }
        }

        // Set result in log which also denotes the end of the processing of this entry.
        $field_array = array('result_data' => serialize($result));

        \AOE\Crawler\Utility\SignalSlotUtility::emitSignal(
            __CLASS__,
            \AOE\Crawler\Utility\SignalSlotUtility::SIGNNAL_QUEUEITEM_POSTPROCESS,
            array($queueId, &$field_array)
        );

        $this->db->exec_UPDATEquery('tx_crawler_queue', 'qid=' . intval($queueId), $field_array);

        if ($this->debugMode) {
            \TYPO3\CMS\Core\Utility\GeneralUtility::devlog('crawler-readurl stop ' . microtime(true), __FUNCTION__);
        }

        return $ret;
    }
1236
1237
    /**
     * Read URL for not-yet-inserted log-entry
     *
     * Inserts the given record into tx_crawler_queue with exec_time set to now, executes it via
     * readUrl_exec() and writes the serialized result back to the new row.
     *
     * @param array $field_array Queue field array; must contain the serialized "parameters"
     * @return array|string|boolean Whatever readUrl_exec() returned for the record
     */
    public function readUrlFromArray($field_array)    {

            // Set exec_time to lock record:
        $field_array['exec_time'] = $this->getCurrentTime();
        $this->db->exec_INSERTquery('tx_crawler_queue', $field_array);
        $queueId = $field_array['qid'] = $this->db->sql_insert_id();

        $result = $this->readUrl_exec($field_array);

            // Set result in log which also denotes the end of the processing of this entry.
        $field_array = array('result_data' => serialize($result));
        $this->db->exec_UPDATEquery('tx_crawler_queue','qid='.intval($queueId), $field_array);

        return $result;
    }
1258
1259
    /**
     * Read URL for a queue record
     *
     * If the decoded parameters name a "_CALLBACKOBJ", that object's crawler_execute() is called;
     * otherwise the "url" parameter is fetched with requestUrl().
     *
     * @param array $queueRec Queue record ("parameters", "qid" and "set_id" are read)
     * @return array|string|boolean The string "ERROR" if the parameters could not be decoded,
     *                              an array with a "content" key for callbacks, otherwise the
     *                              return value of requestUrl()
     */
    public function readUrl_exec($queueRec)    {
        $result = 'ERROR';

            // Decode parameters:
        $parameters = unserialize($queueRec['parameters']);
        if (!is_array($parameters)) {
            return $result;
        }

        if (!empty($parameters['_CALLBACKOBJ']))    {    // Calling object:
            $objRef = $parameters['_CALLBACKOBJ'];
                // No reference assignment: the function result is not a variable, and objects are handles anyway
            $callBackObj = \TYPO3\CMS\Core\Utility\GeneralUtility::getUserObj($objRef);
            if (is_object($callBackObj))    {
                unset($parameters['_CALLBACKOBJ']);
                $result = array('content' => serialize($callBackObj->crawler_execute($parameters,$this)));
            } else {
                $result = array('content' => 'No object: '.$objRef);
            }
        } else {    // Regular FE request:

                // Prepare: the crawler id is "<qid>:<hash>"; fe_init() recomputes the hash to verify the request
            $crawlerId = $queueRec['qid'].':'.md5($queueRec['qid'].'|'.$queueRec['set_id'].'|'.$GLOBALS['TYPO3_CONF_VARS']['SYS']['encryptionKey']);

                // Get result:
            $result = $this->requestUrl($parameters['url'],$crawlerId);

            tx_crawler_domain_events_dispatcher::getInstance()->post('urlCrawled',$queueRec['set_id'],array('url' => $parameters['url'], 'result' => $result));
        }

        return $result;
    }
1294
1295
    /**
     * Gets the content of a URL.
     *
     * Depending on the "makeDirectRequests" setting the request is either handled by
     * sendDirectRequest() or sent as a plain HTTP/1.0 request over a socket (optionally through
     * the proxy configured in curlProxyServer). With the "follow30x" setting, redirects are
     * followed and the redirecting response is kept under the "parentRequest" key.
     *
     * @param  string   $originalUrl    URL to read
     * @param  string   $crawlerId      Crawler ID string (qid + hash to verify)
     * @param  integer  $timeout        Timeout time
     * @param  integer  $recursion      Recursion limiter for 302 redirects
     * @return array|boolean            Array with "request", "headers" and "content" (socket request),
     *                                  the result of sendDirectRequest(), or FALSE on failure
     */
    public function requestUrl($originalUrl, $crawlerId, $timeout=2, $recursion=10) {

        if (!$recursion) return false;

            // Parse URL, checking for scheme:
        $url = parse_url($originalUrl);

        if ($url === FALSE) {
            if (TYPO3_DLOG) \TYPO3\CMS\Core\Utility\GeneralUtility::devLog(sprintf('Could not parse_url() for string "%s"', $originalUrl), 'crawler', 4, array('crawlerId' => $crawlerId));
            return FALSE;
        }

        $scheme = isset($url['scheme']) ? $url['scheme'] : '';
        if (!in_array($scheme, array('','http','https'), TRUE)) {
            if (TYPO3_DLOG) \TYPO3\CMS\Core\Utility\GeneralUtility::devLog(sprintf('Scheme does not match for url "%s"', $originalUrl), 'crawler', 4, array('crawlerId' => $crawlerId));
            return FALSE;
        }

            // direct request
        if (!empty($this->extensionSettings['makeDirectRequests'])) {
            return $this->sendDirectRequest($originalUrl, $crawlerId);
        }

        $reqHeaders = $this->buildRequestHeaderArray($url, $crawlerId);

            // thanks to Pierrick Caillon for adding proxy support
        $rurl = $url;

        if ($GLOBALS['TYPO3_CONF_VARS']['SYS']['curlUse'] && $GLOBALS['TYPO3_CONF_VARS']['SYS']['curlProxyServer']) {
            $rurl = parse_url($GLOBALS['TYPO3_CONF_VARS']['SYS']['curlProxyServer']);
                // A proxy expects the absolute URL in the request line
            $url['path'] = $scheme . '://' . $url['host'] . (!empty($url['port']) ? ':' . $url['port'] : '') . (isset($url['path']) ? $url['path'] : '');
            $reqHeaders = $this->buildRequestHeaderArray($url, $crawlerId);
        }

        $host = $rurl['host'];

        if ($scheme === 'https') {
            $host = 'ssl://' . $host;
            $port = !empty($rurl['port']) ? $rurl['port'] : 443;
        } else {
            $port = !empty($rurl['port']) ? $rurl['port'] : 80;
        }

        $startTime = microtime(true);
        $fp = fsockopen($host, $port, $errno, $errstr, $timeout);

        if (!$fp) {
            if (TYPO3_DLOG) \TYPO3\CMS\Core\Utility\GeneralUtility::devLog(sprintf('Error while opening "%s"', $originalUrl), 'crawler', 4, array('crawlerId' => $crawlerId));
            return FALSE;
        }

            // Request message:
        $msg = implode("\r\n",$reqHeaders)."\r\n\r\n";
        fputs ($fp, $msg);

            // Read response:
        $d = $this->getHttpResponseFromStream($fp);
        fclose ($fp);

        $time = microtime(true) - $startTime;
        $this->log($originalUrl .' '.$time);

            // Implode content and headers:
        $result = array(
            'request' => $msg,
            'headers' => implode('', $d['headers']),
            'content' => implode('', (array)$d['content'])
        );

        if (!empty($this->extensionSettings['follow30x'])) {
            $newUrl = $this->getRequestUrlFrom302Header(
                $d['headers'],
                isset($url['user']) ? $url['user'] : '',
                isset($url['pass']) ? $url['pass'] : ''
            );

            if ($newUrl) {
                    // Follow the redirect exactly once, keeping the timeout and lowering the recursion limit by one
                $newRequestUrl = $this->requestUrl($newUrl, $crawlerId, $timeout, $recursion - 1);

                if (is_array($newRequestUrl)) {
                    $result = array_merge(array('parentRequest'=>$result), $newRequestUrl);
                } else {
                    if (TYPO3_DLOG) \TYPO3\CMS\Core\Utility\GeneralUtility::devLog(sprintf('Error while opening "%s"', $newUrl), 'crawler', 4, array('crawlerId' => $crawlerId));
                    return FALSE;
                }
            }
        }

        return $result;
    }
1387
1388
    /**
     * Determines the base path of the website frontend, e.g. "/cms/" when the site is
     * reachable at http://mydomain.com/cms/index.php.
     *
     * Sources, in order of precedence: the "frontendBasePath" extension setting,
     * $GLOBALS['TSFE']->absRefPrefix, and (outside of CLI mode) the TYPO3_SITE_PATH
     * environment value. Without any of these the path is "/".
     *
     * @return string Base path of the website frontend, with leading and trailing slash
     */
    protected function getFrontendBasePath() {
        if (!empty($this->extensionSettings['frontendBasePath'])) {
            // Explicitly configured in the extension settings
            $basePath = $this->extensionSettings['frontendBasePath'];
        } elseif (!empty($GLOBALS['TSFE']->absRefPrefix)) {
            // Fall back to config.absRefPrefix
            $basePath = $GLOBALS['TSFE']->absRefPrefix;
        } elseif (!defined('TYPO3_cliMode') || !TYPO3_cliMode) {
            // Web request: the $_SERVER environment knows the site path
            $basePath = \TYPO3\CMS\Core\Utility\GeneralUtility::getIndpEnv('TYPO3_SITE_PATH');
        } else {
            $basePath = '/';
        }

        if ($basePath == '/') {
            return $basePath;
        }

        // Normalise to '/<pathSegments>/'
        return rtrim('/' . ltrim($basePath, '/'), '/') . '/';
    }
1417
1418
    /**
     * Runs a command through the shell and hands back whatever it printed.
     *
     * @param string $command Shell command to be executed
     * @return string Outputted result of the command execution (as returned by shell_exec())
     */
    protected function executeShellCommand($command) {
        return shell_exec($command);
    }
1428
1429
    /**
     * Reads HTTP response from the given stream.
     *
     * Everything up to the first empty line is treated as headers (trimmed, one entry per line);
     * everything after it is content. Lines are read in chunks of at most 2047 bytes.
     *
     * @param  resource $streamPointer  Pointer to connection stream.
     * @return array                    Associative array with the following items:
     *                                  headers <array> Response headers sent by server.
     *                                  content <array> Content, with each line as an array item.
     */
    protected function getHttpResponseFromStream($streamPointer) {
        $response = array('headers' => array(), 'content' => array());

        if (!is_resource($streamPointer)) {
            return $response;
        }

            // read headers; compare against FALSE so that a line like "0" does not end the loop
        while (($line = fgets($streamPointer, 2048)) !== FALSE) {
            $line = trim($line);
            if ($line === '') {
                    // blank line separates headers from content
                break;
            }
            $response['headers'][] = $line;
        }

            // read content
        while (($line = fgets($streamPointer, 2048)) !== FALSE) {
            $response['content'][] = $line;
        }

        return $response;
    }
1459
1460
    /**
     * Appends a line to the log file named by the "logFileName" extension setting.
     * Does nothing when no log file is configured; write errors are suppressed.
     *
     * @param string $message Text to log; it is prefixed with the current date and time
     */
    protected function log($message) {
        if (empty($this->extensionSettings['logFileName'])) {
            return;
        }

        $logFile = $this->extensionSettings['logFileName'];
        $logLine = date('Ymd His') . $message . "\n";

        // Best effort only: a failing write must not interrupt crawling
        @file_put_contents($logFile, $logLine, FILE_APPEND);
    }
1468
1469
    /**
     * Builds HTTP request headers.
     *
     * The first entry is the HTTP/1.0 request line. A backend-user cookie is added for workspace
     * preview URLs (query contains "ADMCMD_previewWS") and a Basic Authorization header is added
     * if the URL carries a user name.
     *
     * @param array $url URL parts as returned by parse_url(); "host" is required, all others optional
     * @param string $crawlerId Value sent in the X-T3crawler header
     *
     * @return array Request line and header lines, without line endings
     */
    protected function buildRequestHeaderArray(array $url, $crawlerId) {
            // An empty path is not a valid request target, so fall back to the root path
        $path = (isset($url['path']) && $url['path'] !== '') ? $url['path'] : '/';
        $query = !empty($url['query']) ? $url['query'] : '';
        $user = isset($url['user']) ? $url['user'] : '';
        $pass = isset($url['pass']) ? $url['pass'] : '';

        $reqHeaders = array();
        $reqHeaders[] = 'GET '.$path.($query !== '' ? '?'.$query : '').' HTTP/1.0';
        $reqHeaders[] = 'Host: '.$url['host'];
        if (stripos($query, 'ADMCMD_previewWS') !== FALSE) {
            $reqHeaders[] = 'Cookie: $Version="1"; be_typo_user="1"; $Path=/';
        }
        $reqHeaders[] = 'Connection: close';
        if ($user != '') {
            $reqHeaders[] = 'Authorization: Basic '. base64_encode($user.':'.$pass);
        }
        $reqHeaders[] = 'X-T3crawler: '.$crawlerId;
        $reqHeaders[] = 'User-Agent: TYPO3 crawler';
        return $reqHeaders;
    }
1492
1493
    /**
     * Check if the submitted HTTP-Header contains a redirect location and built new crawler-url
     *
     * Only status lines containing "301 Moved", "302 Found" or "302 Moved" are treated as redirects.
     * If a user name is given, the credentials are inserted into the redirect URL.
     *
     * @param    array        $headers    HTTP header lines, status line first
     * @param    string        $user        HTTP Auth. User
     * @param    string        $pass        HTTP Auth. Password
     * @return    string|boolean        URL from redirection, FALSE if the headers do not describe a redirect
     */
    protected function getRequestUrlFrom302Header($headers,$user='',$pass='') {
        if(!is_array($headers) || !isset($headers[0])) return false;
        if(!(stristr($headers[0],'301 Moved') || stristr($headers[0],'302 Found') || stristr($headers[0],'302 Moved'))) return false;

        $location = NULL;
        foreach($headers as $hl) {
                // Split on the first separator only, the value itself may contain ": "
            $tmp = explode(': ', $hl, 2);
                // Header field names are case-insensitive
            if(count($tmp) === 2 && strcasecmp(trim($tmp[0]), 'Location') === 0) {
                $location = trim($tmp[1]);
                break;
            }
        }
        if($location === NULL) return false;

        if($user!='') {
            if(!($tmp = parse_url($location))) return false;
            $newUrl = (isset($tmp['scheme']) ? $tmp['scheme'] : '') . '://' . $user . ':' . $pass . '@'
                . (isset($tmp['host']) ? $tmp['host'] : '')
                    // keep a non-default port of the redirect target
                . (!empty($tmp['port']) ? ':' . $tmp['port'] : '')
                . (isset($tmp['path']) ? $tmp['path'] : '');
            if(isset($tmp['query']) && $tmp['query']!='') $newUrl .= '?' . $tmp['query'];
        } else {
            $newUrl = $location;
        }
        return $newUrl;
    }
1521
1522
1523
1524
1525
1526
1527
1528
1529
    /**************************
1530
     *
1531
     * tslib_fe hooks:
1532
     *
1533
     **************************/
1534
1535
    /**
     * Initialization hook (called after database connection)
     * Takes the "HTTP_X_T3CRAWLER" header and looks up queue record and verifies if the session comes from the system (by comparing hashes)
     *
     * The header has the form "<qid>:<hash>". If it is present but does not match a queue record,
     * the request is terminated.
     *
     * @param    array        $params    Parameters from frontend; "pObj" receives the crawler application data
     * @param    object        $ref    TSFE object (reference under PHP5), not used
     * @return    void
     */
    public function fe_init(&$params, $ref)    {

            // Nothing to do for requests that do not come from the crawler:
        if (!isset($_SERVER['HTTP_X_T3CRAWLER']))    {
            return;
        }

            // Authenticate crawler request (pad so that a header without ":" yields an empty hash):
        list($queueId,$hash) = array_pad(explode(':', $_SERVER['HTTP_X_T3CRAWLER']), 2, '');
        $queueRows = $this->db->exec_SELECTgetRows('*','tx_crawler_queue','qid='.intval($queueId));
        $queueRec = is_array($queueRows) ? reset($queueRows) : FALSE;

            // If a crawler record was found and hash was matching, set it up:
        if (is_array($queueRec) && $hash === md5($queueRec['qid'].'|'.$queueRec['set_id'].'|'.$GLOBALS['TYPO3_CONF_VARS']['SYS']['encryptionKey']))    {
            $params['pObj']->applicationData['tx_crawler']['running'] = TRUE;
            $params['pObj']->applicationData['tx_crawler']['parameters'] = unserialize($queueRec['parameters']);
            $params['pObj']->applicationData['tx_crawler']['log'] = array();
        } else {
            die('No crawler entry found!');
        }
    }
1560
1561
1562
1563
    /*****************************
1564
     *
1565
     * Compiling URLs to crawl - tools
1566
     *
1567
     *****************************/
1568
1569
    /**
1570
     * @param    integer        Root page id to start from.
1571
     * @param    integer        Depth of tree, 0=only id-page, 1= on sublevel, 99 = infinite
1572
     * @param    integer        Unix Time when the URL is timed to be visited when put in queue
1573
     * @param    integer        Number of requests per minute (creates the interleave between requests)
1574
     * @param    boolean        If set, submits the URLs to queue in database (real crawling)
1575
     * @param    boolean        If set (and submitcrawlUrls is false) will fill $downloadUrls with entries)
1576
     * @param    array        Array of processing instructions
1577
     * @param    array        Array of configuration keys
1578
     * @return    string        HTML code
1579
     */
1580
    function getPageTreeAndUrls(
0 ignored issues
show
Best Practice introduced by
It is generally recommended to explicitly declare the visibility for methods.

Adding explicit visibility (private, protected, or public) is generally recommend to communicate to other developers how, and from where this method is intended to be used.

Loading history...
1581
        $id,
1582
        $depth,
1583
        $scheduledTime,
1584
        $reqMinute,
1585
        $submitCrawlUrls,
1586
        $downloadCrawlUrls,
1587
        array $incomingProcInstructions,
1588
        array $configurationSelection
1589
    ) {
1590
1591
        global $BACK_PATH;
1592
        global $LANG;
1593
        if (!is_object($LANG)) {
1594
            $LANG = \TYPO3\CMS\Core\Utility\GeneralUtility::makeInstance('language');
1595
            $LANG->init(0);
1596
        }
1597
        $this->scheduledTime = $scheduledTime;
0 ignored issues
show
Bug introduced by
The property scheduledTime does not exist. Did you maybe forget to declare it?

In PHP it is possible to write to properties without declaring them. For example, the following is perfectly valid PHP code:

class MyClass { }

$x = new MyClass();
$x->foo = true;

Generally, it is a good practice to explicitly declare properties to avoid accidental typos and provide IDE auto-completion:

class MyClass {
    public $foo;
}

$x = new MyClass();
$x->foo = true;
Loading history...
1598
        $this->reqMinute = $reqMinute;
0 ignored issues
show
Bug introduced by
The property reqMinute does not exist. Did you maybe forget to declare it?

In PHP it is possible to write to properties without declaring them. For example, the following is perfectly valid PHP code:

class MyClass { }

$x = new MyClass();
$x->foo = true;

Generally, it is a good practice to explictly declare properties to avoid accidental typos and provide IDE auto-completion:

class MyClass {
    public $foo;
}

$x = new MyClass();
$x->foo = true;
Loading history...
1599
        $this->submitCrawlUrls = $submitCrawlUrls;
0 ignored issues
show
Bug introduced by
The property submitCrawlUrls does not exist. Did you maybe forget to declare it?

In PHP it is possible to write to properties without declaring them. For example, the following is perfectly valid PHP code:

class MyClass { }

$x = new MyClass();
$x->foo = true;

Generally, it is a good practice to explictly declare properties to avoid accidental typos and provide IDE auto-completion:

class MyClass {
    public $foo;
}

$x = new MyClass();
$x->foo = true;
Loading history...
1600
        $this->downloadCrawlUrls = $downloadCrawlUrls;
0 ignored issues
show
Bug introduced by
The property downloadCrawlUrls does not exist. Did you maybe forget to declare it?

In PHP it is possible to write to properties without declaring them. For example, the following is perfectly valid PHP code:

class MyClass { }

$x = new MyClass();
$x->foo = true;

Generally, it is a good practice to explictly declare properties to avoid accidental typos and provide IDE auto-completion:

class MyClass {
    public $foo;
}

$x = new MyClass();
$x->foo = true;
Loading history...
1601
        $this->incomingProcInstructions = $incomingProcInstructions;
1602
        $this->incomingConfigurationSelection = $configurationSelection;
1603
1604
        $this->duplicateTrack = array();
1605
        $this->downloadUrls = array();
1606
1607
            // Drawing tree:
1608
            /* @var $tree \TYPO3\CMS\Backend\Tree\View\PageTreeView */
1609
        $tree = \TYPO3\CMS\Core\Utility\GeneralUtility::makeInstance('TYPO3\CMS\Backend\Tree\View\PageTreeView');
1610
        $perms_clause = $GLOBALS['BE_USER']->getPagePermsClause(1);
1611
        $tree->init('AND ' . $perms_clause);
1612
1613
        $pageinfo = \TYPO3\CMS\Backend\Utility\BackendUtility::readPageAccess($id, $perms_clause);
1614
1615
            // Set root row:
1616
        $tree->tree[] = Array(
1617
            'row' => $pageinfo,
1618
            'HTML' => \TYPO3\CMS\Backend\Utility\IconUtility::getSpriteIconForRecord('pages', $pageinfo)
1619
        );
1620
1621
            // Get branch beneath:
1622
        if ($depth)    {
1623
            $tree->getTree($id, $depth, '');
1624
        }
1625
1626
            // Traverse page tree:
1627
        $code = '';
1628
1629
        foreach ($tree->tree as $data) {
1630
1631
            $this->MP = false;
1632
1633
                // recognize mount points
1634
            if($data['row']['doktype'] == 7){
1635
                $mountpage = $this->db->exec_SELECTgetRows('*', 'pages', 'uid = '.$data['row']['uid']);
1636
1637
                    // fetch mounted pages
1638
                $this->MP = $mountpage[0]['mount_pid'].'-'.$data['row']['uid'];
0 ignored issues
show
Documentation Bug introduced by
The property $MP was declared of type boolean, but $mountpage[0]['mount_pid...' . $data['row']['uid'] is of type string. Maybe add a type cast?

This check looks for assignments to scalar types that may be of the wrong type.

To ensure the code behaves as expected, it may be a good idea to add an explicit type cast.

$answer = 42;

$correct = false;

$correct = (bool) $answer;
Loading history...
1639
1640
                $mountTree = \TYPO3\CMS\Core\Utility\GeneralUtility::makeInstance('TYPO3\CMS\Backend\Tree\View\PageTreeView');
1641
                $mountTree->init('AND '.$perms_clause);
1642
                $mountTree->getTree($mountpage[0]['mount_pid'], $depth, '');
1643
1644
                foreach($mountTree->tree as $mountData)    {
1645
                    $code .= $this->drawURLs_addRowsForPage(
1646
                        $mountData['row'],
1647
                        $mountData['HTML'].\TYPO3\CMS\Backend\Utility\BackendUtility::getRecordTitle('pages',$mountData['row'],TRUE)
1648
                    );
1649
                }
1650
1651
                    // replace page when mount_pid_ol is enabled
1652
                if($mountpage[0]['mount_pid_ol']){
1653
                    $data['row']['uid'] = $mountpage[0]['mount_pid'];
1654
                } else {
1655
                        // if the mount_pid_ol is not set the MP must not be used for the mountpoint page
1656
                    $this->MP = false;
1657
                }
1658
            }
1659
1660
            $code .= $this->drawURLs_addRowsForPage(
1661
                $data['row'],
0 ignored issues
show
Security Bug introduced by
It seems like $data['row'] can also be of type false; however, tx_crawler_lib::drawURLs_addRowsForPage() does only seem to accept array, did you maybe forget to handle an error condition?
Loading history...
1662
                $data['HTML'] . \TYPO3\CMS\Backend\Utility\BackendUtility::getRecordTitle('pages', $data['row'], TRUE)
0 ignored issues
show
Security Bug introduced by
It seems like $data['row'] can also be of type false; however, TYPO3\CMS\Backend\Utilit...ility::getRecordTitle() does only seem to accept array, did you maybe forget to handle an error condition?
Loading history...
1663
            );
1664
        }
1665
1666
        return $code;
1667
    }
1668
1669
    /**
     * Expands exclude string.
     *
     * The exclude string is a comma separated list of parts of the form "pid" or "pid+depth".
     * A part without "+" excludes only the page itself; a part with "+" but without a
     * (non-empty) depth is expanded with depth 99. Results are cached per exclude string
     * for the lifetime of the request.
     *
     * @param  string $excludeString    Exclude string
     * @return array                    Array of page ids.
     */
    public function expandExcludeString($excludeString) {
            // internal static caches;
        static $expandedExcludeStringCache = array();
        static $treeCache = array();

        $cacheKey = (string)$excludeString;

            // isset() instead of empty(): an empty result is a valid cache entry and must not be recomputed
        if (!isset($expandedExcludeStringCache[$cacheKey])) {
            $pidList = array();

            if (!empty($excludeString)) {
                /* @var $tree \TYPO3\CMS\Backend\Tree\View\PageTreeView */
                $tree = \TYPO3\CMS\Core\Utility\GeneralUtility::makeInstance('TYPO3\CMS\Backend\Tree\View\PageTreeView');
                $tree->init('AND ' . $this->backendUser->getPagePermsClause(1));

                $excludeParts = \TYPO3\CMS\Core\Utility\GeneralUtility::trimExplode(',', $excludeString);

                foreach ($excludeParts as $excludePart) {
                    $segments = \TYPO3\CMS\Core\Utility\GeneralUtility::trimExplode('+', $excludePart);
                    $pid = $segments[0];
                        // the depth segment is optional; do not read an undefined offset
                    $depth = isset($segments[1]) ? $segments[1] : '';

                        // default is "page only" = "depth=0"
                    if (empty($depth)) {
                        $depth = (strpos($excludePart, '+') !== FALSE) ? 99 : 0;
                    }
                    $depth = (int)$depth;

                    $pidList[] = $pid;

                    if ($depth > 0) {
                        if (!isset($treeCache[$pid][$depth])) {
                            $tree->reset();
                            $tree->getTree($pid, $depth);
                            $treeCache[$pid][$depth] = $tree->tree;
                        }

                        foreach ($treeCache[$pid][$depth] as $data) {
                            $pidList[] = $data['row']['uid'];
                        }
                    }
                }
            }

            $expandedExcludeStringCache[$cacheKey] = array_unique($pidList);
        }

        return $expandedExcludeStringCache[$cacheKey];
    }
1719
1720
    /**
     * Create the rows for display of the page tree
     * For each page a number of rows are shown displaying GET variable configuration
     *
     * Configurations not contained in $this->incomingConfigurationSelection (when that
     * selection is non-empty) are dropped. For every remaining configuration one table row
     * is rendered; pages listed in the configuration's "exclude" setting get a "No entries" row.
     *
     * @param    array        Page row
     * @param    string        Page icon and title for row (HTML, inserted unescaped)
     * @return    string        HTML <tr> content (one or more)
     */
    public function drawURLs_addRowsForPage(array $pageRow, $pageTitleAndIcon)    {

        $skipMessage = '';

            // Get list of configurations
        $configurations = $this->getUrlsForPageRow($pageRow, $skipMessage);
        if (!is_array($configurations)) {
            $configurations = array();
        }

        if (count($this->incomingConfigurationSelection) > 0) {
                // remove configuration that does not match the current selection
            foreach (array_keys($configurations) as $confKey) {
                if (!in_array($confKey, $this->incomingConfigurationSelection)) {
                    unset($configurations[$confKey]);
                }
            }
        }

        $content = '';

        if (count($configurations) === 0) {
            $message = !empty($skipMessage) ? ' ('.$skipMessage.')' : '';

                // Compile row:
            $content.= '
                <tr class="bgColor-20" style="border-bottom: 1px solid black;">
                    <td>'.$pageTitleAndIcon.'</td>
                    <td colspan="6"><em>No entries</em>'.$message.'</td>
                </tr>';

            return $content;
        }

            // Traverse parameter combinations:
        $c = 0;
        foreach ($configurations as $confKey => $confArray) {
            $subCfg = (isset($confArray['subCfg']) && is_array($confArray['subCfg'])) ? $confArray['subCfg'] : array();

                // Title column: only the first row carries the page title, spanning all configuration rows
            if (!$c) {
                $titleClm = '<td rowspan="'.count($configurations).'">'.$pageTitleAndIcon.'</td>';
            } else {
                $titleClm = '';
            }

            $excludedPages = $this->expandExcludeString(isset($subCfg['exclude']) ? $subCfg['exclude'] : '');

            if (!in_array($pageRow['uid'], $excludedPages)) {

                    // URL list:
                $urlList = $this->urlListFromUrlArray(
                    $confArray,
                    $pageRow,
                    $this->scheduledTime,
                    $this->reqMinute,
                    $this->submitCrawlUrls,
                    $this->downloadCrawlUrls,
                    $this->duplicateTrack,
                    $this->downloadUrls,
                    $this->incomingProcInstructions // if empty the urls won't be filtered by processing instructions
                );

                    // Expanded parameters:
                $paramExpanded = '';
                $calcAccu = array();
                $calcRes = 1;
                $expandedParams = (isset($confArray['paramExpanded']) && is_array($confArray['paramExpanded'])) ? $confArray['paramExpanded'] : array();
                foreach ($expandedParams as $gVar => $gVal) {
                    $paramExpanded.= '
                            <tr>
                                <td class="bgColor4-20">'.htmlspecialchars('&'.$gVar.'=').'<br/>'.
                                                '('.count($gVal).')'.
                                                '</td>
                                <td class="bgColor4" nowrap="nowrap">'.nl2br(htmlspecialchars(implode(chr(10),$gVal))).'</td>
                            </tr>
                        ';
                    $calcRes*= count($gVal);
                    $calcAccu[] = count($gVal);
                }
                $paramExpanded = '<table class="lrPadding c-list param-expanded">'.$paramExpanded.'</table>';
                $paramExpanded.= 'Comb: '.implode('*',$calcAccu).'='.$calcRes;

                    // Options: configuration values are escaped because they are printed into HTML
                $optionValues = '';
                if (!empty($subCfg['userGroups'])) {
                    $optionValues.='User Groups: '.htmlspecialchars($subCfg['userGroups']).'<br/>';
                }
                if (!empty($subCfg['baseUrl'])) {
                    $optionValues.='Base Url: '.htmlspecialchars($subCfg['baseUrl']).'<br/>';
                }
                if (!empty($subCfg['procInstrFilter'])) {
                    $optionValues.='ProcInstr: '.htmlspecialchars($subCfg['procInstrFilter']).'<br/>';
                }

                $procInstrParams = isset($subCfg['procInstrParams.']) ? $subCfg['procInstrParams.'] : NULL;

                    // Compile row:
                $content .= '
                        <tr class="bgColor' . ($c%2 ? '-20':'-10') . '">
                            ' . $titleClm . '
                            <td>' . htmlspecialchars($confKey) . '</td>
                            <td>' . nl2br(htmlspecialchars(rawurldecode(trim(str_replace('&', chr(10) . '&', \TYPO3\CMS\Core\Utility\GeneralUtility::implodeArrayForUrl('', $confArray['paramParsed'])))))) . '</td>
                            <td>'.$paramExpanded.'</td>
                            <td nowrap="nowrap">' . $urlList . '</td>
                            <td nowrap="nowrap">' . $optionValues . '</td>
                            <td nowrap="nowrap">' . \TYPO3\CMS\Core\Utility\DebugUtility::viewArray($procInstrParams) . '</td>
                        </tr>';
            } else {

                $content .= '<tr class="bgColor'.($c%2 ? '-20':'-10') . '">
                            '.$titleClm.'
                            <td>'.htmlspecialchars($confKey).'</td>
                            <td colspan="5"><em>No entries</em> (Page is excluded in this configuration)</td>
                        </tr>';

            }

            $c++;
        }

        return $content;
    }
1842
1843
    /**
     * Counts the entries in tx_crawler_queue that are due but not processed yet.
     *
     * An entry is counted when exec_time is 0, process_scheduled is 0 and its scheduled
     * time is not later than the value of getCurrentTime().
     *
     * @return int Number of matching queue entries; 0 if the query yields no result
     */
    public function getUnprocessedItemsCount() {
        $res = $this->db->exec_SELECTquery(
                    'count(*) as num',
                    'tx_crawler_queue',
                    'exec_time=0
                    AND process_scheduled= 0
                    AND scheduled<='.$this->getCurrentTime()
        );

            // A failed query yields no result to fetch from
        if (!$res) {
            return 0;
        }

        $count = $this->db->sql_fetch_assoc($res);

            // The database returns the count as a string; honour the documented int return type
        return is_array($count) ? (int)$count['num'] : 0;
    }
1859
1860
1861
1862
1863
1864
1865
1866
1867
    /*****************************
1868
     *
1869
     * CLI functions
1870
     *
1871
     *****************************/
1872
1873
    /**
     * Main function for running from Command Line PHP script (cron job)
     * See ext/crawler/cli/crawler_cli.phpsh for details
     *
     * Prints the help text and exits when -h or --help is given. Otherwise, if the crawler is
     * not disabled and a new process could be acquired, runs CLI_run() with the limits taken
     * from the command line (--countInARun, --sleepAfterFinish, --sleepTime) or, as fallback,
     * from the extension settings.
     *
     * @return    int    Status value: the result of CLI_run() (or CLI_STATUS_ABORTED if it threw),
     *                   OR-ed with CLI_STATUS_REMAIN when unprocessed items are left;
     *                   CLI_STATUS_ABORTED is OR-ed in when no process could be started.
     */
    public function CLI_main() {
        $this->setAccessMode('cli');
        $result = self::CLI_STATUS_NOTHING_PROCCESSED;
        $cliObj = \TYPO3\CMS\Core\Utility\GeneralUtility::makeInstance('tx_crawler_cli');

        if (isset($cliObj->cli_args['-h']) || isset($cliObj->cli_args['--help'])) {
            $cliObj->cli_validateArgs();
            $cliObj->cli_help();
            exit;
        }

        if (!$this->getDisabled() && $this->CLI_checkAndAcquireNewProcess($this->CLI_buildProcessId())) {
            $countInARun = $cliObj->cli_argValue('--countInARun') ? intval($cliObj->cli_argValue('--countInARun')) : $this->extensionSettings['countInARun'];
                // Seconds
            $sleepAfterFinish = $cliObj->cli_argValue('--sleepAfterFinish') ? intval($cliObj->cli_argValue('--sleepAfterFinish')) : $this->extensionSettings['sleepAfterFinish'];
                // Milliseconds
            $sleepTime = $cliObj->cli_argValue('--sleepTime') ? intval($cliObj->cli_argValue('--sleepTime')) : $this->extensionSettings['sleepTime'];

            try {
                    // Run process:
                $result = $this->CLI_run($countInARun, $sleepTime, $sleepAfterFinish);
            } catch (Exception $e) {
                    // Keep going so that cleanup and process release below still happen,
                    // but do not lose the reason for the abort.
                $this->CLI_debug('Crawler run aborted: ' . $e->getMessage());
                $result = self::CLI_STATUS_ABORTED;
            }

                // Cleanup
            $this->db->exec_DELETEquery('tx_crawler_process', 'assigned_items_count = 0');

                //TODO can't we do that in a clean way?
            $this->CLI_releaseProcesses($this->CLI_buildProcessId());

                // Query the remaining items once and use the same number for log output and status
            $remainingItems = $this->getUnprocessedItemsCount();
            $this->CLI_debug("Unprocessed Items remaining:".$remainingItems." (".$this->CLI_buildProcessId().")");
            $result |= ( $remainingItems > 0 ? self::CLI_STATUS_REMAIN : self::CLI_STATUS_NOTHING_PROCCESSED );
        } else {
            $result |= self::CLI_STATUS_ABORTED;
        }

        return $result;
    }
1918
1919
    /**
     * Function executed by crawler_im.php cli script.
     *
     * Reads the page id from the default arguments and builds the URL list for that page via
     * getPageTreeAndUrls(). Evaluated options:
     *  -o     output mode: "url" prints the collected download URLs, "queue" submits the URLs,
     *         "exec" submits them internally only and requests them right away; any other value
     *         just lists the URLs found
     *  -d     tree depth, forced into the range 0-99
     *  -n     value passed as $reqMinute, forced into the range 1-1000, default 30
     *  -proc  comma separated list of processing instructions
     *
     * Prints the help text and exits when no page id argument is given.
     *
     * @return    void
     */
    public function CLI_main_im()    {
        $this->setAccessMode('cli_im');

        $cliObj = \TYPO3\CMS\Core\Utility\GeneralUtility::makeInstance('tx_crawler_cli_im');

            // Force user to admin state and set workspace to "Live":
        $this->backendUser->user['admin'] = 1;
        $this->backendUser->setWorkspace(0);

            // Print help
        if (!isset($cliObj->cli_args['_DEFAULT'][1]))    {
            $cliObj->cli_validateArgs();
            $cliObj->cli_help();
            exit;
        }

        $cliObj->cli_validateArgs();

            // The output mode is evaluated several times below; look it up once
        $outputMode = $cliObj->cli_argValue('-o');
        $submitCrawlUrls = ($outputMode === 'queue' || $outputMode === 'exec');

        if ($outputMode === 'exec')    {
            $this->registerQueueEntriesInternallyOnly = TRUE;
        }

        if (isset($cliObj->cli_args['_DEFAULT'][2])) {
            // Crawler is called over TYPO3 BE
            $pageId = \TYPO3\CMS\Core\Utility\MathUtility::forceIntegerInRange($cliObj->cli_args['_DEFAULT'][2], 0);
        } else {
            // Crawler is called over cli
            $pageId = \TYPO3\CMS\Core\Utility\MathUtility::forceIntegerInRange($cliObj->cli_args['_DEFAULT'][1], 0);
        }

        $configurationKeys = $this->getConfigurationKeys($cliObj);

            // Without an explicit selection fall back to all configurations found for the page
        if (!is_array($configurationKeys)) {
            $configurations = $this->getUrlsForPageId($pageId);
            $configurationKeys = is_array($configurations) ? array_keys($configurations) : array();
        }

        if ($submitCrawlUrls) {
            $reason = new tx_crawler_domain_reason();
            $reason->setReason(tx_crawler_domain_reason::REASON_GUI_SUBMIT);
            $reason->setDetailText('The cli script of the crawler added to the queue');
                // NOTE(review): the event is posted with the set id from before the new id is
                // generated below - confirm that this order is intended.
            tx_crawler_domain_events_dispatcher::getInstance()->post(
                'invokeQueueChange',
                $this->setID,
                array(    'reason' => $reason )
            );
        }

        if ($this->extensionSettings['cleanUpOldQueueEntries']) {
            $this->cleanUpOldQueueEntries();
        }

        $this->setID = \TYPO3\CMS\Core\Utility\GeneralUtility::md5int(microtime());
        $this->getPageTreeAndUrls(
            $pageId,
            \TYPO3\CMS\Core\Utility\MathUtility::forceIntegerInRange($cliObj->cli_argValue('-d'),0,99),
            $this->getCurrentTime(),
            \TYPO3\CMS\Core\Utility\MathUtility::forceIntegerInRange($cliObj->cli_isArg('-n') ? $cliObj->cli_argValue('-n') : 30,1,1000),
            $submitCrawlUrls,
            $outputMode === 'url',
            \TYPO3\CMS\Core\Utility\GeneralUtility::trimExplode(',',$cliObj->cli_argValue('-proc'),TRUE),
            $configurationKeys
        );

            // urlList may never have been set (it is not assigned in this method);
            // fall back to an empty list so count()/implode() always get an array
        $urlList = (isset($this->urlList) && is_array($this->urlList)) ? $this->urlList : array();

        if ($outputMode === 'url') {
            $cliObj->cli_echo(implode(chr(10),$this->downloadUrls).chr(10),TRUE);
        } elseif ($outputMode === 'exec')    {
            $cliObj->cli_echo("Executing ".count($urlList)." requests right away:\n\n");
            $cliObj->cli_echo(implode(chr(10),$urlList).chr(10));
            $cliObj->cli_echo("\nProcessing:\n");

            foreach ($this->queueEntries as $queueRec)    {
                $p = unserialize($queueRec['parameters']);
                $procInstructions = (is_array($p) && isset($p['procInstructions']) && is_array($p['procInstructions'])) ? $p['procInstructions'] : array();
                $url = (is_array($p) && isset($p['url'])) ? $p['url'] : '';
                $cliObj->cli_echo($url.' ('.implode(',',$procInstructions).') => ');

                $result = $this->readUrlFromArray($queueRec);

                    // SECURITY NOTE(review): unserialize() is applied to the content returned by
                    // readUrlFromArray(); if that content can be influenced from outside, this
                    // allows object injection. Left unchanged here - needs a decision on the format.
                $requestResult = unserialize($result['content']);
                if (is_array($requestResult))    {
                    $resLog = is_array($requestResult['log']) ?  chr(10).chr(9).chr(9).implode(chr(10).chr(9).chr(9),$requestResult['log']) : '';
                    $cliObj->cli_echo('OK: '.$resLog.chr(10));
                } else {
                    $cliObj->cli_echo('Error checking Crawler Result: '.substr(preg_replace('/\s+/',' ',strip_tags($result['content'])),0,30000).'...'.chr(10));
                }
            }
        } elseif ($outputMode === 'queue')    {
            $cliObj->cli_echo("Putting ".count($urlList)." entries in queue:\n\n");
            $cliObj->cli_echo(implode(chr(10),$urlList).chr(10));
        } else {
            $cliObj->cli_echo(count($urlList)." entries found for processing. (Use -o to decide action):\n\n",TRUE);
            $cliObj->cli_echo(implode(chr(10),$urlList).chr(10),TRUE);
        }
    }
2022
2023
    /**
     * Handles the "flush" CLI command (access mode "cli_flush").
     *
     * Expects the page id as first default CLI argument and a mode in "-o"
     * ("all", "finished" or "pending"). If the page id argument is missing the
     * help text is printed and the script exits. A page id of 0 triggers a
     * full flush ($fullFlush = true).
     *
     * NOTE(review): the third argument (true) passed to getLogEntriesForPageId()
     * presumably requests deletion of the matching entries - confirm against
     * that method.
     *
     * @return bool false if the mode is unknown or getLogEntriesForPageId() returned false, true otherwise
     */
    public function CLI_main_flush() {
        $this->setAccessMode('cli_flush');
        $cliObj = \TYPO3\CMS\Core\Utility\GeneralUtility::makeInstance('tx_crawler_cli_flush');

        // Force user to admin state and set workspace to "Live":
        $this->backendUser->user['admin'] = 1;
        $this->backendUser->setWorkspace(0);

        // Print help and stop if no page id was given
        if (!isset($cliObj->cli_args['_DEFAULT'][1])) {
            $cliObj->cli_validateArgs();
            $cliObj->cli_help();
            exit;
        }

        $cliObj->cli_validateArgs();
        $pageId = \TYPO3\CMS\Core\Utility\MathUtility::forceIntegerInRange($cliObj->cli_args['_DEFAULT'][1], 0);
        // forceIntegerInRange() yields an integer, so a strict comparison is safe
        $fullFlush = ($pageId === 0);

        $mode = $cliObj->cli_argValue('-o');

        switch ($mode) {
            case 'all':
                // no status filter
                $result = $this->getLogEntriesForPageId($pageId, '', true, $fullFlush);
                break;
            case 'finished':
            case 'pending':
                // the mode doubles as the status filter
                $result = $this->getLogEntriesForPageId($pageId, $mode, true, $fullFlush);
                break;
            default:
                // unknown or missing mode: show usage and report failure
                $cliObj->cli_validateArgs();
                $cliObj->cli_help();
                $result = false;
        }

        return $result !== false;
    }
2065
2066
    /**
     * Obtains configuration keys from the CLI arguments
     *
     * Reads the "-conf" argument, which holds a comma separated list of keys.
     *
     * @param  tx_crawler_cli_im $cliObj    Command line object
     * @return array                        List of trimmed keys; an empty array (never null) if "-conf" is empty
     */
    protected function getConfigurationKeys(tx_crawler_cli_im &$cliObj) {
        $parameter = trim($cliObj->cli_argValue('-conf'));

        // trim() always returns a string, so compare strictly against ''
        if ($parameter === '') {
            return array();
        }

        return \TYPO3\CMS\Core\Utility\GeneralUtility::trimExplode(',', $parameter);
    }
2076
2077
    /**
     * Running the functionality of the CLI (crawling URLs from queue)
     *
     * Steps:
     *  1. run the registered CLI hooks,
     *  2. purge executed queue entries older than "purgeQueueDays" (if > 0),
     *  3. select up to $countInARun due, unassigned queue entries,
     *  4. reserve them for this process inside a transaction (rolled back if
     *     not all selected rows could be updated),
     *  5. call readUrl() for each reserved entry.
     *
     * @param  int $countInARun       Maximum number of queue entries selected for this run
     * @param  int $sleepTime         Pause after each entry, passed to usleep()
     * @param  int $sleepAfterFinish  Pause after the whole batch, passed to sleep()
     * @return int                    Bitmask: readUrl() results OR-ed together, plus
     *                                self::CLI_STATUS_ABORTED and/or self::CLI_STATUS_PROCESSED
     */
    public function CLI_run($countInARun, $sleepTime, $sleepAfterFinish) {
        $result = 0;
        $counter = 0;

        // First, run hooks:
        $this->CLI_runHooks();

        // Clean up the queue
        if (intval($this->extensionSettings['purgeQueueDays']) > 0) {
            $purgeDate = $this->getCurrentTime() - 24 * 60 * 60 * intval($this->extensionSettings['purgeQueueDays']);
            $this->db->exec_DELETEquery(
                'tx_crawler_queue',
                'exec_time!=0 AND exec_time<' . $purgeDate
            );
        }

        // Select entries:
        //TODO Shouldn't this reside within the transaction?
        $rows = $this->db->exec_SELECTgetRows(
            'qid,scheduled',
            'tx_crawler_queue',
            'exec_time=0
                AND process_scheduled= 0
                AND scheduled<='.$this->getCurrentTime(),
            '',
            'scheduled, qid',
            intval($countInARun)
        );

        // $rows may be null instead of an array; guard before count()/foreach
        if (is_array($rows) && count($rows) > 0) {
            $quidList = array();

            foreach ($rows as $r) {
                $quidList[] = $r['qid'];
            }

            $processId = $this->CLI_buildProcessId();

            // reserve queue entries for this process
            $this->db->sql_query('BEGIN');
            //TODO make sure we're not taking assigned queue-entries
            $this->db->exec_UPDATEquery(
                'tx_crawler_queue',
                'qid IN ('.implode(',', $quidList).')',
                array(
                    'process_scheduled' => intval($this->getCurrentTime()),
                    'process_id' => $processId
                )
            );

            // save the number of assigned queue entries to determine how many have been processed later
            $numberOfAffectedRows = $this->db->sql_affected_rows();
            $this->db->exec_UPDATEquery(
                'tx_crawler_process',
                "process_id = '".$processId."'",
                array(
                    'assigned_items_count' => intval($numberOfAffectedRows)
                )
            );

            if ($numberOfAffectedRows == count($quidList)) {
                $this->db->sql_query('COMMIT');
            } else {
                // fewer rows updated than selected: give up this run
                $this->db->sql_query('ROLLBACK');
                $this->CLI_debug("Nothing processed due to multi-process collision (".$this->CLI_buildProcessId().")");
                return ( $result | self::CLI_STATUS_ABORTED );
            }

            foreach ($rows as $r) {
                $result |= $this->readUrl($r['qid']);

                $counter++;
                usleep(intval($sleepTime));    // Just to relax the system

                // if during the start and the current read url the cli has been disabled we need to return
                // from the function and mark the process NOT as ended.
                if ($this->getDisabled()) {
                    return ( $result | self::CLI_STATUS_ABORTED );
                }

                if (!$this->CLI_checkIfProcessIsActive($this->CLI_buildProcessId())) {
                    $this->CLI_debug("conflict / timeout (".$this->CLI_buildProcessId().")");

                    //TODO might need an additional returncode
                    $result |= self::CLI_STATUS_ABORTED;
                    break;        //possible timeout
                }
            }

            sleep(intval($sleepAfterFinish));

            $msg = 'Rows: '.$counter;
            $this->CLI_debug($msg." (".$this->CLI_buildProcessId().")");

        } else {
            $this->CLI_debug("Nothing within queue which needs to be processed (".$this->CLI_buildProcessId().")");
        }

        if ($counter > 0) {
            $result |= self::CLI_STATUS_PROCESSED;
        }

        return $result;
    }
2191
2192
    /**
     * Activate hooks
     *
     * Instantiates every object reference registered in
     * $TYPO3_CONF_VARS['EXTCONF']['crawler']['cli_hooks'] and calls its
     * crawler_init() method with this instance. Does nothing if no hooks are
     * registered.
     *
     * @return    void
     */
    public function CLI_runHooks() {
        global $TYPO3_CONF_VARS;

        // isset() first so a missing configuration path does not raise a notice
        if (!isset($TYPO3_CONF_VARS['EXTCONF']['crawler']['cli_hooks'])
            || !is_array($TYPO3_CONF_VARS['EXTCONF']['crawler']['cli_hooks'])
        ) {
            return;
        }

        foreach ($TYPO3_CONF_VARS['EXTCONF']['crawler']['cli_hooks'] as $objRef) {
            // plain assignment: objects are handles, and assigning a function
            // result by reference is not valid for non-reference returns
            $hookObj = \TYPO3\CMS\Core\Utility\GeneralUtility::getUserObj($objRef);
            if (is_object($hookObj)) {
                $hookObj->crawler_init($this);
            }
        }
    }
2208
2209
    /**
     * Try to acquire a new process with the given id
     * also performs some auto-cleanup for orphan processes
     * @todo preemption might not be the most elegant way to clean up
     *
     * Counts the active, non-deleted rows of tx_crawler_process whose ttl has
     * not passed yet. If that count is below the "processLimit" setting, a new
     * process record is inserted for $id. Rows whose ttl has passed are treated
     * as orphans and handed to CLI_releaseProcesses(); finally all rows marked
     * deleted are removed. Everything between BEGIN and COMMIT runs in one
     * transaction.
     *
     * @param  string    $id  identification string for the process
     * @return boolean        true if a process record was created, false if getmypid()
     *                        gave no usable pid or the process limit is reached
     */
    public function CLI_checkAndAcquireNewProcess($id) {
        $systemProcessId = getmypid();
        if ($systemProcessId < 1) {
            // also covers getmypid() returning false
            return false;
        }

        $ret = true;
        $processCount = 0;
        $orphanProcesses = array();
        $processLimit = intval($this->extensionSettings['processLimit']);

        $this->db->sql_query('BEGIN');

        $res = $this->db->exec_SELECTquery(
            'process_id,ttl',
            'tx_crawler_process',
            'active=1 AND deleted=0'
        );

        $currentTime = $this->getCurrentTime();

        while ($row = $this->db->sql_fetch_assoc($res)) {
            if ($row['ttl'] < $currentTime) {
                // time to live exceeded: candidate for cleanup
                $orphanProcesses[] = $row['process_id'];
            } else {
                $processCount++;
            }
        }

        // if there are less than allowed active processes then add a new one
        if ($processCount < $processLimit) {
            $this->CLI_debug("add ".$this->CLI_buildProcessId()." (".($processCount+1)."/".$processLimit.")");

            // create new process record
            $this->db->exec_INSERTquery(
                'tx_crawler_process',
                array(
                    'process_id' => $id,
                    'active'=>'1',
                    'ttl' => ($currentTime + intval($this->extensionSettings['processMaxRunTime'])),
                    'system_process_id' => $systemProcessId
                )
            );

        } else {
            $this->CLI_debug("Processlimit reached (".($processCount)."/".$processLimit.")");
            $ret = false;
        }

        $this->CLI_releaseProcesses($orphanProcesses, true); // maybe this should be somehow included into the current lock
        $this->CLI_deleteProcessesMarkedDeleted();

        $this->db->sql_query('COMMIT');

        return $ret;
    }
2274
2275
    /**
     * Release a process and the required resources
     *
     * Runs four updates, in this order:
     *  1. detach queue entries from processes that are already inactive,
     *  2. mark inactive processes without pending queue entries as deleted,
     *  3. mark the requested processes as inactive,
     *  4. detach the not yet executed queue entries of the requested processes.
     *
     * NOTE(review): the ids are placed into the SQL unescaped; the visible
     * callers pass ids read from tx_crawler_process - confirm no caller passes
     * untrusted input.
     *
     * @param  mixed    $releaseIds   string with a single process-id or array with multiple process-ids
     * @param  boolean  $withinLock   true if the caller already opened a transaction (no BEGIN/COMMIT is issued here)
     * @return boolean                false if there was nothing to release, true otherwise
     */
    public function CLI_releaseProcesses($releaseIds, $withinLock=false) {

        if (!is_array($releaseIds)) {
            $releaseIds = array($releaseIds);
        }

        if (count($releaseIds) === 0) {
            return false;   //nothing to release
        }

        // single-quoted SQL string literals, shared by both queries below
        $quotedIdList = '\''.implode('\',\'', $releaseIds).'\'';

        if (!$withinLock) {
            $this->db->sql_query('BEGIN');
        }

        // some kind of 2nd chance algo - this way you need at least 2 processes to have a real cleanup
        // this ensures that a single process can't mess up the entire process table

        // free all queue entries still assigned to processes that are no longer active
        $this->db->exec_UPDATEquery(
            'tx_crawler_queue',
            'process_id IN (SELECT process_id FROM tx_crawler_process WHERE active=0 AND deleted=0)',
            array(
                'process_scheduled' => 0,
                'process_id' => ''
            )
        );
        // mark all processes as deleted which have no "waiting" queue-entries and which are not active
        $this->db->exec_UPDATEquery(
            'tx_crawler_process',
            'active=0 AND deleted=0
            AND NOT EXISTS (
                SELECT * FROM tx_crawler_queue
                WHERE tx_crawler_queue.process_id = tx_crawler_process.process_id
                AND tx_crawler_queue.exec_time = 0
            )',
            array(
                'deleted'=>'1',
                'system_process_id' => 0
            )
        );
        // mark all requested processes as non-active
        $this->db->exec_UPDATEquery(
            'tx_crawler_process',
            'process_id IN ('.$quotedIdList.') AND deleted=0',
            array(
                'active'=>'0'
            )
        );
        // free the not yet executed queue entries of the requested processes
        $this->db->exec_UPDATEquery(
            'tx_crawler_queue',
            'exec_time=0 AND process_id IN ('.$quotedIdList.')',
            array(
                'process_scheduled'=>0,
                'process_id'=>''
            )
        );

        if (!$withinLock) {
            $this->db->sql_query('COMMIT');
        }

        return true;
    }
2340
2341
    /**
     * Removes every row of tx_crawler_process that is flagged as deleted.
     *
     * @return void
     */
    public function CLI_deleteProcessesMarkedDeleted() {
        $table = 'tx_crawler_process';
        $onlyDeleted = 'deleted = 1';

        $this->db->exec_DELETEquery($table, $onlyDeleted);
    }
2349
2350
    /**
     * Check if there are still resources left for the process with the given id
     * Used to determine timeouts and to ensure a proper cleanup if there's a timeout
     *
     * Looks up the first non-deleted tx_crawler_process row for $pid (ordered
     * by ttl) and reports whether its "active" column equals 1.
     *
     * @param  string  $pid  identification string for the process
     * @return boolean       true if such a row exists and is active, false otherwise
     *
     * TODO: Why using  $this->db->sql_query('BEGIN'); &  $this->db->sql_query('COMMIT'); on a SELECT Query?
     */
    public function CLI_checkIfProcessIsActive($pid) {
        $ret = false;

        $this->db->sql_query('BEGIN');
        $res = $this->db->exec_SELECTquery(
            'process_id,active,ttl',
            'tx_crawler_process',
            'process_id = \''.$pid.'\'  AND deleted=0',
            '',
            'ttl',
            '0,1'
        );
        if ($row = $this->db->sql_fetch_assoc($res)) {
            // intval() returns an integer, so the strict comparison is safe
            $ret = intval($row['active']) === 1;
        }
        $this->db->sql_query('COMMIT');

        return $ret;
    }
2376
2377
    /**
     * Create a unique Id for the current process
     *
     * The id is generated once from the current microtime (hashed with
     * GeneralUtility::shortMD5()), stored in $this->processID and returned
     * unchanged on every later call.
     *
     * @return string  the ID
     */
    public function CLI_buildProcessId() {
        if (!$this->processID) {
            // first call: derive the id and cache it on the instance
            $this->processID = \TYPO3\CMS\Core\Utility\GeneralUtility::shortMD5($this->microtime(true));
        }

        return $this->processID;
    }
2388
2389
    /**
     * Thin wrapper around PHP's native microtime().
     *
     * @param bool $get_as_float  passed through to microtime()
     *
     * @return mixed  whatever microtime() returns for the given flag
     */
    protected function microtime($get_as_float = false )
    {
        $currentMicrotime = microtime($get_as_float);

        return $currentMicrotime;
    }
2398
2399
    /**
     * Prints a message to the stdout (only if debug-mode is enabled)
     *
     * Debug mode is enabled when the "processDebug" extension setting has a
     * non-zero integer value. The message is followed by a newline and the
     * output buffer is flushed immediately.
     *
     * @param  string $msg  the message
     * @return void
     */
    public function CLI_debug($msg) {
        if (intval($this->extensionSettings['processDebug'])) {
            echo $msg."\n";
            flush();
        }
    }
2409
2410
2411
2412
    /**
     * Get URL content by making direct request to TYPO3.
     *
     * Builds a shell command that runs the crawler's cli/bootstrap.php with the
     * configured PHP binary ("phpPath" setting) and passes the frontend base
     * path, the URL and the serialized, base64 encoded request headers as
     * arguments. Each argument is escaped with escapeshellarg().
     *
     * @param  string $url          Page URL
     * @param  int    $crawlerId    Crawler-ID
     * @return array                Keys: "request" (headers joined by CRLF, followed by a blank line),
     *                              "headers" (always an empty string), "content" (output of the shell command)
     */
    protected function sendDirectRequest($url, $crawlerId) {
        $urlParts = parse_url($url);
        if ($urlParts === false) {
            // parse_url() returns false for seriously malformed URLs;
            // hand an (empty) array to the header builder instead of a boolean
            $urlParts = array();
        }
        $requestHeaders = $this->buildRequestHeaderArray($urlParts, $crawlerId);

        $cmd  = escapeshellcmd($this->extensionSettings['phpPath']);
        $cmd .= ' ';
        $cmd .= escapeshellarg(\TYPO3\CMS\Core\Utility\ExtensionManagementUtility::extPath('crawler') . 'cli/bootstrap.php');
        $cmd .= ' ';
        $cmd .= escapeshellarg($this->getFrontendBasePath());
        $cmd .= ' ';
        $cmd .= escapeshellarg($url);
        $cmd .= ' ';
        $cmd .= escapeshellarg(base64_encode(serialize($requestHeaders)));

        $startTime = microtime(true);
        $content = $this->executeShellCommand($cmd);
        // NOTE(review): URL and elapsed seconds are concatenated without a separator - confirm this is intended
        $this->log($url . (microtime(true) - $startTime));

        $result = array(
            'request' => implode("\r\n", $requestHeaders) . "\r\n\r\n",
            'headers' => '',
            'content' => $content
        );

        return $result;
    }
2444
2445
    /**
     * Cleans up entries that stayed for too long in the queue. These are:
     * - processed entries (exec_time set) older than the "cleanUpProcessedAge" setting
     * - entries scheduled longer ago than the "cleanUpScheduledAge" setting
     *
     * Both settings are multiplied by 86400, i.e. they are given in days.
     * The resulting condition is passed to flushQueue().
     *
     * @return void
     */
    protected function cleanUpOldQueueEntries() {
        $secondsPerDay = 86400; // 24*60*60 Seconds in 24 hours

        $maxProcessedAge = $this->extensionSettings['cleanUpProcessedAge'] * $secondsPerDay;
        $maxScheduledAge = $this->extensionSettings['cleanUpScheduledAge'] * $secondsPerDay;

        $now = time();
        $processedBefore = $now - $maxProcessedAge;
        $scheduledBefore = $now - $maxScheduledAge;

        $this->flushQueue(
            '(exec_time<>0 AND exec_time<' . $processedBefore . ') OR scheduled<=' . $scheduledBefore
        );
    }
2460
2461
    /**
     * Initializes a TypoScript Frontend necessary for using TypoScript and TypoLink functions
     *
     * Creates $GLOBALS['TT'] (a NullTimeTracker) if no time tracker object
     * exists yet, then builds $GLOBALS['TSFE'] for the given page and runs its
     * initialization methods in the order shown below.
     *
     * @param int $id       page id handed to the frontend controller and used for the root line
     * @param int $typeNum  type number handed to the frontend controller
     *
     * @return void
     */
    protected function initTSFE($id = 1, $typeNum = 0) {
        \TYPO3\CMS\Frontend\Utility\EidUtility::initTCA();

        $timeTrackerExists = is_object($GLOBALS['TT']);
        if (!$timeTrackerExists) {
            $GLOBALS['TT'] = new \TYPO3\CMS\Core\TimeTracker\NullTimeTracker();
            $GLOBALS['TT']->start();
        }

        $frontendControllerClass = 'TYPO3\\CMS\\Frontend\\Controller\\TypoScriptFrontendController';
        $pageRepositoryClass = 'TYPO3\\CMS\\Frontend\\Page\\PageRepository';

        $GLOBALS['TSFE'] = \TYPO3\CMS\Core\Utility\GeneralUtility::makeInstance(
            $frontendControllerClass,
            $GLOBALS['TYPO3_CONF_VARS'],
            $id,
            $typeNum
        );
        $GLOBALS['TSFE']->sys_page = \TYPO3\CMS\Core\Utility\GeneralUtility::makeInstance($pageRepositoryClass);
        $GLOBALS['TSFE']->sys_page->init(TRUE);

        // the order of the following calls is kept exactly as before
        $GLOBALS['TSFE']->connectToDB();
        $GLOBALS['TSFE']->initFEuser();
        $GLOBALS['TSFE']->determineId();
        $GLOBALS['TSFE']->initTemplate();
        $GLOBALS['TSFE']->rootLine = $GLOBALS['TSFE']->sys_page->getRootLine($id, '');
        $GLOBALS['TSFE']->getConfigArray();

        \TYPO3\CMS\Frontend\Page\PageGenerator::pagegenInit();
    }
2487
}
2488
2489 1
// Load a registered XCLASS extension of this class, if one is configured for the current TYPO3 mode.
if (defined('TYPO3_MODE')) {
    if ($TYPO3_CONF_VARS[TYPO3_MODE]['XCLASS']['ext/crawler/class.tx_crawler_lib.php']) {
        include_once($TYPO3_CONF_VARS[TYPO3_MODE]['XCLASS']['ext/crawler/class.tx_crawler_lib.php']);
    }
}
2492