Completed
Branch master (261a74)
by Tomas Norre
02:16
created

tx_crawler_lib::requestUrl()   D

Complexity

Conditions 19
Paths 78

Size

Total Lines 83
Code Lines 49

Duplication

Lines 0
Ratio 0 %

Importance

Changes 0
Metric Value
cc 19
eloc 49
nc 78
nop 4
dl 0
loc 83
rs 4.8772
c 0
b 0
f 0

How to fix   Long Method    Complexity   

Long Method

Small methods make your code easier to understand, in particular if combined with a good name. Besides, if your method is small, finding a good name is usually much easier.

For example, if you find yourself adding comments to a method's body, this is usually a good sign to extract the commented part to a new method, and use the comment as a starting point when coming up with a good name for this new method.

Commonly applied refactorings include:

1
<?php
2
/***************************************************************
3
 *  Copyright notice
4
 *
5
 *  (c) 2016 AOE GmbH <[email protected]>
6
 *
7
 *  All rights reserved
8
 *
9
 *  This script is part of the TYPO3 project. The TYPO3 project is
10
 *  free software; you can redistribute it and/or modify
11
 *  it under the terms of the GNU General Public License as published by
12
 *  the Free Software Foundation; either version 3 of the License, or
13
 *  (at your option) any later version.
14
 *
15
 *  The GNU General Public License can be found at
16
 *  http://www.gnu.org/copyleft/gpl.html.
17
 *
18
 *  This script is distributed in the hope that it will be useful,
19
 *  but WITHOUT ANY WARRANTY; without even the implied warranty of
20
 *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
21
 *  GNU General Public License for more details.
22
 *
23
 *  This copyright notice MUST APPEAR in all copies of the script!
24
 ***************************************************************/
25
26
/**
27
 * Class tx_crawler_lib
28
 */
29
class tx_crawler_lib {
30
31
    public $setID = 0;
    public $processID = '';

    /**
     * Maximum stalled time for the CLI in seconds (one hour). If a process has had
     * the status "start" for this long it is regarded as stalled and a new process
     * is started.
     */
    public $max_CLI_exec_time = 3600;

    public $duplicateTrack = array();
    public $downloadUrls = array();

    public $incomingProcInstructions = array();
    public $incomingConfigurationSelection = array();

    public $registerQueueEntriesInternallyOnly = array();
    public $queueEntries = array();

    /**
     * Plain-text "[date] url" entries collected by urlListFromUrlArray().
     */
    public $urlList = array();

    public $debugMode = false;

    /**
     * Unserialized extension configuration, see setExtensionSettings().
     */
    public $extensionSettings = array();

    /**
     * Mount point value; false when no mount point is in use.
     */
    public $MP = false;

    /**
     * Path of the marker file whose existence flags the crawler as disabled.
     *
     * @var string
     */
    protected $processFilename;

    /**
     * Internal access mode; can be 'gui', 'cli' or 'cli_im'.
     *
     * @var string
     */
    protected $accessMode;

    /**
     * @var \TYPO3\CMS\Core\Database\DatabaseConnection
     */
    private $db;

    /**
     * @var \TYPO3\CMS\Core\Authentication\BackendUserAuthentication
     */
    private $backendUser;

    const CLI_STATUS_NOTHING_PROCCESSED = 0;
    // queue not empty
    const CLI_STATUS_REMAIN = 1;
    // (some) queue items were processed
    const CLI_STATUS_PROCESSED = 2;
    // instance didn't finish
    const CLI_STATUS_ABORTED = 4;
    const CLI_STATUS_POLLABLE_PROCESSED = 8;
76
77
    /**
78
     * Returns the accessMode, which can be gui, cli or cli_im
79
     *
80
     * @return string
81
     */
82
    public function getAccessMode()
    {
        // Plain accessor for the currently configured access mode.
        return $this->accessMode;
    }
85
86
    /**
87
     * @param string $accessMode
88
     */
89
    public function setAccessMode($accessMode)
    {
        // Stored as given; no validation is applied here.
        $this->accessMode = $accessMode;
    }
92
93
    /**
94
     * Set disabled status to prevent processes from being processed
95
     *
96
     * @param  bool $disabled (optional, defaults to true)
97
     * @return void
98
     */
99
    public function setDisabled($disabled = true)
    {
        // Disabling: create (or truncate) the marker file.
        if ($disabled) {
            \TYPO3\CMS\Core\Utility\GeneralUtility::writeFile($this->processFilename, '');
            return;
        }

        // Enabling: remove the marker file if it is there.
        if (is_file($this->processFilename)) {
            unlink($this->processFilename);
        }
    }
108
109
    /**
110
     * Get disable status
111
     *
112
     * @return bool true if disabled
113
     */
114
    public function getDisabled()
    {
        // The crawler counts as disabled exactly when the marker file exists.
        return is_file($this->processFilename);
    }
121
122
    /**
123
     * @param string $filenameWithPath
124
     *
125
     * @return void
126
     */
127
    public function setProcessFilename($filenameWithPath)
    {
        // Overrides the marker file location set by the constructor.
        $this->processFilename = $filenameWithPath;
    }
131
132
    /**
133
     * @return string
134
     */
135
    public function getProcessFilename()
    {
        // Path (including file name) of the marker file.
        return $this->processFilename;
    }
139
140
141
142
    /************************************
143
     *
144
     * Getting URLs based on Page TSconfig
145
     *
146
     ************************************/
147
148
    public function __construct()
    {
        $this->db = $GLOBALS['TYPO3_DB'];
        $this->backendUser = $GLOBALS['BE_USER'];
        $this->processFilename = PATH_site.'typo3temp/tx_crawler.proc';

        // Read the serialized extension configuration; anything that does not
        // unserialize to an array is treated as "no settings".
        $storedSettings = unserialize($GLOBALS['TYPO3_CONF_VARS']['EXT']['extConf']['crawler']);
        if (!is_array($storedSettings)) {
            $storedSettings = array();
        }
        $this->setExtensionSettings($storedSettings);

        // Defaults: fall back to 100 items per run when nothing positive is configured ...
        $configuredCount = \TYPO3\CMS\Core\Utility\MathUtility::convertToPositiveInteger(
            $this->extensionSettings['countInARun']
        );
        if ($configuredCount == 0) {
            $this->extensionSettings['countInARun'] = 100;
        }

        // ... and force the process limit into the range 1..99 (default 1).
        $this->extensionSettings['processLimit'] = \TYPO3\CMS\Core\Utility\MathUtility::forceIntegerInRange(
            $this->extensionSettings['processLimit'],
            1,
            99,
            1
        );
    }
167
168
    /**
169
     * Sets the extensions settings (unserialized pendant of $TYPO3_CONF_VARS['EXT']['extConf']['crawler']).
170
     *
171
     * @param array $extensionSettings
172
     * @return void
173
     */
174
    public function setExtensionSettings(array $extensionSettings)
    {
        // Replaces the complete settings array; nothing is merged.
        $this->extensionSettings = $extensionSettings;
    }
177
178
    /**
179
     * Check if the given page should be crawled
180
     *
181
     * @param array $pageRow
182
     * @return false|string false if the page should be crawled (not excluded), true / skipMessage if it should be skipped
183
     * @author Fabrizio Branca <[email protected]>
184
     */
185
    public function checkIfPageShouldBeSkipped(array $pageRow)
    {
        // Hidden pages are skipped unless the "crawlHiddenPages" setting is enabled.
        if (!$this->extensionSettings['crawlHiddenPages'] && $pageRow['hidden']) {
            return 'Because page is hidden';
        }

        // Doktypes 3 and 4, and every doktype from 199 upwards, are never crawled.
        $pageDoktype = $pageRow['doktype'];
        if (\TYPO3\CMS\Core\Utility\GeneralUtility::inList('3,4', $pageDoktype) || $pageDoktype >= 199) {
            return 'Because doktype is not allowed';
        }

        // Additional doktype lists registered in EXTCONF; the first match wins.
        if (is_array($GLOBALS['TYPO3_CONF_VARS']['EXTCONF']['crawler']['excludeDoktype'])) {
            foreach ($GLOBALS['TYPO3_CONF_VARS']['EXTCONF']['crawler']['excludeDoktype'] as $registrationKey => $excludedDoktypes) {
                if (\TYPO3\CMS\Core\Utility\GeneralUtility::inList($excludedDoktypes, $pageDoktype)) {
                    return 'Doktype was excluded by "'.$registrationKey.'"';
                }
            }
        }

        // Veto hooks: each one is expected to return false if the page is ok, and
        // true or a skip message if the page should not be crawled.
        if (is_array($GLOBALS['TYPO3_CONF_VARS']['EXTCONF']['crawler']['pageVeto'])) {
            foreach ($GLOBALS['TYPO3_CONF_VARS']['EXTCONF']['crawler']['pageVeto'] as $hookKey => $hookFunction) {
                $hookParameters = array(
                    'pageRow' => $pageRow
                );
                $veto = \TYPO3\CMS\Core\Utility\GeneralUtility::callUserFunction($hookFunction, $hookParameters, $this);
                if ($veto === false) {
                    continue;
                }

                // The first veto ends the check; later hooks are not executed.
                return is_string($veto) ? $veto : 'Veto from hook "'.htmlspecialchars($hookKey).'"';
            }
        }

        return false;
    }
242
243
    /**
244
     * Wrapper method for getUrlsForPageId()
245
     * It returns an array of configurations and no urls!
246
     *
247
     * @param  array  $pageRow       Page record with at least dok-type and uid columns.
248
     * @param  string $skipMessage
249
     * @return array                 Result (see getUrlsForPageId())
250
     * @see getUrlsForPageId()
251
     */
252
    public function getUrlsForPageRow(array $pageRow, &$skipMessage = '')
    {
        $skipReason = $this->checkIfPageShouldBeSkipped($pageRow);

        // Page is excluded: hand the reason back through the reference parameter.
        if ($skipReason !== false) {
            $skipMessage = $skipReason;
            return array();
        }

        // Page may be crawled: collect its configurations and clear the message.
        $configurations = $this->getUrlsForPageId($pageRow['uid']);
        $skipMessage = '';

        return $configurations;
    }
265
266
    /**
267
     * This method is used to count if there are ANY unprocessed queue entries
268
     * of a given page_id and the configuration which matches a given hash.
269
     * If there are none, we can skip an inner detail check
270
     *
271
     * @param  int    $uid
272
     * @param  string $configurationHash
273
     * @return boolean
274
     */
275
    protected function noUnprocessedQueueEntriesForPageWithConfigurationHashExist($uid, $configurationHash)
    {
        // The hash is quoted for SQL and the page id is cast to int before the
        // WHERE clause is assembled.
        $quotedHash = $this->db->fullQuoteStr($configurationHash, 'tx_crawler_queue');
        $whereClause = 'page_id=' . intval($uid)
            . ' AND configuration_hash=' . $quotedHash
            . ' AND exec_time=0';

        $queryResult = $this->db->exec_SELECTquery('count(*) as anz', 'tx_crawler_queue', $whereClause);
        $countRow = $this->db->sql_fetch_assoc($queryResult);

        // True when the count of matching entries with exec_time=0 is zero.
        return $countRow['anz'] == 0;
    }
282
283
    /**
284
     * Creates a list of URLs from input array (and submits them to queue if asked for)
285
     * See Web > Info module script + "indexed_search"'s crawler hook-client using this!
286
     *
287
     * @param    array        Information about URLs from pageRow to crawl.
288
     * @param    array        Page row
289
     * @param    integer        Unix time to schedule indexing to, typically time()
290
     * @param    integer        Number of requests per minute (creates the interleave between requests)
291
     * @param    boolean        If set, submits the URLs to queue
292
     * @param    boolean        If set (and submitcrawlUrls is false) will fill $downloadUrls with entries)
293
     * @param    array        Array which is passed by reference and contains the an id per url to secure we will not crawl duplicates
294
     * @param    array        Array which will be filled with URLS for download if flag is set.
295
     * @param    array        Array of processing instructions
296
     * @return    string        List of URLs (meant for display in backend module)
297
     *
298
     */
299
    public function urlListFromUrlArray(
        array $vv,
        array $pageRow,
        $scheduledTime,
        $reqMinute,
        $submitCrawlUrls,
        $downloadCrawlUrls,
        array &$duplicateTrack,
        array &$downloadUrls,
        array $incomingProcInstructions
    ) {
        // Always return a string, even when $vv['URLs'] is empty or every URL
        // is rejected by the processing instruction filter.
        // NOTE(review): inside the loop $urlList is assigned, not appended to, so
        // only the last handled URL is returned - confirm this is intended.
        $urlList = '';

        // realurl support (thanks to Ingo Renner)
        // Evaluated once so that $urlObj is guaranteed to exist wherever it is used.
        $useRealUrl = \TYPO3\CMS\Core\Utility\ExtensionManagementUtility::isLoaded('realurl') && $vv['subCfg']['realurl'];
        if ($useRealUrl) {

            /** @var tx_realurl $urlObj */
            $urlObj = \TYPO3\CMS\Core\Utility\GeneralUtility::makeInstance('tx_realurl');

            if (!empty($vv['subCfg']['baseUrl'])) {
                $urlParts = parse_url($vv['subCfg']['baseUrl']);
                $host = strtolower($urlParts['host']);
                $urlObj->host = $host;

                // First pass, finding configuration OR pointer string:
                $urlObj->extConf = isset($GLOBALS['TYPO3_CONF_VARS']['EXTCONF']['realurl'][$urlObj->host]) ? $GLOBALS['TYPO3_CONF_VARS']['EXTCONF']['realurl'][$urlObj->host] : $GLOBALS['TYPO3_CONF_VARS']['EXTCONF']['realurl']['_DEFAULT'];

                // If it turned out to be a string pointer, then look up the real config:
                if (is_string($urlObj->extConf)) {
                    $urlObj->extConf = is_array($GLOBALS['TYPO3_CONF_VARS']['EXTCONF']['realurl'][$urlObj->extConf]) ? $GLOBALS['TYPO3_CONF_VARS']['EXTCONF']['realurl'][$urlObj->extConf] : $GLOBALS['TYPO3_CONF_VARS']['EXTCONF']['realurl']['_DEFAULT'];
                }
            }

            // Fill in the pieces of $GLOBALS['TSFE'] that are still missing.
            if (!$GLOBALS['TSFE']->sys_page) {
                $GLOBALS['TSFE']->sys_page = \TYPO3\CMS\Core\Utility\GeneralUtility::makeInstance('TYPO3\CMS\Frontend\Page\PageRepository');
            }
            if (!$GLOBALS['TSFE']->csConvObj) {
                $GLOBALS['TSFE']->csConvObj = \TYPO3\CMS\Core\Utility\GeneralUtility::makeInstance('TYPO3\CMS\Core\Charset\CharsetConverter');
            }
            if (!$GLOBALS['TSFE']->tmpl->rootLine[0]['uid']) {
                $GLOBALS['TSFE']->tmpl->rootLine[0]['uid'] = $urlObj->extConf['pagePath']['rootpage_id'];
            }
        }

        if (!is_array($vv['URLs'])) {
            return 'ERROR - no URL generated';
        }

        $configurationHash = md5(serialize($vv));
        $skipInnerCheck = $this->noUnprocessedQueueEntriesForPageWithConfigurationHashExist($pageRow['uid'], $configurationHash);

        foreach ($vv['URLs'] as $urlQuery) {
            if (!$this->drawURLs_PIfilter($vv['subCfg']['procInstrFilter'], $incomingProcInstructions)) {
                continue;
            }

            // Calculate cHash:
            if ($vv['subCfg']['cHash']) {
                /* @var $cacheHash \TYPO3\CMS\Frontend\Page\CacheHashCalculator */
                $cacheHash = \TYPO3\CMS\Core\Utility\GeneralUtility::makeInstance('TYPO3\CMS\Frontend\Page\CacheHashCalculator');
                $urlQuery .= '&cHash=' . $cacheHash->generateForParameters($urlQuery);
            }

            // Create key by which to determine unique-ness:
            $uKey = $urlQuery.'|'.$vv['subCfg']['userGroups'].'|'.$vv['subCfg']['baseUrl'].'|'.$vv['subCfg']['procInstrFilter'];

            // realurl support (thanks to Ingo Renner)
            $urlQuery = 'index.php' . $urlQuery;
            if ($useRealUrl) {
                $params = array(
                    'LD' => array(
                        'totalURL' => $urlQuery
                    ),
                    'TCEmainHook' => true
                );
                $urlObj->encodeSpURL($params);
                $urlQuery = $params['LD']['totalURL'];
            }

            // Scheduled time: offset by the number of tracked URLs, then rounded
            // down to the full minute.
            $schTime = $scheduledTime + round(count($duplicateTrack)*(60/$reqMinute));
            $schTime = floor($schTime/60)*60;

            if (isset($duplicateTrack[$uKey])) {

                // If the URL key is already registered, just display it and do not resubmit it.
                $urlList = '<em><span class="typo3-dimmed">'.htmlspecialchars($urlQuery).'</span></em><br/>';

            } else {

                $urlList = '['.date('d.m.y H:i', $schTime).'] '.htmlspecialchars($urlQuery);
                $this->urlList[] = '['.date('d.m.y H:i', $schTime).'] '.$urlQuery;

                $theUrl = ($vv['subCfg']['baseUrl'] ? $vv['subCfg']['baseUrl'] : \TYPO3\CMS\Core\Utility\GeneralUtility::getIndpEnv('TYPO3_SITE_URL')) . $urlQuery;

                // Submit for crawling!
                if ($submitCrawlUrls) {
                    $added = $this->addUrl(
                        $pageRow['uid'],
                        $theUrl,
                        $vv['subCfg'],
                        $scheduledTime,
                        $configurationHash,
                        $skipInnerCheck
                    );
                    if ($added === false) {
                        $urlList .= ' (Url already existed)';
                    }
                } elseif ($downloadCrawlUrls) {
                    $downloadUrls[$theUrl] = $theUrl;
                }

                $urlList .= '<br />';
            }
            $duplicateTrack[$uKey] = TRUE;
        }

        return $urlList;
    }
417
418
    /**
419
     * Returns true if input processing instruction is among registered ones.
420
     *
421
     * @param  string $piString                     PI to test
422
     * @param  array  $incomingProcInstructions     Processing instructions
423
     * @return boolean                              TRUE if found
424
     */
425
    public function drawURLs_PIfilter($piString, array $incomingProcInstructions)
    {
        // No incoming instructions means nothing is filtered out.
        if (empty($incomingProcInstructions)) {
            return TRUE;
        }

        // Accept as soon as one incoming instruction is contained in the
        // comma-separated list $piString.
        foreach ($incomingProcInstructions as $pi) {
            if (\TYPO3\CMS\Core\Utility\GeneralUtility::inList($piString, $pi)) {
                return TRUE;
            }
        }

        // Explicit negative result: the method used to fall off the end here and
        // return null, although it is documented to return a boolean.
        return FALSE;
    }
436
437
438
    /**
     * Loads the page TSconfig for a page and lets registered hooks alter it.
     *
     * When a mount point ($this->MP) is set, the TSconfig is read for the second
     * part of the "-"-separated MP value instead of for $id.
     *
     * @param  integer $id  Page ID
     * @return array        Page TSconfig, possibly modified by hooks
     */
    public function getPageTSconfigForId($id)
    {
        $tsConfigPageId = $id;
        if ($this->MP) {
            $mountPointParts = explode('-', $this->MP);
            $tsConfigPageId = $mountPointParts[1];
        }
        $pageTSconfig = \TYPO3\CMS\Backend\Utility\BackendUtility::getPagesTSconfig($tsConfigPageId);

        // Call a hook to alter configuration; the TSconfig is passed by reference
        // so that every hook can change it in place.
        $registeredHooks = $GLOBALS['TYPO3_CONF_VARS']['EXTCONF']['crawler']['getPageTSconfigForId'];
        if (is_array($registeredHooks)) {
            $params = array(
                'pageId' => $id,
                'pageTSConfig' => &$pageTSconfig
            );
            foreach ($registeredHooks as $userFunc) {
                \TYPO3\CMS\Core\Utility\GeneralUtility::callUserFunction($userFunc, $params, $this);
            }
        }

        return $pageTSconfig;
    }
459
460
    /**
461
     * This methods returns an array of configurations.
462
     * And no urls!
463
     *
464
     * @param  integer $id  Page ID
465
     * @return array        Configurations from pages and configuration records
466
     */
467
    protected function getUrlsForPageId($id)    {
468
469
        /**
470
         * Get configuration from tsConfig
471
         */
472
473
        // Get page TSconfig for page ID:
474
        $pageTSconfig = $this->getPageTSconfigForId($id);
475
476
        $res = array();
477
478
        if (is_array($pageTSconfig) && is_array($pageTSconfig['tx_crawler.']['crawlerCfg.']))    {
479
            $crawlerCfg = $pageTSconfig['tx_crawler.']['crawlerCfg.'];
480
481
            if (is_array($crawlerCfg['paramSets.']))    {
482
                foreach($crawlerCfg['paramSets.'] as $key => $values)    {
483
                    if (!is_array($values))    {
484
485
                        // Sub configuration for a single configuration string:
486
                        $subCfg = (array)$crawlerCfg['paramSets.'][$key.'.'];
487
                        $subCfg['key'] = $key;
488
489
                        if (strcmp($subCfg['procInstrFilter'],''))    {
490
                            $subCfg['procInstrFilter'] = implode(',',\TYPO3\CMS\Core\Utility\GeneralUtility::trimExplode(',',$subCfg['procInstrFilter']));
491
                        }
492
                        $pidOnlyList = implode(',',\TYPO3\CMS\Core\Utility\GeneralUtility::trimExplode(',',$subCfg['pidsOnly'],1));
0 ignored issues
show
Documentation introduced by
1 is of type integer, but the function expects a boolean.

It seems like the type of the argument is not accepted by the function/method which you are calling.

In some cases, in particular if PHP’s automatic type-juggling kicks in this might be fine. In other cases, however this might be a bug.

We suggest to add an explicit type cast like in the following example:

function acceptsInteger($int) { }

$x = '123'; // string "123"

// Instead of
acceptsInteger($x);

// we recommend to use
acceptsInteger((integer) $x);
Loading history...
493
494
                            // process configuration if it is not page-specific or if the specific page is the current page:
495
                        if (!strcmp($subCfg['pidsOnly'],'') || \TYPO3\CMS\Core\Utility\GeneralUtility::inList($pidOnlyList,$id))    {
496
497
                                // add trailing slash if not present
498
                            if (!empty($subCfg['baseUrl']) && substr($subCfg['baseUrl'], -1) != '/') {
499
                                $subCfg['baseUrl'] .= '/';
500
                            }
501
502
                                // Explode, process etc.:
503
                            $res[$key] = array();
504
                            $res[$key]['subCfg'] = $subCfg;
505
                            $res[$key]['paramParsed'] = $this->parseParams($values);
506
                            $res[$key]['paramExpanded'] = $this->expandParameters($res[$key]['paramParsed'],$id);
507
                            $res[$key]['origin'] = 'pagets';
508
509
                                // recognize MP value
510
                            if(!$this->MP){
511
                                $res[$key]['URLs'] = $this->compileUrls($res[$key]['paramExpanded'],array('?id='.$id));
512
                            } else {
513
                                $res[$key]['URLs'] = $this->compileUrls($res[$key]['paramExpanded'],array('?id='.$id.'&MP='.$this->MP));
514
                            }
515
                        }
516
                    }
517
                }
518
519
            }
520
        }
521
522
        /**
523
         * Get configuration from tx_crawler_configuration records
524
         */
525
526
            // get records along the rootline
527
        $rootLine = \TYPO3\CMS\Backend\Utility\BackendUtility::BEgetRootLine($id);
528
529
        foreach ($rootLine as $page) {
530
            $configurationRecordsForCurrentPage = \TYPO3\CMS\Backend\Utility\BackendUtility::getRecordsByField(
531
                'tx_crawler_configuration',
532
                'pid',
533
                intval($page['uid']),
534
                \TYPO3\CMS\Backend\Utility\BackendUtility::BEenableFields('tx_crawler_configuration') . \TYPO3\CMS\Backend\Utility\BackendUtility::deleteClause('tx_crawler_configuration')
535
            );
536
537
            if (is_array($configurationRecordsForCurrentPage)) {
538
                foreach ($configurationRecordsForCurrentPage as $configurationRecord) {
539
540
                        // check access to the configuration record
541
                    if (empty($configurationRecord['begroups']) || $GLOBALS['BE_USER']->isAdmin() || $this->hasGroupAccess($GLOBALS['BE_USER']->user['usergroup_cached_list'], $configurationRecord['begroups'])) {
542
543
                        $pidOnlyList = implode(',',\TYPO3\CMS\Core\Utility\GeneralUtility::trimExplode(',',$configurationRecord['pidsonly'],1));
0 ignored issues
show
Documentation introduced by
1 is of type integer, but the function expects a boolean.

It seems like the type of the argument is not accepted by the function/method which you are calling.

In some cases, in particular if PHP’s automatic type-juggling kicks in this might be fine. In other cases, however this might be a bug.

We suggest to add an explicit type cast like in the following example:

function acceptsInteger($int) { }

$x = '123'; // string "123"

// Instead of
acceptsInteger($x);

// we recommend to use
acceptsInteger((integer) $x);
Loading history...
544
545
                            // process configuration if it is not page-specific or if the specific page is the current page:
546
                        if (!strcmp($configurationRecord['pidsonly'],'') || \TYPO3\CMS\Core\Utility\GeneralUtility::inList($pidOnlyList,$id)) {
547
                            $key = $configurationRecord['name'];
548
549
                                // don't overwrite previously defined paramSets
550
                            if (!isset($res[$key])) {
551
552
                                    /* @var $TSparserObject \TYPO3\CMS\Core\TypoScript\Parser\TypoScriptParser */
553
                                $TSparserObject = \TYPO3\CMS\Core\Utility\GeneralUtility::makeInstance('TYPO3\CMS\Core\TypoScript\Parser\TypoScriptParser');
554
                                $TSparserObject->parse($configurationRecord['processing_instruction_parameters_ts']);
555
556
                                $subCfg = array(
557
                                    'procInstrFilter' => $configurationRecord['processing_instruction_filter'],
558
                                    'procInstrParams.' => $TSparserObject->setup,
559
                                    'baseUrl' => $this->getBaseUrlForConfigurationRecord($configurationRecord['base_url'], $configurationRecord['sys_domain_base_url']),
560
                                    'realurl' => $configurationRecord['realurl'],
561
                                    'cHash' => $configurationRecord['chash'],
562
                                    'userGroups' => $configurationRecord['fegroups'],
563
                                    'exclude' => $configurationRecord['exclude'],
564
                                    'key' => $key,
565
                                );
566
567
                                    // add trailing slash if not present
568
                                if (!empty($subCfg['baseUrl']) && substr($subCfg['baseUrl'], -1) != '/') {
569
                                    $subCfg['baseUrl'] .= '/';
570
                                }
571
                                if (!in_array($id, $this->expandExcludeString($subCfg['exclude']))) {
572
                                    $res[$key] = array();
573
                                    $res[$key]['subCfg'] = $subCfg;
574
                                    $res[$key]['paramParsed'] = $this->parseParams($configurationRecord['configuration']);
575
                                    $res[$key]['paramExpanded'] = $this->expandParameters($res[$key]['paramParsed'], $id);
576
                                    $res[$key]['URLs'] = $this->compileUrls($res[$key]['paramExpanded'], array('?id=' . $id));
577
                                    $res[$key]['origin'] = 'tx_crawler_configuration_'.$configurationRecord['uid'];
578
                                }
579
                            }
580
                        }
581
                    }
582
                }
583
            }
584
        }
585
586
        if (is_array($GLOBALS['TYPO3_CONF_VARS']['EXTCONF']['crawler']['processUrls']))    {
587
            foreach($GLOBALS['TYPO3_CONF_VARS']['EXTCONF']['crawler']['processUrls'] as $func)    {
588
                $params = array(
589
                    'res' => &$res,
590
                );
591
                \TYPO3\CMS\Core\Utility\GeneralUtility::callUserFunction($func, $params, $this);
592
            }
593
        }
594
595
        return $res;
596
    }
597
598
    /**
     * Resolves the base URL to use for a crawler configuration record.
     *
     * If $sysDomainUid is a positive uid, the matching sys_domain record is looked up
     * (restricted by the backend enable fields and the delete clause). When that record
     * has a non-empty "domainName", "http://" followed by the domain name is returned.
     * In every other case the given $baseUrl is returned unchanged.
     *
     * @param  string   $baseUrl        Fallback base URL
     * @param  integer  $sysDomainUid   Uid of a sys_domain record; values <= 0 mean "none"
     * @return string
     */
    protected function getBaseUrlForConfigurationRecord($baseUrl, $sysDomainUid) {
        $sysDomainUid = intval($sysDomainUid);
        if ($sysDomainUid <= 0) {
            return $baseUrl;
        }

        $res = $this->db->exec_SELECTquery(
            '*',
            'sys_domain',
            'uid = '.$sysDomainUid .
            \TYPO3\CMS\Backend\Utility\BackendUtility::BEenableFields('sys_domain') .
            \TYPO3\CMS\Backend\Utility\BackendUtility::deleteClause('sys_domain')
        );
        $row = $this->db->sql_fetch_assoc($res);
            // release the result set, like the other queries of this class do
        $this->db->sql_free_result($res);

            // $row is not an array when no (enabled, non-deleted) domain record matched
        if (is_array($row) && isset($row['domainName']) && $row['domainName'] != '') {
            return 'http://'.$row['domainName'];
        }

        return $baseUrl;
    }
623
624
    /**
     * Collects the names of the crawler configurations available for a page branch.
     *
     * Two sources are combined:
     * - the keys of "tx_crawler.crawlerCfg.paramSets." in the Page TSconfig of $rootid
     *   (only keys whose value is an array; a trailing "." is stripped)
     * - the "name" of every tx_crawler_configuration record stored on a page of the
     *   rootline of $rootid or on a page of the tree read below $rootid
     *
     * @param  integer $rootid  Uid of the page the branch starts at
     * @param  integer $depth   Depth handed to PageTreeView::getTree()
     * @return array            List of configuration names (duplicates are not removed)
     */
    function getConfigurationsForBranch($rootid, $depth) {

        $configurationsForBranch = array();

            // 1) configurations defined in Page TSconfig
        $pageTSconfig = $this->getPageTSconfigForId($rootid);
        if (is_array($pageTSconfig)
            && isset($pageTSconfig['tx_crawler.']['crawlerCfg.']['paramSets.'])
            && is_array($pageTSconfig['tx_crawler.']['crawlerCfg.']['paramSets.'])
        ) {
            foreach ($pageTSconfig['tx_crawler.']['crawlerCfg.']['paramSets.'] as $key => $value) {
                if (!is_array($value)) {
                    continue;
                }
                $configurationsForBranch[] = substr($key, -1) == '.' ? substr($key, 0, -1) : $key;
            }
        }

            // 2) configuration records: collect the page uids of the rootline ...
        $pids = array();
        $rootLine = \TYPO3\CMS\Backend\Utility\BackendUtility::BEgetRootLine($rootid);
        foreach ($rootLine as $node) {
            $pids[] = intval($node['uid']);
        }

            // ... and of the readable pages below the root page
        /* @var $tree \TYPO3\CMS\Backend\Tree\View\PageTreeView */
        $tree = \TYPO3\CMS\Core\Utility\GeneralUtility::makeInstance('TYPO3\CMS\Backend\Tree\View\PageTreeView');
        $perms_clause = $GLOBALS['BE_USER']->getPagePermsClause(1);
        $tree->init('AND ' . $perms_clause);
        $tree->getTree($rootid, $depth, '');
        foreach ($tree->tree as $node) {
            $pids[] = intval($node['row']['uid']);
        }

        $pids = array_unique($pids);
        if (count($pids) === 0) {
                // nothing to look up; "pid IN ()" would be invalid SQL
            return $configurationsForBranch;
        }

        $res = $this->db->exec_SELECTquery(
            '*',
            'tx_crawler_configuration',
            'pid IN ('.implode(',', $pids).') '.
            \TYPO3\CMS\Backend\Utility\BackendUtility::BEenableFields('tx_crawler_configuration') .
            \TYPO3\CMS\Backend\Utility\BackendUtility::deleteClause('tx_crawler_configuration').' '.
            \TYPO3\CMS\Backend\Utility\BackendUtility::versioningPlaceholderClause('tx_crawler_configuration').' '
        );

        while ($row = $this->db->sql_fetch_assoc($res)) {
            $configurationsForBranch[] = $row['name'];
        }
        $this->db->sql_free_result($res);

        return $configurationsForBranch;
    }
669
670
    /**
     * Tells whether a user, identified by the groups it belongs to, may access an item.
     * (The group list can e.g. be taken from $GLOBALS['TSFE']->gr_list for the current logged in user.)
     *
     * @see \TYPO3\CMS\Frontend\Page\PageRepository::getMultipleGroupsWhereClause()
     * @param  string $groupList    Comma-separated list of group UIDs the user belongs to
     * @param  string $accessList   Comma-separated list of group UIDs allowed to access the item
     * @return bool                 TRUE if the access list is empty or contains at least one of the user's group UIDs
     * @author Fabrizio Branca <[email protected]>
     * @since 2009-01-19
     */
    function hasGroupAccess($groupList, $accessList) {
            // an item without access restriction is available to everybody
        $accessGranted = empty($accessList);

        if (!$accessGranted) {
            $userGroupUids = \TYPO3\CMS\Core\Utility\GeneralUtility::intExplode(',', $groupList);
            foreach ($userGroupUids as $userGroupUid) {
                if (\TYPO3\CMS\Core\Utility\GeneralUtility::inList($accessList, $userGroupUid)) {
                        // one matching group is enough
                    $accessGranted = true;
                    break;
                }
            }
        }

        return $accessGranted;
    }
692
693
    /**
     * Parses the GET vars of a query string into an array of key => value pairs.
     *
     * The query is split at "&", each part at its first "=". Names and values are
     * raw-URL-decoded. Parts with an empty name are skipped; a part without "=" yields
     * an empty string as value. If a name occurs more than once, the last value wins.
     *
     * @param  string  $inputQuery  Input query string
     * @return array                Keys are GET var names, values are the values of the GET vars.
     */
    function parseParams($inputQuery) {
        $paramKeyValues = array();

        foreach (explode('&', $inputQuery) as $paramAndValue) {
            $nameAndValue = explode('=', $paramAndValue, 2);
            $name = $nameAndValue[0];
            if (strlen($name) > 0) {
                    // a parameter without "=" has no second element
                $value = isset($nameAndValue[1]) ? $nameAndValue[1] : '';
                $paramKeyValues[rawurldecode($name)] = rawurldecode($value);
            }
        }

        return $paramKeyValues;
    }
713
714
    /**
     * Expands the parameter configuration into the individual values of each parameter.
     *
     * Syntax of a parameter value:
     * - A value wrapped in [...] is expanded as described below, any other value is taken literally
     * - The content of the brackets is split by "|"; the parts are processed individually and their results are added together
     * - For each part:
     *         - "[int]-[int]" = Integer range, expanded to all values in between, bounds included, from low to high (max. 1000 values). Example "1-34" or "-40--30"
     *         - "_TABLE:[TCA table name];[_PID:[optional page id, default is current page]];[_ENABLELANG:1]" = Look up of table records from PID, filtering out deleted records. Example "_TABLE:tt_content; _PID:123"
     *           _ENABLELANG:1 picks only original records without their language overlays
     *           Further options read here: _PIDFIELD (default "pid"), _FIELD (default "uid"),
     *           _WHERE (appended to the WHERE clause), _ADDTABLE (appended to the table name).
     *           Option name and value are separated by the first ":" only, so values may contain ":".
     *         - Default: Literal value
     *
     * After each part the "expandParameters" hooks registered in
     * $GLOBALS['TYPO3_CONF_VARS']['SC_OPTIONS']['crawler/class.tx_crawler_lib.php'] are called.
     *
     * @param    array      $paramArray  Array with key (GET var name) and values (value of GET var which is configuration for expansion)
     * @param    integer    $pid         Current page ID
     * @return   array                   Array with key (GET var name) with the value being an array of all possible values for that key.
     */
    function expandParameters($paramArray, $pid)    {
        $hookClass = 'crawler/class.tx_crawler_lib.php';

            // Traverse parameter names:
        foreach ($paramArray as $p => $v) {
            $v = trim($v);

                // Not encapsulated in square brackets: the literal value is the only value
            if (!(substr($v, 0, 1) === '[' && substr($v, -1) === ']')) {
                $paramArray[$p] = array($v);
                continue;
            }

                // Reset the paramArray value as an array and fill it from the parts inside the brackets
            $paramArray[$p] = array();
            foreach (explode('|', substr($v, 1, -1)) as $pV) {
                $part = trim($pV);

                if (preg_match('/^(-?[0-9]+)\s*-\s*(-?[0-9]+)$/', $part, $reg)) {
                        // Integer range (fx. 1-34 or -40--30 // reads minus 40 to minus 30)
                    $low = intval($reg[1]);
                    $high = intval($reg[2]);
                    if ($low > $high) {
                        $swap = $low;
                        $low = $high;
                        $high = $swap;
                    }

                        // Limit to size of range!
                    $runAwayBrake = 1000;
                    for ($a = $low; $a <= $high && $runAwayBrake > 0; $a++, $runAwayBrake--) {
                        $paramArray[$p][] = $a;
                    }
                } elseif (substr($part, 0, 7) == '_TABLE:') {

                        // Parse "KEY:value" options; split at the first ":" only
                    $subpartParams = array();
                    foreach (\TYPO3\CMS\Core\Utility\GeneralUtility::trimExplode(';', $pV) as $spV) {
                        $keyAndValue = explode(':', $spV, 2);
                        $subpartParams[trim($keyAndValue[0])] = isset($keyAndValue[1]) ? trim($keyAndValue[1]) : NULL;
                    }

                    $table = $subpartParams['_TABLE'];

                        // Table exists:
                    if (isset($GLOBALS['TCA'][$table])) {
                        $tableTca = $GLOBALS['TCA'][$table];

                        $lookUpPid = isset($subpartParams['_PID']) ? intval($subpartParams['_PID']) : $pid;
                        $pidField = isset($subpartParams['_PIDFIELD']) ? trim($subpartParams['_PIDFIELD']) : 'pid';
                        $where = isset($subpartParams['_WHERE']) ? $subpartParams['_WHERE'] : '';
                        $addTable = isset($subpartParams['_ADDTABLE']) ? $subpartParams['_ADDTABLE'] : '';

                        $fieldName = !empty($subpartParams['_FIELD']) ? $subpartParams['_FIELD'] : 'uid';
                        if ($fieldName === 'uid' || !empty($tableTca['columns'][$fieldName])) {

                            $andWhereLanguage = '';
                            $transOrigPointerField = isset($tableTca['ctrl']['transOrigPointerField']) ? $tableTca['ctrl']['transOrigPointerField'] : '';

                            if (!empty($subpartParams['_ENABLELANG']) && $transOrigPointerField) {
                                $andWhereLanguage = ' AND ' . $this->db->quoteStr($transOrigPointerField, $table) .' <= 0 ';
                            }

                            $where = $this->db->quoteStr($pidField, $table) .'='.intval($lookUpPid) . ' ' .
                                $andWhereLanguage . $where;

                                // rows are indexed by $fieldName, so the keys are the wanted values
                            $rows = $this->db->exec_SELECTgetRows(
                                $fieldName,
                                $table . $addTable,
                                $where . \TYPO3\CMS\Backend\Utility\BackendUtility::deleteClause($table),
                                '',
                                '',
                                '',
                                $fieldName
                            );

                            if (is_array($rows)) {
                                $paramArray[$p] = array_merge($paramArray[$p], array_keys($rows));
                            }
                        }
                    }
                } else {
                        // Just add value:
                    $paramArray[$p][] = $pV;
                }

                    // Hook for processing own expandParameters place holder
                if (isset($GLOBALS['TYPO3_CONF_VARS']['SC_OPTIONS'][$hookClass]['expandParameters'])
                    && is_array($GLOBALS['TYPO3_CONF_VARS']['SC_OPTIONS'][$hookClass]['expandParameters'])
                ) {
                    $_params = array(
                        'pObj' => &$this,
                        'paramArray' => &$paramArray,
                        'currentKey' => $p,
                        'currentValue' => $pV,
                        'pid' => $pid
                    );
                    foreach ($GLOBALS['TYPO3_CONF_VARS']['SC_OPTIONS'][$hookClass]['expandParameters'] as $_funcRef) {
                        \TYPO3\CMS\Core\Utility\GeneralUtility::callUserFunction($_funcRef, $_params, $this);
                    }
                }
            }

                // Make unique set of values and sort the parameter array by key
                // (kept inside the loop: the array is only sorted if a bracketed value occurred):
            $paramArray[$p] = array_unique($paramArray[$p]);
            ksort($paramArray);
        }

        return $paramArray;
    }
839
840
    /**
     * Compiling URLs from parameter array (output of expandParameters())
     *
     * Every URL in $urls is combined with every value of the first parameter, then the
     * method calls itself for the remaining parameters. The number of URLs is therefore the
     * multiplication of the number of parameter values for each key, limited per parameter
     * to the 'maxCompileUrls' extension setting (forced into 1 .. 1000000000, default 10000).
     * Empty values add nothing to the URL.
     *
     * @param  array  $paramArray   Output of expandParameters(): Array with keys (GET var names) and for each an array of values
     * @param  array  $urls         URLs accumulated in this array (for recursion), e.g. array('?id=123')
     * @return array                URLs accumulated. If $paramArray is empty or $urls is not an array, $urls is returned unchanged.
     */
    function compileUrls($paramArray, $urls = array()) {

        if (!count($paramArray) || !is_array($urls)) {
            return $urls;
        }

            // the limit does not change while compiling, so determine it once
        $maxCompileUrls = \TYPO3\CMS\Core\Utility\MathUtility::forceIntegerInRange(
            isset($this->extensionSettings['maxCompileUrls']) ? $this->extensionSettings['maxCompileUrls'] : 0,
            1,
            1000000000,
            10000
        );

            // shift first off stack:
        reset($paramArray);
        $varName = key($paramArray);
        $valueSet = array_shift($paramArray);

            // Traverse value set:
        $newUrls = array();
        foreach ($urls as $url) {
            foreach ($valueSet as $val) {
                if (count($newUrls) >= $maxCompileUrls) {
                        // limit reached: leave both loops, not only the inner one
                    break 2;
                }
                $newUrls[] = $url.(strcmp($val,'') ? '&'.rawurlencode($varName).'='.rawurlencode($val) : '');
            }
        }

        return $this->compileUrls($paramArray, $newUrls);
    }
873
874
875
876
877
878
879
880
881
882
883
884
    /************************************
885
     *
886
     * Crawler log
887
     *
888
     ************************************/
889
890
    /**
     * Returns the crawler queue records of a page, or removes them.
     *
     * @param  integer $id              Page ID for which to look up log entries.
     * @param  string  $filter          "pending" => only entries with exec_time=0, "finished" => only entries with exec_time>0, anything else => all entries
     * @param  boolean $doFlush         If TRUE, the matching entries are DELETED(!) instead of selected and an empty array is returned
     * @param  boolean $doFullFlush     Only used together with $doFlush: if TRUE, the entries of all pages (still restricted by $filter) are removed
     * @param  integer $itemsPerPage    Maximum number of entries to select, default is 10; values <= 0 disable the limit
     * @return array
     */
    function getLogEntriesForPageId($id,$filter = '', $doFlush = FALSE, $doFullFlush = FALSE, $itemsPerPage = 10) {
            // translate the filter name into an additional WHERE condition
        if ($filter == 'pending') {
            $addWhere = ' AND exec_time=0';
        } elseif ($filter == 'finished') {
            $addWhere = ' AND exec_time>0';
        } else {
            $addWhere = '';
        }

        $pageCondition = 'page_id=' . intval($id);

        if ($doFlush) {
            $scope = $doFullFlush ? '1=1' : $pageCondition;
            $this->flushQueue($scope . $addWhere);
            return array();
        }

        $limit = intval($itemsPerPage);

        return $this->db->exec_SELECTgetRows(
            '*',
            'tx_crawler_queue',
            $pageCondition . $addWhere,
            '',
            'scheduled DESC',
            $limit > 0 ? $limit : ''
        );
    }
923
924
    /**
     * Returns the crawler queue records of a set, or removes them.
     *
     * @param    integer    $set_id         Set ID for which to look up log entries.
     * @param    string     $filter         "pending" => only entries with exec_time=0, "finished" => only entries with exec_time>0, anything else => all entries
     * @param    boolean    $doFlush        If TRUE, the matching entries are DELETED(!) instead of selected and an empty array is returned
     * @param    boolean    $doFullFlush    Only used together with $doFlush: if TRUE, flushQueue() is called without a condition, i.e. the whole queue is removed and $filter is ignored
     * @param    integer    $itemsPerPage   Maximum number of entries to select, default is 10; values <= 0 disable the limit
     * @return   array
     */
    function getLogEntriesForSetId($set_id,$filter='',$doFlush=FALSE, $doFullFlush=FALSE, $itemsPerPage=10)    {

            // translate the filter name into an additional WHERE condition
        if ($filter == 'pending') {
            $addWhere = ' AND exec_time=0';
        } elseif ($filter == 'finished') {
            $addWhere = ' AND exec_time>0';
        } else {
            $addWhere = '';
        }

        $setCondition = 'set_id=' . intval($set_id) . $addWhere;

        if ($doFlush) {
            if ($doFullFlush) {
                $this->flushQueue('');
            } else {
                $this->flushQueue($setCondition);
            }
            return array();
        }

        $limit = intval($itemsPerPage);

        return $this->db->exec_SELECTgetRows(
            '*',
            'tx_crawler_queue',
            $setCondition,
            '',
            'scheduled DESC',
            $limit > 0 ? $limit : ''
        );
    }
957
958
    /**
     * Removes queue entries.
     *
     * If an observer is registered for the "queueEntryFlush" event, the event is posted once
     * for every set_id among the affected entries (with the uid/set_id rows of that set)
     * before the entries are deleted.
     *
     * @param  string $where    SQL condition (without "WHERE") for the entries which should be removed; an empty string removes all entries
     * @return void
     */
    protected function flushQueue($where='') {

        $realWhere = strlen($where) > 0 ? $where : '1=1';

        $dispatcher = tx_crawler_domain_events_dispatcher::getInstance();
        if ($dispatcher->hasObserver('queueEntryFlush')) {
            $groups = $this->db->exec_SELECTgetRows('DISTINCT set_id', 'tx_crawler_queue', $realWhere);

                // exec_SELECTgetRows() does not return an array when the query failed
            if (is_array($groups)) {
                foreach ($groups as $group) {
                        // parentheses keep a caller condition containing OR from escaping the set filter
                    $entriesOfSet = $this->db->exec_SELECTgetRows(
                        'uid, set_id',
                        'tx_crawler_queue',
                        '(' . $realWhere . ') AND set_id=' . intval($group['set_id'])
                    );
                    $dispatcher->post('queueEntryFlush', $group['set_id'], $entriesOfSet);
                }
            }
        }

        $this->db->exec_DELETEquery('tx_crawler_queue', $realWhere);
    }
977
978
    /**
     * Adds a call back entry to the crawler queue (typically called from hooks, see indexed search class "class.crawler.php")
     *
     * The call back reference is stored in the serialized parameters under the key "_CALLBACKOBJ".
     *
     * @param    integer    $setId      Set ID
     * @param    array      $params     Parameters to pass to call back function (anything but an array is replaced by an empty array)
     * @param    string     $callBack   Call back object reference, eg. 'EXT:indexed_search/class.crawler.php:&tx_indexedsearch_crawler'
     * @param    integer    $page_id    Page ID to attach it to
     * @param    integer    $schedule   Time at which to activate; 0 means the current time
     * @return   void
     */
    function addQueueEntry_callBack($setId,$params,$callBack,$page_id=0,$schedule=0) {

        $callBackParams = is_array($params) ? $params : array();
        $callBackParams['_CALLBACKOBJ'] = $callBack;
        $serializedParams = serialize($callBackParams);

            // fall back to "now" when no explicit time was given
        $scheduled = intval($schedule);
        if (!$scheduled) {
            $scheduled = $this->getCurrentTime();
        }

        $this->db->exec_INSERTquery(
            'tx_crawler_queue',
            array(
                'page_id' => intval($page_id),
                'parameters' => $serializedParams,
                'scheduled' => $scheduled,
                'exec_time' => 0,
                'set_id' => intval($setId),
                'result_data' => '',
            )
        );
    }
1005
1006
1007
1008
1009
1010
1011
1012
1013
1014
1015
1016
    /************************************
1017
     *
1018
     * URL setting
1019
     *
1020
     ************************************/
1021
1022
    /**
1023
     * Setting a URL for crawling:
1024
     *
1025
     * @param    integer        Page ID
1026
     * @param    string        Complete URL
1027
     * @param    array        Sub configuration array (from TS config)
1028
     * @param    integer        Scheduled-time
1029
     * @param     string        (optional) configuration hash
1030
     * @param     bool        (optional) skip inner duplication check
1031
     * @return    bool        true if the url was added, false if it already existed
1032
     */
1033
    function addUrl (
0 ignored issues
show
Best Practice introduced by
It is generally recommended to explicitly declare the visibility for methods.

Adding explicit visibility (private, protected, or public) is generally recommend to communicate to other developers how, and from where this method is intended to be used.

Loading history...
1034
        $id,
1035
        $url,
1036
        array $subCfg,
1037
        $tstamp,
1038
        $configurationHash='',
1039
        $skipInnerDuplicationCheck=false
1040
    ) {
1041
1042
        $urlAdded = false;
1043
1044
            // Creating parameters:
1045
        $parameters = array(
1046
            'url' => $url
1047
        );
1048
1049
            // fe user group simulation:
1050
        $uGs = implode(',',array_unique(\TYPO3\CMS\Core\Utility\GeneralUtility::intExplode(',',$subCfg['userGroups'],1)));
0 ignored issues
show
Documentation introduced by
1 is of type integer, but the function expects a boolean.

It seems like the type of the argument is not accepted by the function/method which you are calling.

In some cases, in particular if PHP’s automatic type-juggling kicks in this might be fine. In other cases, however this might be a bug.

We suggest to add an explicit type cast like in the following example:

function acceptsInteger($int) { }

$x = '123'; // string "123"

// Instead of
acceptsInteger($x);

// we recommend to use
acceptsInteger((integer) $x);
Loading history...
1051
        if ($uGs)    {
1052
            $parameters['feUserGroupList'] = $uGs;
1053
        }
1054
1055
            // Setting processing instructions
1056
        $parameters['procInstructions'] = \TYPO3\CMS\Core\Utility\GeneralUtility::trimExplode(',',$subCfg['procInstrFilter']);
1057
        if (is_array($subCfg['procInstrParams.']))    {
1058
            $parameters['procInstrParams'] = $subCfg['procInstrParams.'];
1059
        }
1060
1061
1062
            // Compile value array:
1063
        $parameters_serialized = serialize($parameters);
1064
        $fieldArray = array(
1065
            'page_id' => intval($id),
1066
            'parameters' => $parameters_serialized,
1067
            'parameters_hash' => \TYPO3\CMS\Core\Utility\GeneralUtility::shortMD5($parameters_serialized),
1068
            'configuration_hash' => $configurationHash,
1069
            'scheduled' => $tstamp,
1070
            'exec_time' => 0,
1071
            'set_id' => intval($this->setID),
1072
            'result_data' => '',
1073
            'configuration' => $subCfg['key'],
1074
        );
1075
1076
        if ($this->registerQueueEntriesInternallyOnly)    {
0 ignored issues
show
Bug Best Practice introduced by
The expression $this->registerQueueEntriesInternallyOnly of type array is implicitly converted to a boolean; are you sure this is intended? If so, consider using ! empty($expr) instead to make it clear that you intend to check for an array without elements.

This check marks implicit conversions of arrays to boolean values in a comparison. While in PHP an empty array is considered to be equal (but not identical) to false, this is not always apparent.

Consider making the comparison explicit by using empty(..) or ! empty(...) instead.

Loading history...
1077
                //the entries will only be registered and not stored to the database
1078
            $this->queueEntries[] = $fieldArray;
1079
        } else {
1080
1081
            if(!$skipInnerDuplicationCheck){
1082
                    // check if there is already an equal entry
1083
                $rows = $this->getDuplicateRowsIfExist($tstamp,$fieldArray);
1084
            }
1085
1086
            if (count($rows) == 0) {
1087
                $this->db->exec_INSERTquery('tx_crawler_queue', $fieldArray);
1088
                $uid = $this->db->sql_insert_id();
1089
                $rows[] = $uid;
0 ignored issues
show
Bug introduced by
The variable $rows does not seem to be defined for all execution paths leading up to this point.

If you define a variable conditionally, it can happen that it is not defined for all execution paths.

Let’s take a look at an example:

function myFunction($a) {
    switch ($a) {
        case 'foo':
            $x = 1;
            break;

        case 'bar':
            $x = 2;
            break;
    }

    // $x is potentially undefined here.
    echo $x;
}

In the above example, the variable $x is defined if you pass “foo” or “bar” as argument for $a. However, since the switch statement has no default case statement, if you pass any other value, the variable $x would be undefined.

Available Fixes

  1. Check for existence of the variable explicitly:

    function myFunction($a) {
        switch ($a) {
            case 'foo':
                $x = 1;
                break;
    
            case 'bar':
                $x = 2;
                break;
        }
    
        if (isset($x)) { // Make sure it's always set.
            echo $x;
        }
    }
    
  2. Define a default value for the variable:

    function myFunction($a) {
        $x = ''; // Set a default which gets overridden for certain paths.
        switch ($a) {
            case 'foo':
                $x = 1;
                break;
    
            case 'bar':
                $x = 2;
                break;
        }
    
        echo $x;
    }
    
  3. Add a value for the missing path:

    function myFunction($a) {
        switch ($a) {
            case 'foo':
                $x = 1;
                break;
    
            case 'bar':
                $x = 2;
                break;
    
            // We add support for the missing case.
            default:
                $x = '';
                break;
        }
    
        echo $x;
    }
    
Loading history...
1090
                $urlAdded = true;
1091
                tx_crawler_domain_events_dispatcher::getInstance()->post('urlAddedToQueue',$this->setID,array('uid' => $uid, 'fieldArray' => $fieldArray));
1092
            }else{
1093
                tx_crawler_domain_events_dispatcher::getInstance()->post('duplicateUrlInQueue',$this->setID,array('rows' => $rows, 'fieldArray' => $fieldArray));
1094
            }
1095
        }
1096
1097
        return $urlAdded;
1098
    }
1099
1100
    /**
     * Determines duplicates of a queue entry: unprocessed rows for the same page and the
     * same parameters hash.
     *
     * If the timestamp is now or in the past, any matching unprocessed entry scheduled up to
     * now counts as a duplicate (with "enableTimeslot" the window additionally spans
     * +/- 100 seconds around the current time).
     * If the timestamp is in the future, the queued entry must have exactly the same timestamp.
     *
     * @param int $tstamp Scheduled time of the new entry
     * @param array $fieldArray Queue row to check; 'page_id' and 'parameters_hash' are read
     * @author Fabrizio Branca
     * @author Timo Schmidt
     * @return array List of qid values of the duplicate rows (empty if there are none)
     */
    protected function getDuplicateRowsIfExist($tstamp,$fieldArray){
        $rows = array();
        $where = '';

            // The timestamp is concatenated into the WHERE clause below, so force it to an integer
        $tstamp = (int)$tstamp;
        $currentTime = $this->getCurrentTime();

        if ($tstamp <= $currentTime) {
                //this entry is scheduled with "now"
            if (!empty($this->extensionSettings['enableTimeslot'])) {
                $timeBegin = $currentTime - 100;
                $timeEnd = $currentTime + 100;
                $where = ' ((scheduled BETWEEN '.$timeBegin.' AND '.$timeEnd.' ) OR scheduled <= '. $currentTime.') ';
            } else {
                $where = 'scheduled <= ' . $currentTime;
            }
        } else {
                //entry with a timestamp in the future need to have the same schedule time
            $where = 'scheduled = ' . $tstamp;
        }

        if (!empty($where)) {
            $result = $this->db->exec_SELECTgetRows(
                'qid',
                'tx_crawler_queue',
                $where.
                ' AND NOT exec_time' .
                ' AND NOT process_id '.
                ' AND page_id='.intval($fieldArray['page_id']).
                ' AND parameters_hash = ' . $this->db->fullQuoteStr($fieldArray['parameters_hash'], 'tx_crawler_queue')
            );

            if (is_array($result)) {
                foreach ($result as $value) {
                    $rows[] = $value['qid'];
                }
            }
        }

        return $rows;
    }
1151
1152
    /**
     * Returns the current system time as reported by time().
     *
     * @author Timo Schmidt <[email protected]>
     * @return int
     */
    public function getCurrentTime(){
        $now = time();

        return $now;
    }
1161
1162
1163
1164
    /************************************
1165
     *
1166
     * URL reading
1167
     *
1168
     ************************************/
1169
1170
    /**
     * Read URL for single queue entry
     *
     * Loads the queue record, locks it by setting exec_time, executes it through
     * readUrl_exec(), writes the serialized result to result_data and reports whether
     * a registered "pollSuccess" processing instruction signalled success.
     *
     * @param integer $queueId
     * @param boolean $force If set, will process even if exec_time has been set!
     * @return integer|null Bit mask (0, or with self::CLI_STATUS_POLLABLE_PROCESSED set); NULL if no matching queue record was found
     */
    public function readUrl($queueId, $force = FALSE) {
        $ret = 0;
        if ($this->debugMode) {
            \TYPO3\CMS\Core\Utility\GeneralUtility::devlog('crawler-readurl start ' . microtime(true), __FUNCTION__);
        }

        // Get entry:
        $queueRows = $this->db->exec_SELECTgetRows('*', 'tx_crawler_queue',
            'qid=' . intval($queueId) . ($force ? '' : ' AND exec_time=0 AND process_scheduled > 0'));
        // The query may yield no rows at all, so do not blindly destructure it.
        $queueRec = (is_array($queueRows) && isset($queueRows[0])) ? $queueRows[0] : NULL;

        if (!is_array($queueRec)) {
            return;
        }

        $pageUidRootTypoScript = \AOE\Crawler\Utility\TypoScriptUtility::getPageUidForTypoScriptRootTemplateInRootLine((int)$queueRec['page_id']);
        $this->initTSFE((int)$pageUidRootTypoScript);

        \AOE\Crawler\Utility\SignalSlotUtility::emitSignal(
            __CLASS__,
            \AOE\Crawler\Utility\SignalSlotUtility::SIGNNAL_QUEUEITEM_PREPROCESS,
            array($queueId, &$queueRec)
        );

        // Set exec_time to lock record:
        $field_array = array('exec_time' => $this->getCurrentTime());

        if (isset($this->processID)) {
            //if multiprocessing is used we need to store the id of the process which has handled this entry
            $field_array['process_id_completed'] = $this->processID;
        }
        $this->db->exec_UPDATEquery('tx_crawler_queue', 'qid=' . intval($queueId), $field_array);

        $result = $this->readUrl_exec($queueRec);

        // readUrl_exec() may hand back a non-array (the string 'ERROR' or FALSE) on failure;
        // only an array result carries a 'content' element that can be unserialized.
        $resultData = (is_array($result) && isset($result['content'])) ? unserialize($result['content']) : FALSE;

        //atm there's no need to point to specific pollable extensions
        if (is_array($resultData)
            && isset($GLOBALS['TYPO3_CONF_VARS']['EXTCONF']['crawler']['pollSuccess'])
            && is_array($GLOBALS['TYPO3_CONF_VARS']['EXTCONF']['crawler']['pollSuccess'])
        ) {
            $procInstructions = isset($resultData['parameters']['procInstructions'])
                ? $resultData['parameters']['procInstructions']
                : NULL;

            foreach ($GLOBALS['TYPO3_CONF_VARS']['EXTCONF']['crawler']['pollSuccess'] as $pollable) {
                // only check the success value if the instruction is running
                // it is important to name the pollSuccess key same as the procInstructions key
                if (is_array($procInstructions) && in_array($pollable, $procInstructions)) {
                    if (!empty($resultData['success'][$pollable])) {
                        $ret |= self::CLI_STATUS_POLLABLE_PROCESSED;
                    }
                }
            }
        }

        // Set result in log which also denotes the end of the processing of this entry.
        $field_array = array('result_data' => serialize($result));

        \AOE\Crawler\Utility\SignalSlotUtility::emitSignal(
            __CLASS__,
            \AOE\Crawler\Utility\SignalSlotUtility::SIGNNAL_QUEUEITEM_POSTPROCESS,
            array($queueId, &$field_array)
        );

        $this->db->exec_UPDATEquery('tx_crawler_queue', 'qid=' . intval($queueId), $field_array);

        if ($this->debugMode) {
            \TYPO3\CMS\Core\Utility\GeneralUtility::devlog('crawler-readurl stop ' . microtime(true), __FUNCTION__);
        }

        return $ret;
    }
1244
1245
    /**
     * Read URL for not-yet-inserted log-entry
     *
     * Inserts the given row into tx_crawler_queue with exec_time set (which locks it),
     * executes it through readUrl_exec() and stores the serialized result in result_data.
     *
     * @param    array    $field_array    Queue field array
     * @return   mixed    The value returned by readUrl_exec() for the inserted record
     */
    public function readUrlFromArray($field_array) {

            // Set exec_time to lock record:
        $field_array['exec_time'] = $this->getCurrentTime();
        $this->db->exec_INSERTquery('tx_crawler_queue', $field_array);
        $queueId = $field_array['qid'] = $this->db->sql_insert_id();

        $result = $this->readUrl_exec($field_array);

            // Set result in log which also denotes the end of the processing of this entry.
        $this->db->exec_UPDATEquery(
            'tx_crawler_queue',
            'qid='.intval($queueId),
            array('result_data' => serialize($result))
        );

        return $result;
    }
1266
1267
    /**
     * Read URL for a queue record
     *
     * If the record's parameters name a callback object ('_CALLBACKOBJ'), its
     * crawler_execute() method is called; otherwise the URL in the parameters is
     * requested through requestUrl() and an 'urlCrawled' event is posted.
     *
     * @param    array    $queueRec    Queue record ('parameters', 'qid' and 'set_id' are read)
     * @return   mixed    Array with a 'content' element for callbacks, the return value of
     *                    requestUrl() for regular requests, or the string 'ERROR' if the
     *                    parameters could not be decoded
     */
    public function readUrl_exec($queueRec) {
            // Decode parameters:
        $parameters = unserialize($queueRec['parameters']);
        $result = 'ERROR';

        if (!is_array($parameters)) {
            return $result;
        }

        if (!empty($parameters['_CALLBACKOBJ'])) {    // Calling object:
            $objRef = $parameters['_CALLBACKOBJ'];
                // Plain assignment: objects are handles, and reference-assigning a function result raises a notice
            $callBackObj = \TYPO3\CMS\Core\Utility\GeneralUtility::getUserObj($objRef);
            if (is_object($callBackObj)) {
                unset($parameters['_CALLBACKOBJ']);
                $result = array('content' => serialize($callBackObj->crawler_execute($parameters,$this)));
            } else {
                $result = array('content' => 'No object: '.$objRef);
            }
        } else {    // Regular FE request:

                // Prepare:
            $crawlerId = $queueRec['qid'].':'.md5($queueRec['qid'].'|'.$queueRec['set_id'].'|'.$GLOBALS['TYPO3_CONF_VARS']['SYS']['encryptionKey']);

                // Get result:
            $result = $this->requestUrl($parameters['url'],$crawlerId);

            tx_crawler_domain_events_dispatcher::getInstance()->post('urlCrawled',$queueRec['set_id'],array('url' => $parameters['url'], 'result' => $result));
        }

        return $result;
    }
1302
1303
    /**
     * Gets the content of a URL.
     *
     * Unless "makeDirectRequests" is enabled in the extension settings, the request is
     * sent as an HTTP/1.0 GET over a socket, optionally through the configured proxy server.
     * With "follow30x" enabled, a redirect is followed with exactly one nested request;
     * every hop consumes one unit of $recursion and the response that triggered the
     * redirect is kept under the 'parentRequest' key.
     *
     * @param  string   $originalUrl    URL to read
     * @param  string   $crawlerId      Crawler ID string (qid + hash to verify)
     * @param  integer  $timeout        Connection timeout handed to fsockopen()
     * @param  integer  $recursion      Recursion limiter for 302 redirects
     * @return array|boolean            Array with 'request', 'headers' and 'content' (plus 'parentRequest'
     *                                  after a redirect), FALSE on failure; in direct-request mode the
     *                                  return value of sendDirectRequest()
     */
    public function requestUrl($originalUrl, $crawlerId, $timeout=2, $recursion=10) {

        if (!$recursion) return false;

            // Parse URL, checking for scheme:
        $url = parse_url($originalUrl);

        if ($url === FALSE) {
            if (TYPO3_DLOG) \TYPO3\CMS\Core\Utility\GeneralUtility::devLog(sprintf('Could not parse_url() for string "%s"', $originalUrl), 'crawler', 4, array('crawlerId' => $crawlerId));
            return FALSE;
        }

            // parse_url() only returns the components that are present in the URL
        $scheme = isset($url['scheme']) ? $url['scheme'] : '';
        $user = isset($url['user']) ? $url['user'] : '';
        $pass = isset($url['pass']) ? $url['pass'] : '';

        if (!in_array($scheme, array('','http','https'), TRUE)) {
            if (TYPO3_DLOG) \TYPO3\CMS\Core\Utility\GeneralUtility::devLog(sprintf('Scheme does not match for url "%s"', $originalUrl), 'crawler', 4, array('crawlerId' => $crawlerId));
            return FALSE;
        }

            // direct request
        if (!empty($this->extensionSettings['makeDirectRequests'])) {
            return $this->sendDirectRequest($originalUrl, $crawlerId);
        }

        $reqHeaders = $this->buildRequestHeaderArray($url, $crawlerId);

            // thanks to Pierrick Caillon for adding proxy support
        $rurl = $url;

        if ($GLOBALS['TYPO3_CONF_VARS']['SYS']['curlUse'] && $GLOBALS['TYPO3_CONF_VARS']['SYS']['curlProxyServer']) {
            $rurl = parse_url($GLOBALS['TYPO3_CONF_VARS']['SYS']['curlProxyServer']);
                // A proxy expects the absolute URL in the request line
            $url['path'] = $scheme . '://'
                . (isset($url['host']) ? $url['host'] : '')
                . ((isset($url['port']) && $url['port'] > 0) ? ':' . $url['port'] : '')
                . (isset($url['path']) ? $url['path'] : '');
            $reqHeaders = $this->buildRequestHeaderArray($url, $crawlerId);
        }

        $host = isset($rurl['host']) ? $rurl['host'] : '';
        $explicitPort = isset($rurl['port']) ? (int)$rurl['port'] : 0;

        if ($scheme == 'https') {
            $host = 'ssl://' . $host;
            $port = ($explicitPort > 0) ? $explicitPort : 443;
        } else {
            $port = ($explicitPort > 0) ? $explicitPort : 80;
        }

        $startTime = microtime(true);
        $fp = fsockopen($host, $port, $errno, $errstr, $timeout);

        if (!$fp) {
            if (TYPO3_DLOG) \TYPO3\CMS\Core\Utility\GeneralUtility::devLog(sprintf('Error while opening "%s"', $originalUrl), 'crawler', 4, array('crawlerId' => $crawlerId));
            return FALSE;
        }

            // Request message:
        $msg = implode("\r\n",$reqHeaders)."\r\n\r\n";
        fputs($fp, $msg);

            // Read response:
        $d = $this->getHttpResponseFromStream($fp);
        fclose($fp);

        $time = microtime(true) - $startTime;
        $this->log($originalUrl .' '.$time);

            // Implode content and headers:
        $result = array(
            'request' => $msg,
            'headers' => implode('', $d['headers']),
            'content' => implode('', (array)$d['content'])
        );

        if (!empty($this->extensionSettings['follow30x']) && ($newUrl = $this->getRequestUrlFrom302Header($d['headers'], $user, $pass))) {
                // Follow the redirect exactly once per hop, handing the timeout through unchanged
                // and counting the recursion limiter down so that redirect loops terminate.
            $newRequestUrl = $this->requestUrl($newUrl, $crawlerId, $timeout, --$recursion);

            if (is_array($newRequestUrl)) {
                $result = array_merge(array('parentRequest'=>$result), $newRequestUrl);
            } else {
                if (TYPO3_DLOG) \TYPO3\CMS\Core\Utility\GeneralUtility::devLog(sprintf('Error while opening "%s"', $newUrl), 'crawler', 4, array('crawlerId' => $crawlerId));
                return FALSE;
            }
        }

        return $result;
    }
1395
1396
    /**
     * Gets the base path of the website frontend.
     * (e.g. if you call http://mydomain.com/cms/index.php in
     * the browser the base path is "/cms/")
     *
     * Sources, in order of precedence: the "frontendBasePath" extension setting,
     * the absRefPrefix of the TSFE and - outside CLI mode - the TYPO3_SITE_PATH
     * environment value. Falls back to "/".
     *
     * @return string Base path of the website frontend
     */
    protected function getFrontendBasePath() {
        if (!empty($this->extensionSettings['frontendBasePath'])) {
            // Explicitly configured in the extension settings
            $basePath = $this->extensionSettings['frontendBasePath'];
        } elseif (!empty($GLOBALS['TSFE']->absRefPrefix)) {
            // Otherwise try config.absRefPrefix
            $basePath = $GLOBALS['TSFE']->absRefPrefix;
        } elseif (!(defined('TYPO3_cliMode') && TYPO3_cliMode)) {
            // Outside CLI mode the $_SERVER environment tells the base path
            $basePath = \TYPO3\CMS\Core\Utility\GeneralUtility::getIndpEnv('TYPO3_SITE_PATH');
        } else {
            $basePath = '/';
        }

        // Normalize to '/<pathSegments>/':
        if ($basePath != '/') {
            $basePath = rtrim('/' . ltrim($basePath, '/'), '/') . '/';
        }

        return $basePath;
    }
1425
1426
    /**
     * Runs the given command through shell_exec() and hands back its result unchanged.
     *
     * @param string $command Shell command to be executed
     * @return string Outputted result of the command execution
     */
    protected function executeShellCommand($command) {
        return shell_exec($command);
    }
1436
1437
    /**
     * Reads HTTP response from the given stream.
     *
     * Lines are read until the first empty line and stored trimmed as headers; everything
     * after that is collected unmodified as content. A non-resource argument yields the
     * empty response structure.
     *
     * @param  resource $streamPointer  Pointer to connection stream.
     * @return array                    Associative array with the following items:
     *                                  headers <array> Response headers sent by server.
     *                                  content <array> Content, with each line as an array item.
     */
    protected function getHttpResponseFromStream($streamPointer) {
        $response = array('headers' => array(), 'content' => array());

        if (!is_resource($streamPointer)) {
            return $response;
        }

            // read headers
            // Compare with FALSE explicitly: a line consisting of "0" is falsy but still valid data
        while (($line = fgets($streamPointer, 2048)) !== FALSE) {
            $line = trim($line);
            if ($line === '') {
                break;
            }
            $response['headers'][] = $line;
        }

            // read content
        while (($line = fgets($streamPointer, 2048)) !== FALSE) {
            $response['content'][] = $line;
        }

        return $response;
    }
1467
1468
    /**
     * Appends a line (timestamp directly followed by the message) to the log file
     * named in the "logFileName" extension setting. Does nothing if no log file is
     * configured; write errors are suppressed.
     *
     * @param string $message
     */
    protected function log($message) {
        if (empty($this->extensionSettings['logFileName'])) {
            return;
        }

        $logLine = date('Ymd His') . $message . "\n";
        @file_put_contents($this->extensionSettings['logFileName'], $logLine, FILE_APPEND);
    }
1476
1477
    /**
     * Builds HTTP request headers.
     *
     * Produces the lines of an HTTP/1.0 GET request: request line, Host, an optional
     * backend cookie for workspace previews, Connection, optional Basic authorization,
     * the crawler id and the user agent.
     *
     * @param array $url URL components as returned by parse_url(); absent components are treated as empty
     * @param string $crawlerId
     *
     * @return array Header lines without line terminators
     */
    protected function buildRequestHeaderArray(array $url, $crawlerId) {
            // parse_url() omits components that are not part of the URL
        $path = (isset($url['path']) && $url['path'] !== '') ? $url['path'] : '/';
        $query = isset($url['query']) ? (string)$url['query'] : '';
        $host = isset($url['host']) ? $url['host'] : '';
        $user = isset($url['user']) ? (string)$url['user'] : '';
        $pass = isset($url['pass']) ? $url['pass'] : '';

        $reqHeaders = array();
            // An empty path would give the invalid request line "GET  HTTP/1.0", hence the '/' default
        $reqHeaders[] = 'GET '.$path.($query !== '' ? '?'.$query : '').' HTTP/1.0';
        $reqHeaders[] = 'Host: '.$host;
        if (stripos($query, 'ADMCMD_previewWS') !== FALSE) {
            $reqHeaders[] = 'Cookie: $Version="1"; be_typo_user="1"; $Path=/';
        }
        $reqHeaders[] = 'Connection: close';
        if ($user !== '') {
            $reqHeaders[] = 'Authorization: Basic '. base64_encode($user.':'.$pass);
        }
        $reqHeaders[] = 'X-T3crawler: '.$crawlerId;
        $reqHeaders[] = 'User-Agent: TYPO3 crawler';

        return $reqHeaders;
    }
1500
1501
    /**
     * Check if the submitted HTTP-Header contains a redirect location and built new crawler-url
     *
     * Only responses whose status line contains "301 Moved", "302 Found" or "302 Moved"
     * are considered. If a user is given, the credentials are embedded into the
     * redirect URL.
     *
     * @param    array     $headers    HTTP header lines, status line first
     * @param    string    $user       HTTP Auth. User
     * @param    string    $pass       HTTP Auth. Password
     * @return   string|boolean        URL from redirection, FALSE if there is no usable redirect
     */
    protected function getRequestUrlFrom302Header($headers,$user='',$pass='') {
        if (!is_array($headers) || !isset($headers[0])) return false;
        if (!(stristr($headers[0],'301 Moved') || stristr($headers[0],'302 Found') || stristr($headers[0],'302 Moved'))) return false;

            // Find the first Location header. Header names are case-insensitive and the
            // value itself may contain colons, so split on the first colon only.
        $location = '';
        foreach ($headers as $hl) {
            $tmp = explode(':', $hl, 2);
            if (count($tmp) === 2 && strcasecmp(trim($tmp[0]), 'Location') === 0) {
                $location = trim($tmp[1]);
                break;
            }
        }
        if ($location === '') return false;

        if ($user == '') {
            return $location;
        }

        $tmp = parse_url($location);
        if (!$tmp) return false;

            // Credentials can only be embedded into an absolute URL; hand back anything else untouched
        if (!isset($tmp['scheme'], $tmp['host'])) {
            return $location;
        }

        $newUrl = $tmp['scheme'] . '://' . $user . ':' . $pass . '@' . $tmp['host']
            . (isset($tmp['port']) ? ':' . $tmp['port'] : '')
            . (isset($tmp['path']) ? $tmp['path'] : '');
        if (isset($tmp['query']) && $tmp['query'] != '') $newUrl .= '?' . $tmp['query'];

        return $newUrl;
    }
1529
1530
1531
1532
1533
1534
1535
1536
1537
    /**************************
1538
     *
1539
     * tslib_fe hooks:
1540
     *
1541
     **************************/
1542
1543
    /**
     * Initialization hook (called after database connection)
     * Takes the "HTTP_X_T3CRAWLER" header and looks up queue record and verifies if the session comes from the system (by comparing hashes)
     *
     * The header has the form "<qid>:<hash>". On success the crawler state is stored in
     * applicationData['tx_crawler'] of $params['pObj']; on a mismatch the request is aborted.
     * Requests without the header are left untouched.
     *
     * @param    array     $params    Parameters from frontend
     * @param    object    $ref       TSFE object (reference under PHP5); unused, part of the hook signature
     * @return   void
     */
    public function fe_init(&$params, $ref) {

            // Authenticate crawler request:
        if (!isset($_SERVER['HTTP_X_T3CRAWLER'])) {
            return;
        }

            // Pad so that a header without ':' yields an empty hash instead of an undefined offset
        list($queueId, $hash) = array_pad(explode(':', $_SERVER['HTTP_X_T3CRAWLER']), 2, '');

        $queueRows = $this->db->exec_SELECTgetRows('*','tx_crawler_queue','qid='.intval($queueId));
        $queueRec = (is_array($queueRows) && isset($queueRows[0])) ? $queueRows[0] : NULL;

        $hashMatches = FALSE;
        if (is_array($queueRec)) {
            $expectedHash = md5($queueRec['qid'].'|'.$queueRec['set_id'].'|'.$GLOBALS['TYPO3_CONF_VARS']['SYS']['encryptionKey']);
                // The hash comes from a request header: compare in constant time where PHP offers it
            $hashMatches = function_exists('hash_equals')
                ? hash_equals($expectedHash, (string)$hash)
                : ($hash === $expectedHash);
        }

            // If a crawler record was found and hash was matching, set it up:
        if ($hashMatches) {
            $params['pObj']->applicationData['tx_crawler']['running'] = TRUE;
            $params['pObj']->applicationData['tx_crawler']['parameters'] = unserialize($queueRec['parameters']);
            $params['pObj']->applicationData['tx_crawler']['log'] = array();
        } else {
            die('No crawler entry found!');
        }
    }
1568
1569
1570
1571
    /*****************************
1572
     *
1573
     * Compiling URLs to crawl - tools
1574
     *
1575
     *****************************/
1576
1577
    /**
1578
     * @param    integer        Root page id to start from.
1579
     * @param    integer        Depth of tree, 0=only id-page, 1= on sublevel, 99 = infinite
1580
     * @param    integer        Unix Time when the URL is timed to be visited when put in queue
1581
     * @param    integer        Number of requests per minute (creates the interleave between requests)
1582
     * @param    boolean        If set, submits the URLs to queue in database (real crawling)
1583
     * @param    boolean        If set (and submitcrawlUrls is false) will fill $downloadUrls with entries)
1584
     * @param    array        Array of processing instructions
1585
     * @param    array        Array of configuration keys
1586
     * @return    string        HTML code
1587
     */
1588
    function getPageTreeAndUrls(
0 ignored issues
show
Best Practice introduced by
It is generally recommended to explicitly declare the visibility for methods.

Adding explicit visibility (private, protected, or public) is generally recommended to communicate to other developers how, and from where this method is intended to be used.

Loading history...
1589
        $id,
1590
        $depth,
1591
        $scheduledTime,
1592
        $reqMinute,
1593
        $submitCrawlUrls,
1594
        $downloadCrawlUrls,
1595
        array $incomingProcInstructions,
1596
        array $configurationSelection
1597
    ) {
1598
1599
        global $BACK_PATH;
1600
        global $LANG;
1601
        if (!is_object($LANG)) {
1602
            $LANG = \TYPO3\CMS\Core\Utility\GeneralUtility::makeInstance('language');
1603
            $LANG->init(0);
1604
        }
1605
        $this->scheduledTime = $scheduledTime;
0 ignored issues
show
Bug introduced by
The property scheduledTime does not exist. Did you maybe forget to declare it?

In PHP it is possible to write to properties without declaring them. For example, the following is perfectly valid PHP code:

class MyClass { }

$x = new MyClass();
$x->foo = true;

Generally, it is a good practice to explicitly declare properties to avoid accidental typos and provide IDE auto-completion:

class MyClass {
    public $foo;
}

$x = new MyClass();
$x->foo = true;
Loading history...
1606
        $this->reqMinute = $reqMinute;
0 ignored issues
show
Bug introduced by
The property reqMinute does not exist. Did you maybe forget to declare it?

In PHP it is possible to write to properties without declaring them. For example, the following is perfectly valid PHP code:

class MyClass { }

$x = new MyClass();
$x->foo = true;

Generally, it is a good practice to explicitly declare properties to avoid accidental typos and provide IDE auto-completion:

class MyClass {
    public $foo;
}

$x = new MyClass();
$x->foo = true;
Loading history...
1607
        $this->submitCrawlUrls = $submitCrawlUrls;
0 ignored issues
show
Bug introduced by
The property submitCrawlUrls does not exist. Did you maybe forget to declare it?

In PHP it is possible to write to properties without declaring them. For example, the following is perfectly valid PHP code:

class MyClass { }

$x = new MyClass();
$x->foo = true;

Generally, it is a good practice to explictly declare properties to avoid accidental typos and provide IDE auto-completion:

class MyClass {
    public $foo;
}

$x = new MyClass();
$x->foo = true;
Loading history...
1608
        $this->downloadCrawlUrls = $downloadCrawlUrls;
0 ignored issues
show
Bug introduced by
The property downloadCrawlUrls does not exist. Did you maybe forget to declare it?

In PHP it is possible to write to properties without declaring them. For example, the following is perfectly valid PHP code:

class MyClass { }

$x = new MyClass();
$x->foo = true;

Generally, it is a good practice to explictly declare properties to avoid accidental typos and provide IDE auto-completion:

class MyClass {
    public $foo;
}

$x = new MyClass();
$x->foo = true;
Loading history...
1609
        $this->incomingProcInstructions = $incomingProcInstructions;
1610
        $this->incomingConfigurationSelection = $configurationSelection;
1611
1612
        $this->duplicateTrack = array();
1613
        $this->downloadUrls = array();
1614
1615
            // Drawing tree:
1616
            /* @var $tree \TYPO3\CMS\Backend\Tree\View\PageTreeView */
1617
        $tree = \TYPO3\CMS\Core\Utility\GeneralUtility::makeInstance('TYPO3\CMS\Backend\Tree\View\PageTreeView');
1618
        $perms_clause = $GLOBALS['BE_USER']->getPagePermsClause(1);
1619
        $tree->init('AND ' . $perms_clause);
1620
1621
        $pageinfo = \TYPO3\CMS\Backend\Utility\BackendUtility::readPageAccess($id, $perms_clause);
1622
1623
            // Set root row:
1624
        $tree->tree[] = Array(
1625
            'row' => $pageinfo,
1626
            'HTML' => \TYPO3\CMS\Backend\Utility\IconUtility::getSpriteIconForRecord('pages', $pageinfo)
1627
        );
1628
1629
            // Get branch beneath:
1630
        if ($depth)    {
1631
            $tree->getTree($id, $depth, '');
1632
        }
1633
1634
            // Traverse page tree:
1635
        $code = '';
1636
1637
        foreach ($tree->tree as $data) {
1638
1639
            $this->MP = false;
1640
1641
                // recognize mount points
1642
            if($data['row']['doktype'] == 7){
1643
                $mountpage = $this->db->exec_SELECTgetRows('*', 'pages', 'uid = '.$data['row']['uid']);
1644
1645
                    // fetch mounted pages
1646
                $this->MP = $mountpage[0]['mount_pid'].'-'.$data['row']['uid'];
0 ignored issues
show
Documentation Bug introduced by
The property $MP was declared of type boolean, but $mountpage[0]['mount_pid...' . $data['row']['uid'] is of type string. Maybe add a type cast?

This check looks for assignments to scalar types that may be of the wrong type.

To ensure the code behaves as expected, it may be a good idea to add an explicit type cast.

$answer = 42;

$correct = false;

$correct = (bool) $answer;
Loading history...
1647
1648
                $mountTree = \TYPO3\CMS\Core\Utility\GeneralUtility::makeInstance('TYPO3\CMS\Backend\Tree\View\PageTreeView');
1649
                $mountTree->init('AND '.$perms_clause);
1650
                $mountTree->getTree($mountpage[0]['mount_pid'], $depth, '');
1651
1652
                foreach($mountTree->tree as $mountData)    {
1653
                    $code .= $this->drawURLs_addRowsForPage(
1654
                        $mountData['row'],
1655
                        $mountData['HTML'].\TYPO3\CMS\Backend\Utility\BackendUtility::getRecordTitle('pages',$mountData['row'],TRUE)
1656
                    );
1657
                }
1658
1659
                    // replace page when mount_pid_ol is enabled
1660
                if($mountpage[0]['mount_pid_ol']){
1661
                    $data['row']['uid'] = $mountpage[0]['mount_pid'];
1662
                } else {
1663
                        // if the mount_pid_ol is not set the MP must not be used for the mountpoint page
1664
                    $this->MP = false;
1665
                }
1666
            }
1667
1668
            $code .= $this->drawURLs_addRowsForPage(
1669
                $data['row'],
0 ignored issues
show
Security Bug introduced by
It seems like $data['row'] can also be of type false; however, tx_crawler_lib::drawURLs_addRowsForPage() does only seem to accept array, did you maybe forget to handle an error condition?
Loading history...
1670
                $data['HTML'] . \TYPO3\CMS\Backend\Utility\BackendUtility::getRecordTitle('pages', $data['row'], TRUE)
0 ignored issues
show
Security Bug introduced by
It seems like $data['row'] can also be of type false; however, TYPO3\CMS\Backend\Utilit...ility::getRecordTitle() does only seem to accept array, did you maybe forget to handle an error condition?
Loading history...
1671
            );
1672
        }
1673
1674
        return $code;
1675
    }
1676
1677
    /**
1678
     * Expands exclude string.
1679
     *
1680
     * @param  string $excludeString    Exclude string
1681
     * @return array                    Array of page ids.
1682
     */
1683
    public function expandExcludeString($excludeString) {
        // Turns a comma separated exclude string into a list of page ids.
        // Each part is either "pid" (that page only), "pid+" (page plus up to
        // 99 levels of subpages) or "pid+depth" (page plus "depth" levels).

            // internal static caches;
        static $expandedExcludeStringCache = array();
        static $treeCache = array();

        // isset() instead of empty(): an exclude string that expands to an empty
        // list is a valid cached result and must not be recomputed on each call.
        if (!isset($expandedExcludeStringCache[$excludeString])) {
            $pidList = array();

            if (!empty($excludeString)) {
                /* @var $tree \TYPO3\CMS\Backend\Tree\View\PageTreeView */
                $tree = \TYPO3\CMS\Core\Utility\GeneralUtility::makeInstance('TYPO3\CMS\Backend\Tree\View\PageTreeView');
                $tree->init('AND ' . $this->backendUser->getPagePermsClause(1));

                $excludeParts = \TYPO3\CMS\Core\Utility\GeneralUtility::trimExplode(',', $excludeString);

                foreach ($excludeParts as $excludePart) {
                    // A part without "+" has no second element; read it explicitly
                    // instead of relying on list() with an undefined offset.
                    $pidAndDepth = \TYPO3\CMS\Core\Utility\GeneralUtility::trimExplode('+', $excludePart);
                    $pid = $pidAndDepth[0];
                    $depth = isset($pidAndDepth[1]) ? $pidAndDepth[1] : '';

                        // default is "page only" = "depth=0"
                    if (empty($depth)) {
                        $depth = (strpos($excludePart, '+') !== FALSE) ? 99 : 0;
                    }
                    // Compare and cache on an integer so non-numeric input cannot
                    // be interpreted as a positive depth.
                    $depth = (int)$depth;

                    $pidList[] = $pid;

                    if ($depth > 0) {
                        if (!isset($treeCache[$pid][$depth])) {
                            $tree->reset();
                            $tree->getTree($pid, $depth);
                            $treeCache[$pid][$depth] = $tree->tree;
                        }

                        foreach ($treeCache[$pid][$depth] as $data) {
                            $pidList[] = $data['row']['uid'];
                        }
                    }
                }
            }

            $expandedExcludeStringCache[$excludeString] = array_unique($pidList);
        }

        return $expandedExcludeStringCache[$excludeString];
    }
1727
1728
    /**
1729
     * Create the rows for display of the page tree
1730
     * For each page a number of rows are shown displaying GET variable configuration
1731
     *
1732
     * @param    array        Page row
1733
     * @param    string        Page icon and title for row
1734
     * @return    string        HTML <tr> content (one or more)
1735
     */
1736
    public function drawURLs_addRowsForPage(array $pageRow, $pageTitleAndIcon)    {
        // Renders one <tr> per crawler configuration that applies to $pageRow,
        // or a single "No entries" row when no configuration applies.
        // $pageTitleAndIcon is inserted as-is and is expected to be HTML.

        $skipMessage = '';

            // Get list of configurations
        $configurations = $this->getUrlsForPageRow($pageRow, $skipMessage);
        // Guard count()/foreach below against a non-array result.
        if (!is_array($configurations)) {
            $configurations = array();
        }

        if (count($this->incomingConfigurationSelection) > 0) {
                //     remove configuration that does not match the current selection
            foreach ($configurations as $confKey => $confArray) {
                if (!in_array($confKey, $this->incomingConfigurationSelection)) {
                    unset($configurations[$confKey]);
                }
            }
        }

            // Traverse parameter combinations:
        $c = 0;
        $content = '';
        if (count($configurations)) {
            foreach ($configurations as $confKey => $confArray) {

                    // Title column: only the first row carries it, spanning all rows of the page
                if (!$c) {
                    $titleClm = '<td rowspan="'.count($configurations).'">'.$pageTitleAndIcon.'</td>';
                } else {
                    $titleClm = '';
                }

                $excludeString = isset($confArray['subCfg']['exclude']) ? $confArray['subCfg']['exclude'] : '';

                if (!in_array($pageRow['uid'], $this->expandExcludeString($excludeString))) {

                        // URL list:
                    $urlList = $this->urlListFromUrlArray(
                        $confArray,
                        $pageRow,
                        $this->scheduledTime,
                        $this->reqMinute,
                        $this->submitCrawlUrls,
                        $this->downloadCrawlUrls,
                        $this->duplicateTrack,
                        $this->downloadUrls,
                        $this->incomingProcInstructions // if empty the urls won't be filtered by processing instructions
                    );

                        // Expanded parameters:
                    $paramExpanded = '';
                    $calcAccu = array();
                    $calcRes = 1;
                    foreach ($confArray['paramExpanded'] as $gVar => $gVal) {
                        $paramExpanded.= '
                            <tr>
                                <td class="bgColor4-20">'.htmlspecialchars('&'.$gVar.'=').'<br/>'.
                                                '('.count($gVal).')'.
                                                '</td>
                                <td class="bgColor4" nowrap="nowrap">'.nl2br(htmlspecialchars(implode(chr(10),$gVal))).'</td>
                            </tr>
                        ';
                        // Number of URL combinations is the product of the value counts.
                        $calcRes*= count($gVal);
                        $calcAccu[] = count($gVal);
                    }
                    $paramExpanded = '<table class="lrPadding c-list param-expanded">'.$paramExpanded.'</table>';
                    $paramExpanded.= 'Comb: '.implode('*',$calcAccu).'='.$calcRes;

                        // Options (configuration values are escaped before they are put into the HTML)
                    $optionValues = '';
                    if (!empty($confArray['subCfg']['userGroups'])) {
                        $optionValues.='User Groups: '.htmlspecialchars($confArray['subCfg']['userGroups']).'<br/>';
                    }
                    if (!empty($confArray['subCfg']['baseUrl'])) {
                        $optionValues.='Base Url: '.htmlspecialchars($confArray['subCfg']['baseUrl']).'<br/>';
                    }
                    if (!empty($confArray['subCfg']['procInstrFilter'])) {
                        $optionValues.='ProcInstr: '.htmlspecialchars($confArray['subCfg']['procInstrFilter']).'<br/>';
                    }

                        // Compile row:
                    $content .= '
                        <tr class="bgColor' . ($c%2 ? '-20':'-10') . '">
                            ' . $titleClm . '
                            <td>' . htmlspecialchars($confKey) . '</td>
                            <td>' . nl2br(htmlspecialchars(rawurldecode(trim(str_replace('&', chr(10) . '&', \TYPO3\CMS\Core\Utility\GeneralUtility::implodeArrayForUrl('', $confArray['paramParsed'])))))) . '</td>
                            <td>'.$paramExpanded.'</td>
                            <td nowrap="nowrap">' . $urlList . '</td>
                            <td nowrap="nowrap">' . $optionValues . '</td>
                            <td nowrap="nowrap">' . \TYPO3\CMS\Core\Utility\DebugUtility::viewArray($confArray['subCfg']['procInstrParams.']) . '</td>
                        </tr>';
                } else {

                    $content .= '<tr class="bgColor'.($c%2 ? '-20':'-10') . '">
                            '.$titleClm.'
                            <td>'.htmlspecialchars($confKey).'</td>
                            <td colspan="5"><em>No entries</em> (Page is excluded in this configuration)</td>
                        </tr>';

                }

                $c++;
            }
        } else {
            $message = !empty($skipMessage) ? ' ('.$skipMessage.')' : '';

                // Compile row:
            $content.= '
                <tr class="bgColor-20" style="border-bottom: 1px solid black;">
                    <td>'.$pageTitleAndIcon.'</td>
                    <td colspan="6"><em>No entries</em>'.$message.'</td>
                </tr>';
        }

        return $content;
    }
1850
1851
    /**
1852
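     * Counts the queue entries that are due (scheduled <= current time) and have neither been executed nor been assigned to a process.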
1853
     * @return int
1854
     */
1855
    public function getUnprocessedItemsCount() {
        // An entry counts as unprocessed when it has no execution time, is not
        // scheduled for a process and its scheduled time is not in the future.
        $res = $this->db->exec_SELECTquery(
                    'count(*) as num',
                    'tx_crawler_queue',
                    'exec_time=0
                    AND process_scheduled= 0
                    AND scheduled<='.$this->getCurrentTime()
        );

        $count = $this->db->sql_fetch_assoc($res);

        // Always return an integer as documented, also when no row could be
        // fetched (presumably after a failed query - TODO confirm).
        return is_array($count) ? (int)$count['num'] : 0;
    }
1867
1868
1869
1870
1871
1872
1873
1874
1875
    /*****************************
1876
     *
1877
     * CLI functions
1878
     *
1879
     *****************************/
1880
1881
    /**
1882
     * Main function for running from Command Line PHP script (cron job)
1883
     * See ext/crawler/cli/crawler_cli.phpsh for details
1884
     *
1885
     * @return    int Bitmask built from the self::CLI_STATUS_* constants describing the outcome of the run
1886
     */
1887
    public function CLI_main() {
        // Acquires a crawler process slot, runs the queue once and reports the
        // outcome as a combination of the self::CLI_STATUS_* flags.
        $this->setAccessMode('cli');
        $result = self::CLI_STATUS_NOTHING_PROCCESSED;
        $cliObj = \TYPO3\CMS\Core\Utility\GeneralUtility::makeInstance('tx_crawler_cli');

        if (isset($cliObj->cli_args['-h']) || isset($cliObj->cli_args['--help'])) {
            $cliObj->cli_validateArgs();
            $cliObj->cli_help();
            // Help was requested: nothing else to do for this CLI invocation.
            exit;
        }

        if (!$this->getDisabled() && $this->CLI_checkAndAcquireNewProcess($this->CLI_buildProcessId())) {
            // Command line arguments take precedence over the extension settings.
            $countInARun = $cliObj->cli_argValue('--countInARun') ? intval($cliObj->cli_argValue('--countInARun')) : $this->extensionSettings['countInARun'];
                // Seconds
            $sleepAfterFinish = $cliObj->cli_argValue('--sleepAfterFinish') ? intval($cliObj->cli_argValue('--sleepAfterFinish')) : $this->extensionSettings['sleepAfterFinish'];
                // Milliseconds
            $sleepTime = $cliObj->cli_argValue('--sleepTime') ? intval($cliObj->cli_argValue('--sleepTime')) : $this->extensionSettings['sleepTime'];

            try {
                    // Run process:
                $result = $this->CLI_run($countInARun, $sleepTime, $sleepAfterFinish);
            } catch (Exception $e) {
                // Keep the run best-effort, but leave a trace of why it was aborted.
                $this->CLI_debug('Crawler run aborted: ' . get_class($e) . ': ' . $e->getMessage());
                $result = self::CLI_STATUS_ABORTED;
            }

                // Cleanup
            $this->db->exec_DELETEquery('tx_crawler_process', 'assigned_items_count = 0');

                //TODO can't we do that in a clean way?
            $this->CLI_releaseProcesses($this->CLI_buildProcessId());

            // Query the remaining items once so the debug output and the status
            // flag are based on the same value.
            $remainingItems = $this->getUnprocessedItemsCount();
            $this->CLI_debug("Unprocessed Items remaining:".$remainingItems." (".$this->CLI_buildProcessId().")");
            $result |= ( $remainingItems > 0 ? self::CLI_STATUS_REMAIN : self::CLI_STATUS_NOTHING_PROCCESSED );
        } else {
            // Crawler disabled or no process slot available.
            $result |= self::CLI_STATUS_ABORTED;
        }

        return $result;
    }
1926
1927
    /**
1928
     * Function executed by crawler_im.php cli script.
1929
     *
1930
     * @return    void
1931
     */
1932
    public function CLI_main_im() {
        // Collects the URLs for a page (sub)tree and, depending on the "-o"
        // argument, prints them ("url"), queues them ("queue"), executes them
        // right away ("exec") or only lists what was found (anything else).
        $this->setAccessMode('cli_im');

        $cliObj = \TYPO3\CMS\Core\Utility\GeneralUtility::makeInstance('tx_crawler_cli_im');

            // Force user to admin state and set workspace to "Live":
        $this->backendUser->user['admin'] = 1;
        $this->backendUser->setWorkspace(0);

            // Print help
        if (!isset($cliObj->cli_args['_DEFAULT'][1])) {
            $cliObj->cli_validateArgs();
            $cliObj->cli_help();
            exit;
        }

        $cliObj->cli_validateArgs();

        // The output mode is looked up once and reused below.
        $mode = $cliObj->cli_argValue('-o');
        $submitToQueue = ($mode === 'queue' || $mode === 'exec');

        if ($mode === 'exec') {
            // NOTE(review): static analysis reports this property as declared
            // with type array while a boolean is assigned - TODO fix the
            // property documentation.
            $this->registerQueueEntriesInternallyOnly = TRUE;
        }

        if (isset($cliObj->cli_args['_DEFAULT'][2])) {
            // Crawler is called over TYPO3 BE
            $pageId = \TYPO3\CMS\Core\Utility\MathUtility::forceIntegerInRange($cliObj->cli_args['_DEFAULT'][2], 0);
        } else {
            // Crawler is called over cli
            $pageId = \TYPO3\CMS\Core\Utility\MathUtility::forceIntegerInRange($cliObj->cli_args['_DEFAULT'][1], 0);
        }

        $configurationKeys = $this->getConfigurationKeys($cliObj);

        // Without an explicit selection fall back to all configurations of the page.
        if (!is_array($configurationKeys)) {
            $configurations = $this->getUrlsForPageId($pageId);
            $configurationKeys = is_array($configurations) ? array_keys($configurations) : array();
        }

        if ($submitToQueue) {
            $reason = new tx_crawler_domain_reason();
            $reason->setReason(tx_crawler_domain_reason::REASON_GUI_SUBMIT);
            $reason->setDetailText('The cli script of the crawler added to the queue');
            // NOTE(review): $this->setID is only regenerated further below, so the
            // event is posted with the previous set id - confirm this is intended.
            tx_crawler_domain_events_dispatcher::getInstance()->post(
                'invokeQueueChange',
                $this->setID,
                array(    'reason' => $reason )
            );
        }

        if ($this->extensionSettings['cleanUpOldQueueEntries']) {
            $this->cleanUpOldQueueEntries();
        }

        $this->setID = \TYPO3\CMS\Core\Utility\GeneralUtility::md5int(microtime());
        $this->getPageTreeAndUrls(
            $pageId,
            \TYPO3\CMS\Core\Utility\MathUtility::forceIntegerInRange($cliObj->cli_argValue('-d'),0,99),
            $this->getCurrentTime(),
            \TYPO3\CMS\Core\Utility\MathUtility::forceIntegerInRange($cliObj->cli_isArg('-n') ? $cliObj->cli_argValue('-n') : 30,1,1000),
            $submitToQueue,
            $mode === 'url',
            \TYPO3\CMS\Core\Utility\GeneralUtility::trimExplode(',',$cliObj->cli_argValue('-proc'),TRUE),
            $configurationKeys
        );

        if ($mode === 'url') {
            $cliObj->cli_echo(implode(chr(10),$this->downloadUrls).chr(10),TRUE);
        } elseif ($mode === 'exec') {
            $cliObj->cli_echo("Executing ".count($this->urlList)." requests right away:\n\n");
            $cliObj->cli_echo(implode(chr(10),$this->urlList).chr(10));
            $cliObj->cli_echo("\nProcessing:\n");

            foreach ($this->queueEntries as $queueRec) {
                $p = unserialize($queueRec['parameters']);
                $cliObj->cli_echo($p['url'].' ('.implode(',',$p['procInstructions']).') => ');

                $result = $this->readUrlFromArray($queueRec);

                // NOTE(review): $result['content'] is presumably the body returned
                // by the requested URL; unserialize() on content from hosts that
                // are not fully trusted is unsafe - TODO confirm and harden.
                $requestResult = unserialize($result['content']);
                if (is_array($requestResult)) {
                    $resLog = is_array($requestResult['log']) ?  chr(10).chr(9).chr(9).implode(chr(10).chr(9).chr(9),$requestResult['log']) : '';
                    $cliObj->cli_echo('OK: '.$resLog.chr(10));
                } else {
                    $cliObj->cli_echo('Error checking Crawler Result: '.substr(preg_replace('/\s+/',' ',strip_tags($result['content'])),0,30000).'...'.chr(10));
                }
            }
        } elseif ($mode === 'queue') {
            $cliObj->cli_echo("Putting ".count($this->urlList)." entries in queue:\n\n");
            $cliObj->cli_echo(implode(chr(10),$this->urlList).chr(10));
        } else {
            $cliObj->cli_echo(count($this->urlList)." entries found for processing. (Use -o to decide action):\n\n",TRUE);
            $cliObj->cli_echo(implode(chr(10),$this->urlList).chr(10),TRUE);
        }
    }
2030
2031
    /**
     * Function executed by the crawler "flush" CLI script.
     *
     * Reads the page id from the first positional CLI argument and the mode
     * from the "-o" option, then delegates to getLogEntriesForPageId().
     * Supported modes are "all", "finished" and "pending". If no page id is
     * given, the help text is printed and the script terminates. For any other
     * mode the help text is printed and false is returned.
     *
     * @return bool false if the mode is unknown or getLogEntriesForPageId() returned false, true otherwise
     */
    public function CLI_main_flush() {
        $this->setAccessMode('cli_flush');
        $cliObj = \TYPO3\CMS\Core\Utility\GeneralUtility::makeInstance('tx_crawler_cli_flush');

        // Force user to admin state and set workspace to "Live":
        $this->backendUser->user['admin'] = 1;
        $this->backendUser->setWorkspace(0);

        // Print help and stop when the page id argument is missing
        if (!isset($cliObj->cli_args['_DEFAULT'][1])) {
            $cliObj->cli_validateArgs();
            $cliObj->cli_help();
            exit;
        }

        $cliObj->cli_validateArgs();
        $pageId = \TYPO3\CMS\Core\Utility\MathUtility::forceIntegerInRange($cliObj->cli_args['_DEFAULT'][1], 0);
        // A page id of 0 is handed to getLogEntriesForPageId() as its "full flush" flag
        $fullFlush = ($pageId === 0);

        $mode = $cliObj->cli_argValue('-o');

        switch ($mode) {
            case 'all':
                // "all" is passed on as an empty filter string
                $result = $this->getLogEntriesForPageId($pageId, '', true, $fullFlush);
                break;
            case 'finished':
            case 'pending':
                $result = $this->getLogEntriesForPageId($pageId, $mode, true, $fullFlush);
                break;
            default:
                $cliObj->cli_validateArgs();
                $cliObj->cli_help();
                $result = false;
        }

        return $result !== false;
    }
2073
2074
    /**
     * Obtains configuration keys from the "-conf" CLI argument.
     *
     * The argument value is treated as a comma-separated list; each item is
     * trimmed by GeneralUtility::trimExplode().
     *
     * @param  tx_crawler_cli_im $cliObj    Command line object
     * @return array                        List of keys, or an empty array if "-conf" is missing or blank
     */
    protected function getConfigurationKeys(tx_crawler_cli_im &$cliObj) {
        // Cast first: cli_argValue() presumably yields a non-string when the option is absent (TODO confirm)
        $parameter = trim((string)$cliObj->cli_argValue('-conf'));

        if ($parameter === '') {
            return array();
        }

        return \TYPO3\CMS\Core\Utility\GeneralUtility::trimExplode(',', $parameter);
    }
2084
2085
    /**
     * Running the functionality of the CLI (crawling URLs from queue)
     *
     * Steps: run the CLI hooks, purge executed queue entries older than the
     * configured "purgeQueueDays", select up to $countInARun due entries,
     * reserve them for this process inside a transaction and then read each
     * URL via readUrl().
     *
     * @param  int $countInARun      Maximum number of queue entries to process in this run
     * @param  int $sleepTime        Microseconds passed to usleep() after each processed entry
     * @param  int $sleepAfterFinish Seconds passed to sleep() after the last entry
     * @return int                   Bitmask built from the readUrl() results and the self::CLI_STATUS_* flags
     */
    public function CLI_run($countInARun, $sleepTime, $sleepAfterFinish) {
        $result = 0;
        $counter = 0;

        // First, run hooks:
        $this->CLI_runHooks();

        // Clean up the queue
        $purgeQueueDays = intval($this->extensionSettings['purgeQueueDays']);
        if ($purgeQueueDays > 0) {
            $purgeDate = $this->getCurrentTime() - 24 * 60 * 60 * $purgeQueueDays;
            $this->db->exec_DELETEquery(
                'tx_crawler_queue',
                'exec_time!=0 AND exec_time<' . $purgeDate
            );
        }

        // Select entries:
        //TODO Shouldn't this reside within the transaction?
        $rows = $this->db->exec_SELECTgetRows(
            'qid,scheduled',
            'tx_crawler_queue',
            'exec_time=0
                AND process_scheduled= 0
                AND scheduled<='.$this->getCurrentTime(),
            '',
            'scheduled, qid',
            intval($countInARun)
        );

        $processId = $this->CLI_buildProcessId();

        // The row list is not guaranteed to be an array (it may be null), so check before counting
        if (!is_array($rows) || count($rows) === 0) {
            $this->CLI_debug("Nothing within queue which needs to be processed (".$processId.")");
            return $result;
        }

        $quidList = array();
        foreach ($rows as $r) {
            $quidList[] = intval($r['qid']);
        }

        // reserve queue entries for process
        $this->db->sql_query('BEGIN');
        //TODO make sure we're not taking assigned queue-entries
        $this->db->exec_UPDATEquery(
            'tx_crawler_queue',
            'qid IN ('.implode(',', $quidList).')',
            array(
                'process_scheduled' => intval($this->getCurrentTime()),
                'process_id' => $processId
            )
        );

        // save the number of assigned queue entries to determine how many have been processed later
        $numberOfAffectedRows = $this->db->sql_affected_rows();
        $this->db->exec_UPDATEquery(
            'tx_crawler_process',
            "process_id = '".$processId."'",
            array(
                'assigned_items_count' => intval($numberOfAffectedRows)
            )
        );

        // Fewer updated rows than selected ones is treated as a collision with another process
        if ($numberOfAffectedRows == count($quidList)) {
            $this->db->sql_query('COMMIT');
        } else {
            $this->db->sql_query('ROLLBACK');
            $this->CLI_debug("Nothing processed due to multi-process collision (".$processId.")");
            return ( $result | self::CLI_STATUS_ABORTED );
        }

        foreach ($rows as $r) {
            $result |= $this->readUrl($r['qid']);

            $counter++;
            usleep(intval($sleepTime));    // Just to relax the system

            // If the CLI has been disabled between the start and the current URL we return
            // from the function and do NOT mark the process as ended.
            if ($this->getDisabled()) {
                return ( $result | self::CLI_STATUS_ABORTED );
            }

            if (!$this->CLI_checkIfProcessIsActive($processId)) {
                $this->CLI_debug("conflict / timeout (".$processId.")");

                //TODO might need an additional returncode
                $result |= self::CLI_STATUS_ABORTED;
                break;        //possible timeout
            }
        }

        sleep(intval($sleepAfterFinish));

        $msg = 'Rows: '.$counter;
        $this->CLI_debug($msg." (".$processId.")");

        if ($counter > 0) {
            $result |= self::CLI_STATUS_PROCESSED;
        }

        return $result;
    }
2199
2200
    /**
     * Activate hooks
     *
     * Instantiates every object reference registered in
     * $TYPO3_CONF_VARS['EXTCONF']['crawler']['cli_hooks'] and calls its
     * crawler_init() method with this crawler instance.
     *
     * @return    void
     */
    public function CLI_runHooks()    {
        global $TYPO3_CONF_VARS;

        // isset() first, so an installation without registered hooks does not read a missing array key
        if (!isset($TYPO3_CONF_VARS['EXTCONF']['crawler']['cli_hooks'])
            || !is_array($TYPO3_CONF_VARS['EXTCONF']['crawler']['cli_hooks'])
        ) {
            return;
        }

        foreach ($TYPO3_CONF_VARS['EXTCONF']['crawler']['cli_hooks'] as $objRef) {
            // Plain assignment: objects are passed by handle, so no "=&" is needed here
            $hookObj = \TYPO3\CMS\Core\Utility\GeneralUtility::getUserObj($objRef);
            if (is_object($hookObj)) {
                $hookObj->crawler_init($this);
            }
        }
    }
2216
2217
    /**
     * Try to acquire a new process with the given id
     * also performs some auto-cleanup for orphan processes
     * @todo preemption might not be the most elegant way to clean up
     *
     * Active process records whose ttl lies in the past are collected as
     * orphans and released. The remaining ones count against the configured
     * "processLimit". A new process record is inserted only while that limit
     * is not reached.
     *
     * @param  string    $id  identification string for the process
     * @return boolean        determines whether the attempt to get resources was successful
     */
    public function CLI_checkAndAcquireNewProcess($id) {
        $systemProcessId = getmypid();
        // getmypid() returns false on error, which is also caught by the "< 1" comparison
        if ($systemProcessId === false || $systemProcessId < 1) {
            return FALSE;
        }

        $ret = true;
        $processCount = 0;
        $orphanProcesses = array();
        $processLimit = intval($this->extensionSettings['processLimit']);

        $this->db->sql_query('BEGIN');

        $res = $this->db->exec_SELECTquery(
            'process_id,ttl',
            'tx_crawler_process',
            'active=1 AND deleted=0'
        );

        $currentTime = $this->getCurrentTime();

        while ($row = $this->db->sql_fetch_assoc($res)) {
            if ($row['ttl'] < $currentTime) {
                $orphanProcesses[] = $row['process_id'];
            } else {
                $processCount++;
            }
        }

        // if there are less than allowed active processes then add a new one
        if ($processCount < $processLimit) {
            $this->CLI_debug("add ".$this->CLI_buildProcessId()." (".($processCount+1)."/".$processLimit.")");

            // create new process record
            $this->db->exec_INSERTquery(
                'tx_crawler_process',
                array(
                    'process_id' => $id,
                    'active'=>'1',
                    'ttl' => ($currentTime + intval($this->extensionSettings['processMaxRunTime'])),
                    'system_process_id' => $systemProcessId
                )
            );
        } else {
            $this->CLI_debug("Processlimit reached (".($processCount)."/".$processLimit.")");
            $ret = false;
        }

        $this->CLI_releaseProcesses($orphanProcesses, true); // maybe this should be somehow included into the current lock
        $this->CLI_deleteProcessesMarkedDeleted();

        $this->db->sql_query('COMMIT');

        return $ret;
    }
2282
2283
    /**
     * Release a process and the required resources
     *
     * Besides deactivating the requested processes, this also cleans up
     * processes that were already inactive: their queue entries are detached
     * and the process records without open queue entries are flagged as deleted.
     *
     * @param  mixed    $releaseIds   string with a single process-id or array with multiple process-ids
     * @param  boolean  $withinLock   show whether the DB-actions are included within an existing lock
     * @return boolean                false if there was nothing to release, true otherwise
     */
    public function CLI_releaseProcesses($releaseIds, $withinLock=false) {
        if (!is_array($releaseIds)) {
            $releaseIds = array($releaseIds);
        }

        if (count($releaseIds) === 0) {
            return false;   //nothing to release
        }

        // Quote and escape every id once; the resulting list is used by both IN() clauses below
        $quotedIdList = implode(',', $this->db->fullQuoteArray($releaseIds, 'tx_crawler_process'));

        if (!$withinLock) {
            $this->db->sql_query('BEGIN');
        }

        // some kind of 2nd chance algo - this way you need at least 2 processes to have a real cleanup
        // this ensures that a single process can't mess up the entire process table

        // detach all queue entries that still point to a process which is already inactive
        $this->db->exec_UPDATEquery(
            'tx_crawler_queue',
            'process_id IN (SELECT process_id FROM tx_crawler_process WHERE active=0 AND deleted=0)',
            array(
                'process_scheduled' => 0,
                'process_id' => ''
            )
        );
        // mark all processes as deleted which have no "waiting" queue-entries and which are not active
        $this->db->exec_UPDATEquery(
            'tx_crawler_process',
            'active=0 AND deleted=0
            AND NOT EXISTS (
                SELECT * FROM tx_crawler_queue
                WHERE tx_crawler_queue.process_id = tx_crawler_process.process_id
                AND tx_crawler_queue.exec_time = 0
            )',
            array(
                'deleted'=>'1',
                'system_process_id' => 0
            )
        );
        // mark all requested processes as non-active
        $this->db->exec_UPDATEquery(
            'tx_crawler_process',
            'process_id IN ('.$quotedIdList.') AND deleted=0',
            array(
                'active'=>'0'
            )
        );
        // hand the not yet executed queue entries of the requested processes back to the pool
        $this->db->exec_UPDATEquery(
            'tx_crawler_queue',
            'exec_time=0 AND process_id IN ('.$quotedIdList.')',
            array(
                'process_scheduled'=>0,
                'process_id'=>''
            )
        );

        if (!$withinLock) {
            $this->db->sql_query('COMMIT');
        }

        return true;
    }
2348
2349
    /**
     * Removes every record flagged as deleted from the tx_crawler_process table.
     *
     * @return void
     */
    public function CLI_deleteProcessesMarkedDeleted() {
        $table = 'tx_crawler_process';
        $where = 'deleted = 1';

        $this->db->exec_DELETEquery($table, $where);
    }
2357
2358
    /**
     * Check if there are still resources left for the process with the given id
     * Used to determine timeouts and to ensure a proper cleanup if there's a timeout
     *
     * Looks up the first non-deleted process record with this id (ordered by
     * ttl) and reports whether its "active" flag is 1. An unknown id counts as
     * not active.
     *
     * @param  string  $pid  identification string for the process
     * @return boolean       determines if the process is still active / has resources
     */
    public function CLI_checkIfProcessIsActive($pid) {
        $ret = false;

        $this->db->sql_query('BEGIN');
        $res = $this->db->exec_SELECTquery(
            'process_id,active,ttl',
            'tx_crawler_process',
            // fullQuoteStr() escapes and quotes the id, so arbitrary caller input cannot break the query
            'process_id = '.$this->db->fullQuoteStr($pid, 'tx_crawler_process').'  AND deleted=0',
            '',
            'ttl',
            '0,1'
        );
        if ($row = $this->db->sql_fetch_assoc($res)) {
            $ret = intval($row['active']) === 1;
        }
        $this->db->sql_query('COMMIT');

        return $ret;
    }
2382
2383
    /**
     * Create a unique Id for the current process
     *
     * The id is derived once from the current microtime and then cached in
     * $this->processID, so every later call returns the same value.
     *
     * @return string  the ID
     */
    public function CLI_buildProcessId() {
        if (!$this->processID) {
            // Explicit string cast: the hashing helper works on strings, microtime(true) yields a float
            $this->processID = \TYPO3\CMS\Core\Utility\GeneralUtility::shortMD5((string)$this->microtime(true));
        }

        return $this->processID;
    }
2394
2395
    /**
     * Thin wrapper around PHP's native microtime().
     *
     * @param bool $get_as_float  forwarded unchanged to microtime()
     *
     * @return mixed  whatever microtime() returns for the given flag
     */
    protected function microtime($get_as_float = false )
    {
        $currentMicrotime = microtime($get_as_float);

        return $currentMicrotime;
    }
2404
2405
    /**
     * Prints a message to the stdout (only if debug-mode is enabled)
     *
     * Debug mode is on when the extension setting "processDebug" evaluates to
     * a non-zero integer. The output buffer is flushed after each message.
     *
     * @param  string $msg  the message
     * @return void
     */
    public function CLI_debug($msg) {
        if (intval($this->extensionSettings['processDebug'])) {
            echo $msg."\n";
            flush();
        }
    }
2415
2416
2417
2418
    /**
     * Get URL content by making direct request to TYPO3.
     *
     * Builds a shell command that runs the crawler's cli/bootstrap.php with the
     * configured PHP binary and passes the frontend base path, the URL and the
     * serialized, base64-encoded request headers as arguments. The command's
     * output is returned as the content.
     *
     * @param  string $url          Page URL
     * @param  int    $crawlerId    Crawler-ID
     * @return array                Array with the keys "request", "headers" (always empty) and "content"
     */
    protected function sendDirectRequest($url, $crawlerId) {
        $urlParts = parse_url($url);
        if (!is_array($urlParts)) {
            // parse_url() returns false for seriously malformed URLs; the header builder
            // needs an array, so report an empty response instead of passing false on.
            return array(
                'request' => '',
                'headers' => '',
                'content' => ''
            );
        }

        $requestHeaders = $this->buildRequestHeaderArray($urlParts, $crawlerId);

        // Every dynamic part is escaped separately before it becomes part of the command line
        $commandParts = array(
            escapeshellcmd($this->extensionSettings['phpPath']),
            escapeshellarg(\TYPO3\CMS\Core\Utility\ExtensionManagementUtility::extPath('crawler') . 'cli/bootstrap.php'),
            escapeshellarg($this->getFrontendBasePath()),
            escapeshellarg($url),
            escapeshellarg(base64_encode(serialize($requestHeaders)))
        );
        $cmd = implode(' ', $commandParts);

        // Use the class' own microtime() wrapper for the duration measurement
        $startTime = $this->microtime(true);
        $content = $this->executeShellCommand($cmd);
        $this->log($url . ($this->microtime(true) - $startTime));

        $result = array(
            'request' => implode("\r\n", $requestHeaders) . "\r\n\r\n",
            'headers' => '',
            'content' => $content
        );

        return $result;
    }
2450
2451
    /**
     * Cleans up entries that stayed for too long in the queue. These are:
     * - processed entries older than the extension setting "cleanUpProcessedAge" (in days)
     * - entries whose scheduled time is older than the extension setting "cleanUpScheduledAge" (in days)
     *
     * The resulting SQL condition is handed to flushQueue().
     *
     * @return void
     */
    protected function cleanUpOldQueueEntries() {
        $secondsPerDay = 86400; // 24*60*60 Seconds in 24 hours

        // floatval() keeps fractional day values and makes the arithmetic independent of the raw setting type
        $processedAgeInSeconds = floatval($this->extensionSettings['cleanUpProcessedAge']) * $secondsPerDay;
        $scheduledAgeInSeconds = floatval($this->extensionSettings['cleanUpScheduledAge']) * $secondsPerDay;

        $now = time();
        $condition = '(exec_time<>0 AND exec_time<' . ($now - $processedAgeInSeconds) . ') OR scheduled<=' . ($now - $scheduledAgeInSeconds);
        $this->flushQueue($condition);
    }
2467
    /**
     * Sets up a TypoScript frontend controller in $GLOBALS['TSFE'] so that
     * TypoScript and TypoLink functions can be used.
     *
     * Also installs a NullTimeTracker in $GLOBALS['TT'] when no time tracker
     * object exists yet.
     *
     * @param int $id       page id handed to the frontend controller and used for the root line
     * @param int $typeNum  type number handed to the frontend controller
     *
     * @return void
     */
    protected function initTSFE($id = 1, $typeNum = 0) {
        \TYPO3\CMS\Frontend\Utility\EidUtility::initTCA();

        if (!is_object($GLOBALS['TT'])) {
            $timeTracker = new \TYPO3\CMS\Core\TimeTracker\NullTimeTracker;
            $GLOBALS['TT'] = $timeTracker;
            $timeTracker->start();
        }

        // Work on a local handle; it refers to the same object that is stored in $GLOBALS['TSFE']
        $tsfe = \TYPO3\CMS\Core\Utility\GeneralUtility::makeInstance('TYPO3\\CMS\\Frontend\\Controller\\TypoScriptFrontendController',  $GLOBALS['TYPO3_CONF_VARS'], $id, $typeNum);
        $GLOBALS['TSFE'] = $tsfe;

        $tsfe->sys_page = \TYPO3\CMS\Core\Utility\GeneralUtility::makeInstance('TYPO3\\CMS\\Frontend\\Page\\PageRepository');
        $tsfe->sys_page->init(TRUE);

        $tsfe->connectToDB();
        $tsfe->initFEuser();
        $tsfe->determineId();
        $tsfe->initTemplate();
        $tsfe->rootLine = $tsfe->sys_page->getRootLine($id, '');
        $tsfe->getConfigArray();

        \TYPO3\CMS\Frontend\Page\PageGenerator::pagegenInit();
    }
2493
}
2494
2495
// Legacy XCLASS hook: load an extending class file if one is registered for this class.
// !empty() is used so a missing registration does not read an undefined array key.
if (defined('TYPO3_MODE') && !empty($TYPO3_CONF_VARS[TYPO3_MODE]['XCLASS']['ext/crawler/class.tx_crawler_lib.php'])) {
    include_once($TYPO3_CONF_VARS[TYPO3_MODE]['XCLASS']['ext/crawler/class.tx_crawler_lib.php']);
}
2498