Completed
Branch master (b7ffcb)
by Tomas Norre
17:57
created

CrawlerController::CLI_main_im()   F

Complexity

Conditions 17
Paths 337

Size

Total Lines 98
Code Lines 65

Duplication

Lines 0
Ratio 0 %

Importance

Changes 0
Metric Value
cc 17
eloc 65
nc 337
nop 0
dl 0
loc 98
rs 3.6909
c 0
b 0
f 0

How to fix   Long Method    Complexity   

Long Method

Small methods make your code easier to understand, in particular if combined with a good name. Besides, if your method is small, finding a good name is usually much easier.

For example, if you find yourself adding comments to a method's body, this is usually a good sign to extract the commented part to a new method, and use the comment as a starting point when coming up with a good name for this new method.

Commonly applied refactorings include:

1
<?php
2
namespace AOE\Crawler\Controller;
3
4
/***************************************************************
5
 *  Copyright notice
6
 *
7
 *  (c) 2017 AOE GmbH <[email protected]>
8
 *
9
 *  All rights reserved
10
 *
11
 *  This script is part of the TYPO3 project. The TYPO3 project is
12
 *  free software; you can redistribute it and/or modify
13
 *  it under the terms of the GNU General Public License as published by
14
 *  the Free Software Foundation; either version 3 of the License, or
15
 *  (at your option) any later version.
16
 *
17
 *  The GNU General Public License can be found at
18
 *  http://www.gnu.org/copyleft/gpl.html.
19
 *
20
 *  This script is distributed in the hope that it will be useful,
21
 *  but WITHOUT ANY WARRANTY; without even the implied warranty of
22
 *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
23
 *  GNU General Public License for more details.
24
 *
25
 *  This copyright notice MUST APPEAR in all copies of the script!
26
 ***************************************************************/
27
28
use AOE\Crawler\Command\CrawlerCommandLineController;
29
use AOE\Crawler\Command\FlushCommandLineController;
30
use AOE\Crawler\Command\QueueCommandLineController;
31
use AOE\Crawler\Domain\Model\Reason;
32
use AOE\Crawler\Event\EventDispatcher;
33
use AOE\Crawler\Utility\IconUtility;
34
use AOE\Crawler\Utility\SignalSlotUtility;
35
use TYPO3\CMS\Backend\Utility\BackendUtility;
36
use TYPO3\CMS\Backend\View\PageTreeView;
37
use TYPO3\CMS\Core\Authentication\BackendUserAuthentication;
38
use TYPO3\CMS\Core\Database\DatabaseConnection;
39
use TYPO3\CMS\Core\TimeTracker\NullTimeTracker;
40
use TYPO3\CMS\Core\Utility\DebugUtility;
41
use TYPO3\CMS\Core\Utility\ExtensionManagementUtility;
42
use TYPO3\CMS\Core\Utility\GeneralUtility;
43
use TYPO3\CMS\Core\Utility\MathUtility;
44
use TYPO3\CMS\Frontend\Controller\TypoScriptFrontendController;
45
use TYPO3\CMS\Frontend\Page\PageGenerator;
46
use TYPO3\CMS\Frontend\Page\PageRepository;
47
use TYPO3\CMS\Frontend\Utility\EidUtility;
48
49
/**
50
 * Class CrawlerController
51
 * 
52
 * @package AOE\Crawler\Controller
53
 */
54
class CrawlerController
55
{
56
    /**
57
     * @var integer
58
     */
59
    public $setID = 0;
60
61
    /**
62
     * @var string
63
     */
64
    public $processID = '';
65
66
    /**
67
     * One hour is max stalled time for the CLI
68
     * If the process had the status "start" for 3600 seconds, it will be regarded stalled and a new process is started
69
     *
70
     * @var integer
71
     */
72
    public $max_CLI_exec_time = 3600;
73
74
    /**
75
     * @var array
76
     */
77
    public $duplicateTrack = [];
78
79
    /**
80
     * @var array
81
     */
82
    public $downloadUrls = [];
83
84
    /**
85
     * @var array
86
     */
87
    public $incomingProcInstructions = [];
88
89
    /**
90
     * @var array
91
     */
92
    public $incomingConfigurationSelection = [];
93
94
    /**
95
     * @var array
96
     */
97
    public $registerQueueEntriesInternallyOnly = [];
98
99
    /**
100
     * @var array
101
     */
102
    public $queueEntries = [];
103
104
    /**
105
     * @var array
106
     */
107
    public $urlList = [];
108
109
    /**
110
     * @var boolean
111
     */
112
    public $debugMode = false;
113
114
    /**
115
     * @var array
116
     */
117
    public $extensionSettings = [];
118
119
    /**
120
     * Mount Point
121
     *
122
     * @var boolean
123
     */
124
    public $MP = false;
125
126
    /**
127
     * @var string
128
     */
129
    protected $processFilename;
130
131
    /**
132
     * Holds the internal access mode can be 'gui','cli' or 'cli_im'
133
     *
134
     * @var string
135
     */
136
    protected $accessMode;
137
138
    /**
139
     * @var DatabaseConnection
140
     */
141
    private $db;
142
143
    /**
144
     * @var BackendUserAuthentication
145
     */
146
    private $backendUser;
147
148
    const CLI_STATUS_NOTHING_PROCCESSED = 0;
149
    const CLI_STATUS_REMAIN = 1; //queue not empty
150
    const CLI_STATUS_PROCESSED = 2; //(some) queue items where processed
151
    const CLI_STATUS_ABORTED = 4; //instance didn't finish
152
    const CLI_STATUS_POLLABLE_PROCESSED = 8;
153
154
    /**
155
     * Method to set the accessMode can be gui, cli or cli_im
156
     *
157
     * @return string
158
     */
159
    public function getAccessMode()
160
    {
161
        return $this->accessMode;
162
    }
163
164
    /**
165
     * @param string $accessMode
166
     */
167
    public function setAccessMode($accessMode)
168
    {
169
        $this->accessMode = $accessMode;
170
    }
171
172
    /**
173
     * Set disabled status to prevent processes from being processed
174
     *
175
     * @param  bool $disabled (optional, defaults to true)
176
     * @return void
177
     */
178
    public function setDisabled($disabled = true)
179
    {
180
        if ($disabled) {
181
            GeneralUtility::writeFile($this->processFilename, '');
182
        } else {
183
            if (is_file($this->processFilename)) {
184
                unlink($this->processFilename);
185
            }
186
        }
187
    }
188
189
    /**
190
     * Get disable status
191
     *
192
     * @return bool true if disabled
193
     */
194
    public function getDisabled()
195
    {
196
        if (is_file($this->processFilename)) {
197
            return true;
198
        } else {
199
            return false;
200
        }
201
    }
202
203
    /**
204
     * @param string $filenameWithPath
205
     *
206
     * @return void
207
     */
208
    public function setProcessFilename($filenameWithPath)
209
    {
210
        $this->processFilename = $filenameWithPath;
211
    }
212
213
    /**
214
     * @return string
215
     */
216
    public function getProcessFilename()
217
    {
218
        return $this->processFilename;
219
    }
220
221
    /************************************
222
     *
223
     * Getting URLs based on Page TSconfig
224
     *
225
     ************************************/
226
227
    public function __construct()
228
    {
229
        $this->db = $GLOBALS['TYPO3_DB'];
230
        $this->backendUser = $GLOBALS['BE_USER'];
231
        $this->processFilename = PATH_site . 'typo3temp/tx_crawler.proc';
232
233
        $settings = unserialize($GLOBALS['TYPO3_CONF_VARS']['EXT']['extConf']['crawler']);
234
        $settings = is_array($settings) ? $settings : [];
235
236
        // read ext_em_conf_template settings and set
237
        $this->setExtensionSettings($settings);
238
239
        // set defaults:
240
        if (MathUtility::convertToPositiveInteger($this->extensionSettings['countInARun']) == 0) {
241
            $this->extensionSettings['countInARun'] = 100;
242
        }
243
244
        $this->extensionSettings['processLimit'] = MathUtility::forceIntegerInRange($this->extensionSettings['processLimit'], 1, 99, 1);
245
    }
246
247
    /**
248
     * Sets the extensions settings (unserialized pendant of $TYPO3_CONF_VARS['EXT']['extConf']['crawler']).
249
     *
250
     * @param array $extensionSettings
251
     * @return void
252
     */
253
    public function setExtensionSettings(array $extensionSettings)
254
    {
255
        $this->extensionSettings = $extensionSettings;
256
    }
257
258
    /**
259
     * Check if the given page should be crawled
260
     *
261
     * @param array $pageRow
262
     * @return false|string false if the page should be crawled (not excluded), true / skipMessage if it should be skipped
263
     */
264
    public function checkIfPageShouldBeSkipped(array $pageRow)
265
    {
266
        $skipPage = false;
267
        $skipMessage = 'Skipped'; // message will be overwritten later
268
269
        // if page is hidden
270
        if (!$this->extensionSettings['crawlHiddenPages']) {
271
            if ($pageRow['hidden']) {
272
                $skipPage = true;
273
                $skipMessage = 'Because page is hidden';
274
            }
275
        }
276
277
        if (!$skipPage) {
278
            if (GeneralUtility::inList('3,4', $pageRow['doktype']) || $pageRow['doktype'] >= 199) {
279
                $skipPage = true;
280
                $skipMessage = 'Because doktype is not allowed';
281
            }
282
        }
283
284
        if (!$skipPage) {
285
            if (is_array($GLOBALS['TYPO3_CONF_VARS']['EXTCONF']['crawler']['excludeDoktype'])) {
286
                foreach ($GLOBALS['TYPO3_CONF_VARS']['EXTCONF']['crawler']['excludeDoktype'] as $key => $doktypeList) {
287
                    if (GeneralUtility::inList($doktypeList, $pageRow['doktype'])) {
288
                        $skipPage = true;
289
                        $skipMessage = 'Doktype was excluded by "' . $key . '"';
290
                        break;
291
                    }
292
                }
293
            }
294
        }
295
296
        if (!$skipPage) {
297
            // veto hook
298
            if (is_array($GLOBALS['TYPO3_CONF_VARS']['EXTCONF']['crawler']['pageVeto'])) {
299
                foreach ($GLOBALS['TYPO3_CONF_VARS']['EXTCONF']['crawler']['pageVeto'] as $key => $func) {
300
                    $params = [
301
                        'pageRow' => $pageRow
302
                    ];
303
                    // expects "false" if page is ok and "true" or a skipMessage if this page should _not_ be crawled
304
                    $veto = GeneralUtility::callUserFunction($func, $params, $this);
305
                    if ($veto !== false) {
306
                        $skipPage = true;
307
                        if (is_string($veto)) {
308
                            $skipMessage = $veto;
309
                        } else {
310
                            $skipMessage = 'Veto from hook "' . htmlspecialchars($key) . '"';
311
                        }
312
                        // no need to execute other hooks if a previous one return a veto
313
                        break;
314
                    }
315
                }
316
            }
317
        }
318
319
        return $skipPage ? $skipMessage : false;
320
    }
321
322
    /**
323
     * Wrapper method for getUrlsForPageId()
324
     * It returns an array of configurations and no urls!
325
     *
326
     * @param array $pageRow Page record with at least dok-type and uid columns.
327
     * @param string $skipMessage
328
     * @return array
329
     * @see getUrlsForPageId()
330
     */
331
    public function getUrlsForPageRow(array $pageRow, &$skipMessage = '')
332
    {
333
        $message = $this->checkIfPageShouldBeSkipped($pageRow);
334
335
        if ($message === false) {
336
            $forceSsl = ($pageRow['url_scheme'] === 2) ? true : false;
337
            $res = $this->getUrlsForPageId($pageRow['uid'], $forceSsl);
338
            $skipMessage = '';
339
        } else {
340
            $skipMessage = $message;
341
            $res = [];
342
        }
343
344
        return $res;
345
    }
346
347
    /**
348
     * This method is used to count if there are ANY unprocessed queue entries
349
     * of a given page_id and the configuration which matches a given hash.
350
     * If there if none, we can skip an inner detail check
351
     *
352
     * @param  int $uid
353
     * @param  string $configurationHash
354
     * @return boolean
355
     */
356
    protected function noUnprocessedQueueEntriesForPageWithConfigurationHashExist($uid, $configurationHash)
357
    {
358
        $configurationHash = $this->db->fullQuoteStr($configurationHash, 'tx_crawler_queue');
359
        $res = $this->db->exec_SELECTquery('count(*) as anz', 'tx_crawler_queue', "page_id=" . intval($uid) . " AND configuration_hash=" . $configurationHash . " AND exec_time=0");
360
        $row = $this->db->sql_fetch_assoc($res);
361
362
        return ($row['anz'] == 0);
363
    }
364
365
    /**
366
     * Creates a list of URLs from input array (and submits them to queue if asked for)
367
     * See Web > Info module script + "indexed_search"'s crawler hook-client using this!
368
     *
369
     * @param    array        Information about URLs from pageRow to crawl.
370
     * @param    array        Page row
371
     * @param    integer        Unix time to schedule indexing to, typically time()
372
     * @param    integer        Number of requests per minute (creates the interleave between requests)
373
     * @param    boolean        If set, submits the URLs to queue
374
     * @param    boolean        If set (and submitcrawlUrls is false) will fill $downloadUrls with entries)
375
     * @param    array        Array which is passed by reference and contains the an id per url to secure we will not crawl duplicates
376
     * @param    array        Array which will be filled with URLS for download if flag is set.
377
     * @param    array        Array of processing instructions
378
     * @return    string        List of URLs (meant for display in backend module)
379
     *
380
     */
381
    public function urlListFromUrlArray(
382
    array $vv,
383
    array $pageRow,
384
    $scheduledTime,
385
    $reqMinute,
386
    $submitCrawlUrls,
387
    $downloadCrawlUrls,
388
    array &$duplicateTrack,
389
    array &$downloadUrls,
390
    array $incomingProcInstructions
391
    ) {
392
393
        // realurl support (thanks to Ingo Renner)
394
        if (ExtensionManagementUtility::isLoaded('realurl') && $vv['subCfg']['realurl']) {
395
396
            /** @var tx_realurl $urlObj */
397
            $urlObj = GeneralUtility::makeInstance('tx_realurl');
398
399
            if (!empty($vv['subCfg']['baseUrl'])) {
400
                $urlParts = parse_url($vv['subCfg']['baseUrl']);
401
                $host = strtolower($urlParts['host']);
402
                $urlObj->host = $host;
403
404
                // First pass, finding configuration OR pointer string:
405
                $urlObj->extConf = isset($GLOBALS['TYPO3_CONF_VARS']['EXTCONF']['realurl'][$urlObj->host]) ? $GLOBALS['TYPO3_CONF_VARS']['EXTCONF']['realurl'][$urlObj->host] : $GLOBALS['TYPO3_CONF_VARS']['EXTCONF']['realurl']['_DEFAULT'];
406
407
                // If it turned out to be a string pointer, then look up the real config:
408
                if (is_string($urlObj->extConf)) {
409
                    $urlObj->extConf = is_array($GLOBALS['TYPO3_CONF_VARS']['EXTCONF']['realurl'][$urlObj->extConf]) ? $GLOBALS['TYPO3_CONF_VARS']['EXTCONF']['realurl'][$urlObj->extConf] : $GLOBALS['TYPO3_CONF_VARS']['EXTCONF']['realurl']['_DEFAULT'];
410
                }
411
            }
412
413
            if (!$GLOBALS['TSFE']->sys_page) {
414
                $GLOBALS['TSFE']->sys_page = GeneralUtility::makeInstance('TYPO3\CMS\Frontend\Page\PageRepository');
415
            }
416
            if (!$GLOBALS['TSFE']->csConvObj) {
417
                $GLOBALS['TSFE']->csConvObj = GeneralUtility::makeInstance('TYPO3\CMS\Core\Charset\CharsetConverter');
418
            }
419
            if (!$GLOBALS['TSFE']->tmpl->rootLine[0]['uid']) {
420
                $GLOBALS['TSFE']->tmpl->rootLine[0]['uid'] = $urlObj->extConf['pagePath']['rootpage_id'];
421
            }
422
        }
423
424
        if (is_array($vv['URLs'])) {
425
            $configurationHash = md5(serialize($vv));
426
            $skipInnerCheck = $this->noUnprocessedQueueEntriesForPageWithConfigurationHashExist($pageRow['uid'], $configurationHash);
427
428
            foreach ($vv['URLs'] as $urlQuery) {
429
                if ($this->drawURLs_PIfilter($vv['subCfg']['procInstrFilter'], $incomingProcInstructions)) {
430
431
                    // Calculate cHash:
432
                    if ($vv['subCfg']['cHash']) {
433
                        /* @var $cacheHash \TYPO3\CMS\Frontend\Page\CacheHashCalculator */
434
                        $cacheHash = GeneralUtility::makeInstance('TYPO3\CMS\Frontend\Page\CacheHashCalculator');
435
                        $urlQuery .= '&cHash=' . $cacheHash->generateForParameters($urlQuery);
436
                    }
437
438
                    // Create key by which to determine unique-ness:
439
                    $uKey = $urlQuery . '|' . $vv['subCfg']['userGroups'] . '|' . $vv['subCfg']['baseUrl'] . '|' . $vv['subCfg']['procInstrFilter'];
440
441
                    // realurl support (thanks to Ingo Renner)
442
                    $urlQuery = 'index.php' . $urlQuery;
443
                    if (ExtensionManagementUtility::isLoaded('realurl') && $vv['subCfg']['realurl']) {
444
                        $params = [
445
                            'LD' => [
446
                                'totalURL' => $urlQuery
447
                            ],
448
                            'TCEmainHook' => true
449
                        ];
450
                        $urlObj->encodeSpURL($params);
0 ignored issues
show
Bug introduced by
The variable $urlObj does not seem to be defined for all execution paths leading up to this point.

If you define a variable conditionally, it can happen that it is not defined for all execution paths.

Let’s take a look at an example:

function myFunction($a) {
    switch ($a) {
        case 'foo':
            $x = 1;
            break;

        case 'bar':
            $x = 2;
            break;
    }

    // $x is potentially undefined here.
    echo $x;
}

In the above example, the variable $x is defined if you pass “foo” or “bar” as argument for $a. However, since the switch statement has no default case statement, if you pass any other value, the variable $x would be undefined.

Available Fixes

  1. Check for existence of the variable explicitly:

    function myFunction($a) {
        switch ($a) {
            case 'foo':
                $x = 1;
                break;
    
            case 'bar':
                $x = 2;
                break;
        }
    
        if (isset($x)) { // Make sure it's always set.
            echo $x;
        }
    }
    
  2. Define a default value for the variable:

    function myFunction($a) {
        $x = ''; // Set a default which gets overridden for certain paths.
        switch ($a) {
            case 'foo':
                $x = 1;
                break;
    
            case 'bar':
                $x = 2;
                break;
        }
    
        echo $x;
    }
    
  3. Add a value for the missing path:

    function myFunction($a) {
        switch ($a) {
            case 'foo':
                $x = 1;
                break;
    
            case 'bar':
                $x = 2;
                break;
    
            // We add support for the missing case.
            default:
                $x = '';
                break;
        }
    
        echo $x;
    }
    
Loading history...
451
                        $urlQuery = $params['LD']['totalURL'];
452
                    }
453
454
                    // Scheduled time:
455
                    $schTime = $scheduledTime + round(count($duplicateTrack) * (60 / $reqMinute));
456
                    $schTime = floor($schTime / 60) * 60;
457
458
                    if (isset($duplicateTrack[$uKey])) {
459
460
                        //if the url key is registered just display it and do not resubmit is
461
                        $urlList = '<em><span class="typo3-dimmed">' . htmlspecialchars($urlQuery) . '</span></em><br/>';
462
                    } else {
463
                        $urlList = '[' . date('d.m.y H:i', $schTime) . '] ' . htmlspecialchars($urlQuery);
464
                        $this->urlList[] = '[' . date('d.m.y H:i', $schTime) . '] ' . $urlQuery;
465
466
                        $theUrl = ($vv['subCfg']['baseUrl'] ? $vv['subCfg']['baseUrl'] : GeneralUtility::getIndpEnv('TYPO3_SITE_URL')) . $urlQuery;
467
468
                        // Submit for crawling!
469
                        if ($submitCrawlUrls) {
470
                            $added = $this->addUrl(
471
                            $pageRow['uid'],
472
                            $theUrl,
473
                            $vv['subCfg'],
474
                            $scheduledTime,
475
                            $configurationHash,
476
                            $skipInnerCheck
477
                            );
478
                            if ($added === false) {
479
                                $urlList .= ' (Url already existed)';
480
                            }
481
                        } elseif ($downloadCrawlUrls) {
482
                            $downloadUrls[$theUrl] = $theUrl;
483
                        }
484
485
                        $urlList .= '<br />';
486
                    }
487
                    $duplicateTrack[$uKey] = true;
488
                }
489
            }
490
        } else {
491
            $urlList = 'ERROR - no URL generated';
492
        }
493
494
        return $urlList;
0 ignored issues
show
Bug introduced by
The variable $urlList does not seem to be defined for all execution paths leading up to this point.

If you define a variable conditionally, it can happen that it is not defined for all execution paths.

Let’s take a look at an example:

function myFunction($a) {
    switch ($a) {
        case 'foo':
            $x = 1;
            break;

        case 'bar':
            $x = 2;
            break;
    }

    // $x is potentially undefined here.
    echo $x;
}

In the above example, the variable $x is defined if you pass “foo” or “bar” as argument for $a. However, since the switch statement has no default case statement, if you pass any other value, the variable $x would be undefined.

Available Fixes

  1. Check for existence of the variable explicitly:

    function myFunction($a) {
        switch ($a) {
            case 'foo':
                $x = 1;
                break;
    
            case 'bar':
                $x = 2;
                break;
        }
    
        if (isset($x)) { // Make sure it's always set.
            echo $x;
        }
    }
    
  2. Define a default value for the variable:

    function myFunction($a) {
        $x = ''; // Set a default which gets overridden for certain paths.
        switch ($a) {
            case 'foo':
                $x = 1;
                break;
    
            case 'bar':
                $x = 2;
                break;
        }
    
        echo $x;
    }
    
  3. Add a value for the missing path:

    function myFunction($a) {
        switch ($a) {
            case 'foo':
                $x = 1;
                break;
    
            case 'bar':
                $x = 2;
                break;
    
            // We add support for the missing case.
            default:
                $x = '';
                break;
        }
    
        echo $x;
    }
    
Loading history...
495
    }
496
497
    /**
498
     * Returns true if input processing instruction is among registered ones.
499
     *
500
     * @param string $piString PI to test
501
     * @param array $incomingProcInstructions Processing instructions
502
     * @return boolean
503
     */
504
    public function drawURLs_PIfilter($piString, array $incomingProcInstructions)
505
    {
506
        if (empty($incomingProcInstructions)) {
507
            return true;
508
        }
509
510
        foreach ($incomingProcInstructions as $pi) {
511
            if (GeneralUtility::inList($piString, $pi)) {
512
                return true;
513
            }
514
        }
515
    }
516
517
    public function getPageTSconfigForId($id)
518
    {
519
        if (!$this->MP) {
520
            $pageTSconfig = BackendUtility::getPagesTSconfig($id);
521
        } else {
522
            list(, $mountPointId) = explode('-', $this->MP);
523
            $pageTSconfig = BackendUtility::getPagesTSconfig($mountPointId);
524
        }
525
526
        // Call a hook to alter configuration
527
        if (is_array($GLOBALS['TYPO3_CONF_VARS']['EXTCONF']['crawler']['getPageTSconfigForId'])) {
528
            $params = [
529
                'pageId' => $id,
530
                'pageTSConfig' => &$pageTSconfig
531
            ];
532
            foreach ($GLOBALS['TYPO3_CONF_VARS']['EXTCONF']['crawler']['getPageTSconfigForId'] as $userFunc) {
533
                GeneralUtility::callUserFunction($userFunc, $params, $this);
534
            }
535
        }
536
537
        return $pageTSconfig;
538
    }
539
540
    /**
541
     * This methods returns an array of configurations.
542
     * And no urls!
543
     *
544
     * @param integer $id Page ID
545
     * @param bool $forceSsl Use https
546
     * @return array
547
     */
548
    protected function getUrlsForPageId($id, $forceSsl = false)
549
    {
550
551
        /**
552
         * Get configuration from tsConfig
553
         */
554
555
        // Get page TSconfig for page ID:
556
        $pageTSconfig = $this->getPageTSconfigForId($id);
557
558
        $res = [];
559
560
        if (is_array($pageTSconfig) && is_array($pageTSconfig['tx_crawler.']['crawlerCfg.'])) {
561
            $crawlerCfg = $pageTSconfig['tx_crawler.']['crawlerCfg.'];
562
563
            if (is_array($crawlerCfg['paramSets.'])) {
564
                foreach ($crawlerCfg['paramSets.'] as $key => $values) {
565
                    if (!is_array($values)) {
566
567
                        // Sub configuration for a single configuration string:
568
                        $subCfg = (array)$crawlerCfg['paramSets.'][$key . '.'];
569
                        $subCfg['key'] = $key;
570
571
                        if (strcmp($subCfg['procInstrFilter'], '')) {
572
                            $subCfg['procInstrFilter'] = implode(',', GeneralUtility::trimExplode(',', $subCfg['procInstrFilter']));
573
                        }
574
                        $pidOnlyList = implode(',', GeneralUtility::trimExplode(',', $subCfg['pidsOnly'], 1));
0 ignored issues
show
Documentation introduced by
1 is of type integer, but the function expects a boolean.

It seems like the type of the argument is not accepted by the function/method which you are calling.

In some cases, in particular if PHP’s automatic type-juggling kicks in this might be fine. In other cases, however this might be a bug.

We suggest to add an explicit type cast like in the following example:

function acceptsInteger($int) { }

$x = '123'; // string "123"

// Instead of
acceptsInteger($x);

// we recommend to use
acceptsInteger((integer) $x);
Loading history...
575
576
                        // process configuration if it is not page-specific or if the specific page is the current page:
577
                        if (!strcmp($subCfg['pidsOnly'], '') || GeneralUtility::inList($pidOnlyList, $id)) {
578
579
                                // add trailing slash if not present
580
                            if (!empty($subCfg['baseUrl']) && substr($subCfg['baseUrl'], -1) != '/') {
581
                                $subCfg['baseUrl'] .= '/';
582
                            }
583
584
                            // Explode, process etc.:
585
                            $res[$key] = [];
586
                            $res[$key]['subCfg'] = $subCfg;
587
                            $res[$key]['paramParsed'] = $this->parseParams($values);
588
                            $res[$key]['paramExpanded'] = $this->expandParameters($res[$key]['paramParsed'], $id);
589
                            $res[$key]['origin'] = 'pagets';
590
591
                            // recognize MP value
592
                            if (!$this->MP) {
593
                                $res[$key]['URLs'] = $this->compileUrls($res[$key]['paramExpanded'], ['?id=' . $id]);
594
                            } else {
595
                                $res[$key]['URLs'] = $this->compileUrls($res[$key]['paramExpanded'], ['?id=' . $id . '&MP=' . $this->MP]);
596
                            }
597
                        }
598
                    }
599
                }
600
            }
601
        }
602
603
        /**
604
         * Get configuration from tx_crawler_configuration records
605
         */
606
607
        // get records along the rootline
608
        $rootLine = BackendUtility::BEgetRootLine($id);
609
610
        foreach ($rootLine as $page) {
611
            $configurationRecordsForCurrentPage = BackendUtility::getRecordsByField(
612
                'tx_crawler_configuration',
613
                'pid',
614
                intval($page['uid']),
615
                BackendUtility::BEenableFields('tx_crawler_configuration') . BackendUtility::deleteClause('tx_crawler_configuration')
616
            );
617
618
            if (is_array($configurationRecordsForCurrentPage)) {
619
                foreach ($configurationRecordsForCurrentPage as $configurationRecord) {
620
621
                        // check access to the configuration record
622
                    if (empty($configurationRecord['begroups']) || $GLOBALS['BE_USER']->isAdmin() || $this->hasGroupAccess($GLOBALS['BE_USER']->user['usergroup_cached_list'], $configurationRecord['begroups'])) {
623
                        $pidOnlyList = implode(',', GeneralUtility::trimExplode(',', $configurationRecord['pidsonly'], 1));
0 ignored issues
show
Documentation introduced by
1 is of type integer, but the function expects a boolean.

It seems like the type of the argument is not accepted by the function/method which you are calling.

In some cases, in particular if PHP’s automatic type-juggling kicks in this might be fine. In other cases, however this might be a bug.

We suggest to add an explicit type cast like in the following example:

function acceptsInteger($int) { }

$x = '123'; // string "123"

// Instead of
acceptsInteger($x);

// we recommend to use
acceptsInteger((integer) $x);
Loading history...
624
625
                        // process configuration if it is not page-specific or if the specific page is the current page:
626
                        if (!strcmp($configurationRecord['pidsonly'], '') || GeneralUtility::inList($pidOnlyList, $id)) {
627
                            $key = $configurationRecord['name'];
628
629
                            // don't overwrite previously defined paramSets
630
                            if (!isset($res[$key])) {
631
632
                                    /* @var $TSparserObject \TYPO3\CMS\Core\TypoScript\Parser\TypoScriptParser */
633
                                $TSparserObject = GeneralUtility::makeInstance('TYPO3\CMS\Core\TypoScript\Parser\TypoScriptParser');
634
                                $TSparserObject->parse($configurationRecord['processing_instruction_parameters_ts']);
635
636
                                $subCfg = [
637
                                    'procInstrFilter' => $configurationRecord['processing_instruction_filter'],
638
                                    'procInstrParams.' => $TSparserObject->setup,
639
                                    'baseUrl' => $this->getBaseUrlForConfigurationRecord(
640
                                        $configurationRecord['base_url'],
641
                                        $configurationRecord['sys_domain_base_url'],
642
                                        $forceSsl
643
                                    ),
644
                                    'realurl' => $configurationRecord['realurl'],
645
                                    'cHash' => $configurationRecord['chash'],
646
                                    'userGroups' => $configurationRecord['fegroups'],
647
                                    'exclude' => $configurationRecord['exclude'],
648
                                    'rootTemplatePid' => (int) $configurationRecord['root_template_pid'],
649
                                    'key' => $key,
650
                                ];
651
652
                                // add trailing slash if not present
653
                                if (!empty($subCfg['baseUrl']) && substr($subCfg['baseUrl'], -1) != '/') {
654
                                    $subCfg['baseUrl'] .= '/';
655
                                }
656
                                if (!in_array($id, $this->expandExcludeString($subCfg['exclude']))) {
657
                                    $res[$key] = [];
658
                                    $res[$key]['subCfg'] = $subCfg;
659
                                    $res[$key]['paramParsed'] = $this->parseParams($configurationRecord['configuration']);
660
                                    $res[$key]['paramExpanded'] = $this->expandParameters($res[$key]['paramParsed'], $id);
661
                                    $res[$key]['URLs'] = $this->compileUrls($res[$key]['paramExpanded'], ['?id=' . $id]);
662
                                    $res[$key]['origin'] = 'tx_crawler_configuration_' . $configurationRecord['uid'];
663
                                }
664
                            }
665
                        }
666
                    }
667
                }
668
            }
669
        }
670
671
        if (is_array($GLOBALS['TYPO3_CONF_VARS']['EXTCONF']['crawler']['processUrls'])) {
672
            foreach ($GLOBALS['TYPO3_CONF_VARS']['EXTCONF']['crawler']['processUrls'] as $func) {
673
                $params = [
674
                    'res' => &$res,
675
                ];
676
                GeneralUtility::callUserFunction($func, $params, $this);
677
            }
678
        }
679
680
        return $res;
681
    }
682
683
    /**
684
     * Checks if a domain record exist and returns the base-url based on the record. If not the given baseUrl string is used.
685
     *
686
     * @param string $baseUrl
687
     * @param integer $sysDomainUid
688
     * @param bool $ssl
689
     * @return string
690
     */
691
    protected function getBaseUrlForConfigurationRecord($baseUrl, $sysDomainUid, $ssl = false)
692
    {
693
        $sysDomainUid = intval($sysDomainUid);
694
        $urlScheme = ($ssl === false) ? 'http' : 'https';
695
696
        if ($sysDomainUid > 0) {
697
            $res = $this->db->exec_SELECTquery(
698
                '*',
699
                'sys_domain',
700
                'uid = ' . $sysDomainUid .
701
                BackendUtility::BEenableFields('sys_domain') .
702
                BackendUtility::deleteClause('sys_domain')
703
            );
704
            $row = $this->db->sql_fetch_assoc($res);
705
            if ($row['domainName'] != '') {
706
                return $urlScheme . '://' . $row['domainName'];
707
            }
708
        }
709
        return $baseUrl;
710
    }
711
712
    public function getConfigurationsForBranch($rootid, $depth)
713
    {
714
        $configurationsForBranch = [];
715
716
        $pageTSconfig = $this->getPageTSconfigForId($rootid);
717
        if (is_array($pageTSconfig) && is_array($pageTSconfig['tx_crawler.']['crawlerCfg.']) && is_array($pageTSconfig['tx_crawler.']['crawlerCfg.']['paramSets.'])) {
718
            $sets = $pageTSconfig['tx_crawler.']['crawlerCfg.']['paramSets.'];
719
            if (is_array($sets)) {
720
                foreach ($sets as $key => $value) {
721
                    if (!is_array($value)) {
722
                        continue;
723
                    }
724
                    $configurationsForBranch[] = substr($key, -1) == '.' ? substr($key, 0, -1) : $key;
725
                }
726
            }
727
        }
728
        $pids = [];
729
        $rootLine = BackendUtility::BEgetRootLine($rootid);
730
        foreach ($rootLine as $node) {
731
            $pids[] = $node['uid'];
732
        }
733
        /* @var PageTreeView $tree */
734
        $tree = GeneralUtility::makeInstance(PageTreeView::class);
735
        $perms_clause = $GLOBALS['BE_USER']->getPagePermsClause(1);
736
        $tree->init('AND ' . $perms_clause);
737
        $tree->getTree($rootid, $depth, '');
738
        foreach ($tree->tree as $node) {
739
            $pids[] = $node['row']['uid'];
740
        }
741
742
        $res = $this->db->exec_SELECTquery(
743
            '*',
744
            'tx_crawler_configuration',
745
            'pid IN (' . implode(',', $pids) . ') ' .
746
            BackendUtility::BEenableFields('tx_crawler_configuration') .
747
            BackendUtility::deleteClause('tx_crawler_configuration') . ' ' .
748
            BackendUtility::versioningPlaceholderClause('tx_crawler_configuration') . ' '
749
        );
750
751
        while ($row = $this->db->sql_fetch_assoc($res)) {
752
            $configurationsForBranch[] = $row['name'];
753
        }
754
        $this->db->sql_free_result($res);
755
        return $configurationsForBranch;
756
    }
757
758
    /**
759
     * Check if a user has access to an item
760
     * (e.g. get the group list of the current logged in user from $GLOBALS['TSFE']->gr_list)
761
     *
762
     * @see \TYPO3\CMS\Frontend\Page\PageRepository::getMultipleGroupsWhereClause()
763
     * @param  string $groupList    Comma-separated list of (fe_)group UIDs from a user
764
     * @param  string $accessList   Comma-separated list of (fe_)group UIDs of the item to access
765
     * @return bool                 TRUE if at least one of the users group UIDs is in the access list or the access list is empty
766
     */
767
    public function hasGroupAccess($groupList, $accessList)
768
    {
769
        if (empty($accessList)) {
770
            return true;
771
        }
772
        foreach (GeneralUtility::intExplode(',', $groupList) as $groupUid) {
773
            if (GeneralUtility::inList($accessList, $groupUid)) {
774
                return true;
775
            }
776
        }
777
        return false;
778
    }
779
780
    /**
781
     * Parse GET vars of input Query into array with key=>value pairs
782
     *
783
     * @param string $inputQuery Input query string
784
     * @return array
785
     */
786
    public function parseParams($inputQuery)
787
    {
788
        // Extract all GET parameters into an ARRAY:
789
        $paramKeyValues = [];
790
        $GETparams = explode('&', $inputQuery);
791
792
        foreach ($GETparams as $paramAndValue) {
793
            list($p, $v) = explode('=', $paramAndValue, 2);
794
            if (strlen($p)) {
795
                $paramKeyValues[rawurldecode($p)] = rawurldecode($v);
796
            }
797
        }
798
799
        return $paramKeyValues;
800
    }
801
802
    /**
803
     * Will expand the parameters configuration to individual values. This follows a certain syntax of the value of each parameter.
804
     * Syntax of values:
805
     * - Basically: If the value is wrapped in [...] it will be expanded according to the following syntax, otherwise the value is taken literally
806
     * - Configuration is splitted by "|" and the parts are processed individually and finally added together
807
     * - For each configuration part:
808
     *         - "[int]-[int]" = Integer range, will be expanded to all values in between, values included, starting from low to high (max. 1000). Example "1-34" or "-40--30"
809
     *         - "_TABLE:[TCA table name];[_PID:[optional page id, default is current page]];[_ENABLELANG:1]" = Look up of table records from PID, filtering out deleted records. Example "_TABLE:tt_content; _PID:123"
810
     *        _ENABLELANG:1 picks only original records without their language overlays
811
     *         - Default: Literal value
812
     *
813
     * @param array $paramArray Array with key (GET var name) and values (value of GET var which is configuration for expansion)
814
     * @param integer $pid Current page ID
815
     * @return array
816
     */
817
    public function expandParameters($paramArray, $pid)
818
    {
819
        global $TCA;
820
821
        // Traverse parameter names:
822
        foreach ($paramArray as $p => $v) {
823
            $v = trim($v);
824
825
            // If value is encapsulated in square brackets it means there are some ranges of values to find, otherwise the value is literal
826
            if (substr($v, 0, 1) === '[' && substr($v, -1) === ']') {
827
                // So, find the value inside brackets and reset the paramArray value as an array.
828
                $v = substr($v, 1, -1);
829
                $paramArray[$p] = [];
830
831
                // Explode parts and traverse them:
832
                $parts = explode('|', $v);
833
                foreach ($parts as $pV) {
834
835
                        // Look for integer range: (fx. 1-34 or -40--30 // reads minus 40 to minus 30)
836
                    if (preg_match('/^(-?[0-9]+)\s*-\s*(-?[0-9]+)$/', trim($pV), $reg)) {
837
838
                        // Swap if first is larger than last:
839
                        if ($reg[1] > $reg[2]) {
840
                            $temp = $reg[2];
841
                            $reg[2] = $reg[1];
842
                            $reg[1] = $temp;
843
                        }
844
845
                        // Traverse range, add values:
846
                        $runAwayBrake = 1000; // Limit to size of range!
847
                        for ($a = $reg[1]; $a <= $reg[2];$a++) {
848
                            $paramArray[$p][] = $a;
849
                            $runAwayBrake--;
850
                            if ($runAwayBrake <= 0) {
851
                                break;
852
                            }
853
                        }
854
                    } elseif (substr(trim($pV), 0, 7) == '_TABLE:') {
855
856
                        // Parse parameters:
857
                        $subparts = GeneralUtility::trimExplode(';', $pV);
858
                        $subpartParams = [];
859
                        foreach ($subparts as $spV) {
860
                            list($pKey, $pVal) = GeneralUtility::trimExplode(':', $spV);
861
                            $subpartParams[$pKey] = $pVal;
862
                        }
863
864
                        // Table exists:
865
                        if (isset($TCA[$subpartParams['_TABLE']])) {
866
                            $lookUpPid = isset($subpartParams['_PID']) ? intval($subpartParams['_PID']) : $pid;
867
                            $pidField = isset($subpartParams['_PIDFIELD']) ? trim($subpartParams['_PIDFIELD']) : 'pid';
868
                            $where = isset($subpartParams['_WHERE']) ? $subpartParams['_WHERE'] : '';
869
                            $addTable = isset($subpartParams['_ADDTABLE']) ? $subpartParams['_ADDTABLE'] : '';
870
871
                            $fieldName = $subpartParams['_FIELD'] ? $subpartParams['_FIELD'] : 'uid';
872
                            if ($fieldName === 'uid' || $TCA[$subpartParams['_TABLE']]['columns'][$fieldName]) {
873
                                $andWhereLanguage = '';
874
                                $transOrigPointerField = $TCA[$subpartParams['_TABLE']]['ctrl']['transOrigPointerField'];
875
876
                                if ($subpartParams['_ENABLELANG'] && $transOrigPointerField) {
877
                                    $andWhereLanguage = ' AND ' . $this->db->quoteStr($transOrigPointerField, $subpartParams['_TABLE']) . ' <= 0 ';
878
                                }
879
880
                                $where = $this->db->quoteStr($pidField, $subpartParams['_TABLE']) . '=' . intval($lookUpPid) . ' ' .
881
                                    $andWhereLanguage . $where;
882
883
                                $rows = $this->db->exec_SELECTgetRows(
884
                                    $fieldName,
885
                                    $subpartParams['_TABLE'] . $addTable,
886
                                    $where . BackendUtility::deleteClause($subpartParams['_TABLE']),
887
                                    '',
888
                                    '',
889
                                    '',
890
                                    $fieldName
891
                                );
892
893
                                if (is_array($rows)) {
894
                                    $paramArray[$p] = array_merge($paramArray[$p], array_keys($rows));
895
                                }
896
                            }
897
                        }
898
                    } else { // Just add value:
899
                        $paramArray[$p][] = $pV;
900
                    }
901
                    // Hook for processing own expandParameters place holder
902
                    if (is_array($GLOBALS['TYPO3_CONF_VARS']['SC_OPTIONS']['crawler/class.tx_crawler_lib.php']['expandParameters'])) {
903
                        $_params = [
904
                            'pObj' => &$this,
905
                            'paramArray' => &$paramArray,
906
                            'currentKey' => $p,
907
                            'currentValue' => $pV,
908
                            'pid' => $pid
909
                        ];
910
                        foreach ($GLOBALS['TYPO3_CONF_VARS']['SC_OPTIONS']['crawler/class.tx_crawler_lib.php']['expandParameters'] as $key => $_funcRef) {
911
                            GeneralUtility::callUserFunction($_funcRef, $_params, $this);
912
                        }
913
                    }
914
                }
915
916
                // Make unique set of values and sort array by key:
917
                $paramArray[$p] = array_unique($paramArray[$p]);
918
                ksort($paramArray);
919
            } else {
920
                // Set the literal value as only value in array:
921
                $paramArray[$p] = [$v];
922
            }
923
        }
924
925
        return $paramArray;
926
    }
927
928
    /**
929
     * Compiling URLs from parameter array (output of expandParameters())
930
     * The number of URLs will be the multiplication of the number of parameter values for each key
931
     *
932
     * @param array $paramArray Output of expandParameters(): Array with keys (GET var names) and for each an array of values
933
     * @param array $urls URLs accumulated in this array (for recursion)
934
     * @return array
935
     */
936
    public function compileUrls($paramArray, $urls = [])
937
    {
938
        if (count($paramArray) && is_array($urls)) {
939
            // shift first off stack:
940
            reset($paramArray);
941
            $varName = key($paramArray);
942
            $valueSet = array_shift($paramArray);
943
944
            // Traverse value set:
945
            $newUrls = [];
946
            foreach ($urls as $url) {
947
                foreach ($valueSet as $val) {
948
                    $newUrls[] = $url . (strcmp($val, '') ? '&' . rawurlencode($varName) . '=' . rawurlencode($val) : '');
949
950
                    if (count($newUrls) > MathUtility::forceIntegerInRange($this->extensionSettings['maxCompileUrls'], 1, 1000000000, 10000)) {
951
                        break;
952
                    }
953
                }
954
            }
955
            $urls = $newUrls;
956
            $urls = $this->compileUrls($paramArray, $urls);
957
        }
958
959
        return $urls;
960
    }
961
962
    /************************************
963
     *
964
     * Crawler log
965
     *
966
     ************************************/
967
968
    /**
969
     * Return array of records from crawler queue for input page ID
970
     *
971
     * @param integer $id Page ID for which to look up log entries.
972
     * @param string$filter Filter: "all" => all entries, "pending" => all that is not yet run, "finished" => all complete ones
973
     * @param boolean $doFlush If TRUE, then entries selected at DELETED(!) instead of selected!
974
     * @param boolean $doFullFlush
975
     * @param integer $itemsPerPage Limit the amount of entries per page default is 10
976
     * @return array
977
     */
978
    public function getLogEntriesForPageId($id, $filter = '', $doFlush = false, $doFullFlush = false, $itemsPerPage = 10)
979
    {
980
        // FIXME: Write Unit tests for Filters
981
        switch ($filter) {
982
            case 'pending':
983
                $addWhere = ' AND exec_time=0';
984
                break;
985
            case 'finished':
986
                $addWhere = ' AND exec_time>0';
987
                break;
988
            default:
989
                $addWhere = '';
990
                break;
991
        }
992
993
        // FIXME: Write unit test that ensures that the right records are deleted.
994
        if ($doFlush) {
995
            $this->flushQueue(($doFullFlush ? '1=1' : ('page_id=' . intval($id))) . $addWhere);
996
            return [];
997
        } else {
998
            return $this->db->exec_SELECTgetRows(
999
                '*',
1000
                'tx_crawler_queue',
1001
                'page_id=' . intval($id) . $addWhere,
1002
                '',
1003
                'scheduled DESC',
1004
                (intval($itemsPerPage) > 0 ? intval($itemsPerPage) : '')
1005
            );
1006
        }
1007
    }
1008
1009
    /**
1010
     * Return array of records from crawler queue for input set ID
1011
     *
1012
     * @param integer $set_id Set ID for which to look up log entries.
1013
     * @param string $filter Filter: "all" => all entries, "pending" => all that is not yet run, "finished" => all complete ones
1014
     * @param boolean $doFlush If TRUE, then entries selected at DELETED(!) instead of selected!
1015
     * @param integer $itemsPerPage Limit the amount of entires per page default is 10
1016
     * @return array
1017
     */
1018
    public function getLogEntriesForSetId($set_id, $filter = '', $doFlush = false, $doFullFlush = false, $itemsPerPage = 10)
1019
    {
1020
        // FIXME: Write Unit tests for Filters
1021
        switch ($filter) {
1022
            case 'pending':
1023
                $addWhere = ' AND exec_time=0';
1024
                break;
1025
            case 'finished':
1026
                $addWhere = ' AND exec_time>0';
1027
                break;
1028
            default:
1029
                $addWhere = '';
1030
                break;
1031
        }
1032
        // FIXME: Write unit test that ensures that the right records are deleted.
1033
        if ($doFlush) {
1034
            $this->flushQueue($doFullFlush ? '' : ('set_id=' . intval($set_id) . $addWhere));
1035
            return [];
1036
        } else {
1037
            return $this->db->exec_SELECTgetRows(
1038
                '*',
1039
                'tx_crawler_queue',
1040
                'set_id=' . intval($set_id) . $addWhere,
1041
                '',
1042
                'scheduled DESC',
1043
                (intval($itemsPerPage) > 0 ? intval($itemsPerPage) : '')
1044
            );
1045
        }
1046
    }
1047
1048
    /**
1049
     * Removes queue entires
1050
     *
1051
     * @param string $where SQL related filter for the entries which should be removed
1052
     * @return void
1053
     */
1054
    protected function flushQueue($where = '')
1055
    {
1056
        $realWhere = strlen($where) > 0 ? $where : '1=1';
1057
1058
        if (EventDispatcher::getInstance()->hasObserver('queueEntryFlush')) {
1059
            $groups = $this->db->exec_SELECTgetRows('DISTINCT set_id', 'tx_crawler_queue', $realWhere);
1060
            foreach ($groups as $group) {
0 ignored issues
show
Bug introduced by
The expression $groups of type null|array is not guaranteed to be traversable. How about adding an additional type check?

There are different options of fixing this problem.

  1. If you want to be on the safe side, you can add an additional type-check:

    $collection = json_decode($data, true);
    if ( ! is_array($collection)) {
        throw new \RuntimeException('$collection must be an array.');
    }
    
    foreach ($collection as $item) { /** ... */ }
    
  2. If you are sure that the expression is traversable, you might want to add a doc comment cast to improve IDE auto-completion and static analysis:

    /** @var array $collection */
    $collection = json_decode($data, true);
    
    foreach ($collection as $item) { /** .. */ }
    
  3. Mark the issue as a false-positive: Just hover the remove button, in the top-right corner of this issue for more options.

Loading history...
1061
                EventDispatcher::getInstance()->post('queueEntryFlush', $group['set_id'], $this->db->exec_SELECTgetRows('uid, set_id', 'tx_crawler_queue', $realWhere . ' AND set_id="' . $group['set_id'] . '"'));
1062
            }
1063
        }
1064
1065
        $this->db->exec_DELETEquery('tx_crawler_queue', $realWhere);
1066
    }
1067
1068
    /**
1069
     * Adding call back entries to log (called from hooks typically, see indexed search class "class.crawler.php"
1070
     *
1071
     * @param integer $setId Set ID
1072
     * @param array $params Parameters to pass to call back function
1073
     * @param string $callBack Call back object reference, eg. 'EXT:indexed_search/class.crawler.php:&tx_indexedsearch_crawler'
1074
     * @param integer $page_id Page ID to attach it to
1075
     * @param integer $schedule Time at which to activate
1076
     * @return void
1077
     */
1078
    public function addQueueEntry_callBack($setId, $params, $callBack, $page_id = 0, $schedule = 0)
1079
    {
1080
        if (!is_array($params)) {
1081
            $params = [];
1082
        }
1083
        $params['_CALLBACKOBJ'] = $callBack;
1084
1085
        // Compile value array:
1086
        $fieldArray = [
1087
            'page_id' => intval($page_id),
1088
            'parameters' => serialize($params),
1089
            'scheduled' => intval($schedule) ? intval($schedule) : $this->getCurrentTime(),
1090
            'exec_time' => 0,
1091
            'set_id' => intval($setId),
1092
            'result_data' => '',
1093
        ];
1094
1095
        $this->db->exec_INSERTquery('tx_crawler_queue', $fieldArray);
1096
    }
1097
1098
    /************************************
1099
     *
1100
     * URL setting
1101
     *
1102
     ************************************/
1103
1104
    /**
1105
     * Setting a URL for crawling:
1106
     *
1107
     * @param integer $id Page ID
1108
     * @param string $url Complete URL
1109
     * @param array $subCfg Sub configuration array (from TS config)
1110
     * @param integer $tstamp Scheduled-time
1111
     * @param string $configurationHash (optional) configuration hash
1112
     * @param bool $skipInnerDuplicationCheck (optional) skip inner duplication check
1113
     * @return bool
1114
     */
1115
    public function addUrl(
1116
        $id,
1117
        $url,
1118
        array $subCfg,
1119
        $tstamp,
1120
        $configurationHash = '',
1121
        $skipInnerDuplicationCheck = false
1122
    ) {
1123
        $urlAdded = false;
1124
1125
        // Creating parameters:
1126
        $parameters = [
1127
            'url' => $url
1128
        ];
1129
1130
        // fe user group simulation:
1131
        $uGs = implode(',', array_unique(GeneralUtility::intExplode(',', $subCfg['userGroups'], 1)));
0 ignored issues
show
Documentation introduced by
1 is of type integer, but the function expects a boolean.

It seems like the type of the argument is not accepted by the function/method which you are calling.

In some cases, in particular if PHP’s automatic type-juggling kicks in this might be fine. In other cases, however this might be a bug.

We suggest to add an explicit type cast like in the following example:

function acceptsInteger($int) { }

$x = '123'; // string "123"

// Instead of
acceptsInteger($x);

// we recommend to use
acceptsInteger((integer) $x);
Loading history...
1132
        if ($uGs) {
1133
            $parameters['feUserGroupList'] = $uGs;
1134
        }
1135
1136
        // Setting processing instructions
1137
        $parameters['procInstructions'] = GeneralUtility::trimExplode(',', $subCfg['procInstrFilter']);
1138
        if (is_array($subCfg['procInstrParams.'])) {
1139
            $parameters['procInstrParams'] = $subCfg['procInstrParams.'];
1140
        }
1141
1142
        // Possible TypoScript Template Parents
1143
        $parameters['rootTemplatePid'] = $subCfg['rootTemplatePid'];
1144
1145
        // Compile value array:
1146
        $parameters_serialized = serialize($parameters);
1147
        $fieldArray = [
1148
            'page_id' => intval($id),
1149
            'parameters' => $parameters_serialized,
1150
            'parameters_hash' => GeneralUtility::shortMD5($parameters_serialized),
1151
            'configuration_hash' => $configurationHash,
1152
            'scheduled' => $tstamp,
1153
            'exec_time' => 0,
1154
            'set_id' => intval($this->setID),
1155
            'result_data' => '',
1156
            'configuration' => $subCfg['key'],
1157
        ];
1158
1159
        if ($this->registerQueueEntriesInternallyOnly) {
0 ignored issues
show
Bug Best Practice introduced by
The expression $this->registerQueueEntriesInternallyOnly of type array is implicitly converted to a boolean; are you sure this is intended? If so, consider using ! empty($expr) instead to make it clear that you intend to check for an array without elements.

This check marks implicit conversions of arrays to boolean values in a comparison. While in PHP an empty array is considered to be equal (but not identical) to false, this is not always apparent.

Consider making the comparison explicit by using empty(..) or ! empty(...) instead.

Loading history...
1160
            //the entries will only be registered and not stored to the database
1161
            $this->queueEntries[] = $fieldArray;
1162
        } else {
1163
            if (!$skipInnerDuplicationCheck) {
1164
                // check if there is already an equal entry
1165
                $rows = $this->getDuplicateRowsIfExist($tstamp, $fieldArray);
1166
            }
1167
1168
            if (count($rows) == 0) {
1169
                $this->db->exec_INSERTquery('tx_crawler_queue', $fieldArray);
1170
                $uid = $this->db->sql_insert_id();
1171
                $rows[] = $uid;
0 ignored issues
show
Bug introduced by
The variable $rows does not seem to be defined for all execution paths leading up to this point.

If you define a variable conditionally, it can happen that it is not defined for all execution paths.

Let’s take a look at an example:

function myFunction($a) {
    switch ($a) {
        case 'foo':
            $x = 1;
            break;

        case 'bar':
            $x = 2;
            break;
    }

    // $x is potentially undefined here.
    echo $x;
}

In the above example, the variable $x is defined if you pass “foo” or “bar” as argument for $a. However, since the switch statement has no default case statement, if you pass any other value, the variable $x would be undefined.

Available Fixes

  1. Check for existence of the variable explicitly:

    function myFunction($a) {
        switch ($a) {
            case 'foo':
                $x = 1;
                break;
    
            case 'bar':
                $x = 2;
                break;
        }
    
        if (isset($x)) { // Make sure it's always set.
            echo $x;
        }
    }
    
  2. Define a default value for the variable:

    function myFunction($a) {
        $x = ''; // Set a default which gets overridden for certain paths.
        switch ($a) {
            case 'foo':
                $x = 1;
                break;
    
            case 'bar':
                $x = 2;
                break;
        }
    
        echo $x;
    }
    
  3. Add a value for the missing path:

    function myFunction($a) {
        switch ($a) {
            case 'foo':
                $x = 1;
                break;
    
            case 'bar':
                $x = 2;
                break;
    
            // We add support for the missing case.
            default:
                $x = '';
                break;
        }
    
        echo $x;
    }
    
Loading history...
1172
                $urlAdded = true;
1173
                EventDispatcher::getInstance()->post('urlAddedToQueue', $this->setID, ['uid' => $uid, 'fieldArray' => $fieldArray]);
1174
            } else {
1175
                EventDispatcher::getInstance()->post('duplicateUrlInQueue', $this->setID, ['rows' => $rows, 'fieldArray' => $fieldArray]);
1176
            }
1177
        }
1178
1179
        return $urlAdded;
1180
    }
1181
1182
    /**
1183
     * This method determines duplicates for a queue entry with the same parameters and this timestamp.
1184
     * If the timestamp is in the past, it will check if there is any unprocessed queue entry in the past.
1185
     * If the timestamp is in the future it will check, if the queued entry has exactly the same timestamp
1186
     *
1187
     * @param int $tstamp
1188
     * @param array $fieldArray
1189
     *
1190
     * @return array;
0 ignored issues
show
Documentation introduced by
The doc-type array; could not be parsed: Expected "|" or "end of type", but got ";" at position 5. (view supported doc-types)

This check marks PHPDoc comments that could not be parsed by our parser. To see which comment annotations we can parse, please refer to our documentation on supported doc-types.

Loading history...
1191
     */
1192
    protected function getDuplicateRowsIfExist($tstamp, $fieldArray)
1193
    {
1194
        $rows = [];
1195
1196
        $currentTime = $this->getCurrentTime();
1197
1198
        //if this entry is scheduled with "now"
1199
        if ($tstamp <= $currentTime) {
1200
            if ($this->extensionSettings['enableTimeslot']) {
1201
                $timeBegin = $currentTime - 100;
1202
                $timeEnd = $currentTime + 100;
1203
                $where = ' ((scheduled BETWEEN ' . $timeBegin . ' AND ' . $timeEnd . ' ) OR scheduled <= ' . $currentTime . ') ';
1204
            } else {
1205
                $where = 'scheduled <= ' . $currentTime;
1206
            }
1207
        } elseif ($tstamp > $currentTime) {
1208
            //entry with a timestamp in the future need to have the same schedule time
1209
            $where = 'scheduled = ' . $tstamp ;
1210
        }
1211
1212
        if (!empty($where)) {
1213
            $result = $this->db->exec_SELECTgetRows(
1214
                'qid',
1215
                'tx_crawler_queue',
1216
                $where .
1217
                ' AND NOT exec_time' .
1218
                ' AND NOT process_id ' .
1219
                ' AND page_id=' . intval($fieldArray['page_id']) .
1220
                ' AND parameters_hash = ' . $this->db->fullQuoteStr($fieldArray['parameters_hash'], 'tx_crawler_queue')
1221
            );
1222
1223
            if (is_array($result)) {
1224
                foreach ($result as $value) {
1225
                    $rows[] = $value['qid'];
1226
                }
1227
            }
1228
        }
1229
1230
        return $rows;
1231
    }
1232
1233
    /**
1234
     * Returns the current system time
1235
     *
1236
     * @return int
1237
     */
1238
    public function getCurrentTime()
1239
    {
1240
        return time();
1241
    }
1242
1243
    /************************************
1244
     *
1245
     * URL reading
1246
     *
1247
     ************************************/
1248
1249
    /**
1250
     * Read URL for single queue entry
1251
     *
1252
     * @param integer $queueId
1253
     * @param boolean $force If set, will process even if exec_time has been set!
1254
     * @return integer
1255
     */
1256
    public function readUrl($queueId, $force = false)
1257
    {
1258
        $ret = 0;
1259
        if ($this->debugMode) {
1260
            GeneralUtility::devlog('crawler-readurl start ' . microtime(true), __FUNCTION__);
1261
        }
1262
        // Get entry:
1263
        list($queueRec) = $this->db->exec_SELECTgetRows(
1264
            '*',
1265
            'tx_crawler_queue',
1266
            'qid=' . intval($queueId) . ($force ? '' : ' AND exec_time=0 AND process_scheduled > 0')
1267
        );
1268
1269
        if (!is_array($queueRec)) {
1270
            return;
1271
        }
1272
1273
        $parameters = unserialize($queueRec['parameters']);
1274
        if ($parameters['rootTemplatePid']) {
1275
            $this->initTSFE((int)$parameters['rootTemplatePid']);
1276
        } else {
1277
            GeneralUtility::sysLog(
1278
                'Page with (' . $queueRec['page_id'] . ') could not be crawled, please check your crawler configuration. Perhaps no Root Template Pid is set',
1279
                'crawler',
1280
                GeneralUtility::SYSLOG_SEVERITY_WARNING
1281
            );
1282
        }
1283
1284
        SignalSlotUtility::emitSignal(
1285
            __CLASS__,
1286
            SignalSlotUtility::SIGNNAL_QUEUEITEM_PREPROCESS,
1287
            [$queueId, &$queueRec]
1288
        );
1289
1290
        // Set exec_time to lock record:
1291
        $field_array = ['exec_time' => $this->getCurrentTime()];
1292
1293
        if (isset($this->processID)) {
1294
            //if mulitprocessing is used we need to store the id of the process which has handled this entry
1295
            $field_array['process_id_completed'] = $this->processID;
1296
        }
1297
        $this->db->exec_UPDATEquery('tx_crawler_queue', 'qid=' . intval($queueId), $field_array);
1298
1299
        $result = $this->readUrl_exec($queueRec);
1300
        $resultData = unserialize($result['content']);
1301
1302
        //atm there's no need to point to specific pollable extensions
1303
        if (is_array($GLOBALS['TYPO3_CONF_VARS']['EXTCONF']['crawler']['pollSuccess'])) {
1304
            foreach ($GLOBALS['TYPO3_CONF_VARS']['EXTCONF']['crawler']['pollSuccess'] as $pollable) {
1305
                // only check the success value if the instruction is runnig
1306
                // it is important to name the pollSuccess key same as the procInstructions key
1307
                if (is_array($resultData['parameters']['procInstructions']) && in_array(
1308
                    $pollable,
1309
                        $resultData['parameters']['procInstructions']
1310
                )
1311
                ) {
1312
                    if (!empty($resultData['success'][$pollable]) && $resultData['success'][$pollable]) {
1313
                        $ret |= self::CLI_STATUS_POLLABLE_PROCESSED;
1314
                    }
1315
                }
1316
            }
1317
        }
1318
1319
        // Set result in log which also denotes the end of the processing of this entry.
1320
        $field_array = ['result_data' => serialize($result)];
1321
1322
        SignalSlotUtility::emitSignal(
1323
            __CLASS__,
1324
            SignalSlotUtility::SIGNNAL_QUEUEITEM_POSTPROCESS,
1325
            [$queueId, &$field_array]
1326
        );
1327
1328
        $this->db->exec_UPDATEquery('tx_crawler_queue', 'qid=' . intval($queueId), $field_array);
1329
1330
        if ($this->debugMode) {
1331
            GeneralUtility::devlog('crawler-readurl stop ' . microtime(true), __FUNCTION__);
1332
        }
1333
1334
        return $ret;
1335
    }
1336
1337
    /**
1338
     * Read URL for not-yet-inserted log-entry
1339
     *
1340
     * @param integer $field_array Queue field array,
1341
     * @return string
1342
     */
1343
    public function readUrlFromArray($field_array)
1344
    {
1345
1346
            // Set exec_time to lock record:
1347
        $field_array['exec_time'] = $this->getCurrentTime();
1348
        $this->db->exec_INSERTquery('tx_crawler_queue', $field_array);
0 ignored issues
show
Documentation introduced by
$field_array is of type integer, but the function expects a array.

It seems like the type of the argument is not accepted by the function/method which you are calling.

In some cases, in particular if PHP’s automatic type-juggling kicks in this might be fine. In other cases, however this might be a bug.

We suggest to add an explicit type cast like in the following example:

function acceptsInteger($int) { }

$x = '123'; // string "123"

// Instead of
acceptsInteger($x);

// we recommend to use
acceptsInteger((integer) $x);
Loading history...
1349
        $queueId = $field_array['qid'] = $this->db->sql_insert_id();
1350
1351
        $result = $this->readUrl_exec($field_array);
0 ignored issues
show
Documentation introduced by
$field_array is of type integer, but the function expects a array.

It seems like the type of the argument is not accepted by the function/method which you are calling.

In some cases, in particular if PHP’s automatic type-juggling kicks in this might be fine. In other cases, however this might be a bug.

We suggest to add an explicit type cast like in the following example:

function acceptsInteger($int) { }

$x = '123'; // string "123"

// Instead of
acceptsInteger($x);

// we recommend to use
acceptsInteger((integer) $x);
Loading history...
1352
1353
        // Set result in log which also denotes the end of the processing of this entry.
1354
        $field_array = ['result_data' => serialize($result)];
1355
1356
        SignalSlotUtility::emitSignal(
1357
            __CLASS__,
1358
            SignalSlotUtility::SIGNNAL_QUEUEITEM_POSTPROCESS,
1359
            [$queueId, &$field_array]
1360
        );
1361
1362
        $this->db->exec_UPDATEquery('tx_crawler_queue', 'qid=' . intval($queueId), $field_array);
1363
1364
        return $result;
1365
    }
1366
1367
    /**
1368
     * Read URL for a queue record
1369
     *
1370
     * @param array $queueRec Queue record
1371
     * @return string
1372
     */
1373
    public function readUrl_exec($queueRec)
1374
    {
1375
        // Decode parameters:
1376
        $parameters = unserialize($queueRec['parameters']);
1377
        $result = 'ERROR';
1378
        if (is_array($parameters)) {
1379
            if ($parameters['_CALLBACKOBJ']) { // Calling object:
1380
                $objRef = $parameters['_CALLBACKOBJ'];
1381
                $callBackObj = &GeneralUtility::getUserObj($objRef);
1382
                if (is_object($callBackObj)) {
1383
                    unset($parameters['_CALLBACKOBJ']);
1384
                    $result = ['content' => serialize($callBackObj->crawler_execute($parameters, $this))];
1385
                } else {
1386
                    $result = ['content' => 'No object: ' . $objRef];
1387
                }
1388
            } else { // Regular FE request:
1389
1390
                // Prepare:
1391
                $crawlerId = $queueRec['qid'] . ':' . md5($queueRec['qid'] . '|' . $queueRec['set_id'] . '|' . $GLOBALS['TYPO3_CONF_VARS']['SYS']['encryptionKey']);
1392
1393
                // Get result:
1394
                $result = $this->requestUrl($parameters['url'], $crawlerId);
1395
1396
                EventDispatcher::getInstance()->post('urlCrawled', $queueRec['set_id'], ['url' => $parameters['url'], 'result' => $result]);
1397
            }
1398
        }
1399
1400
        return $result;
1401
    }
1402
1403
    /**
1404
     * Gets the content of a URL.
1405
     *
1406
     * @param string $originalUrl URL to read
1407
     * @param string $crawlerId Crawler ID string (qid + hash to verify)
1408
     * @param integer $timeout Timeout time
1409
     * @param integer $recursion Recursion limiter for 302 redirects
1410
     * @return array
1411
     */
1412
    public function requestUrl($originalUrl, $crawlerId, $timeout = 2, $recursion = 10)
1413
    {
1414
        if (!$recursion) {
1415
            return false;
1416
        }
1417
1418
        // Parse URL, checking for scheme:
1419
        $url = parse_url($originalUrl);
1420
1421
        if ($url === false) {
1422
            if (TYPO3_DLOG) {
1423
                GeneralUtility::devLog(sprintf('Could not parse_url() for string "%s"', $url), 'crawler', 4, ['crawlerId' => $crawlerId]);
1424
            }
1425
            return false;
1426
        }
1427
1428
        if (!in_array($url['scheme'], ['','http','https'])) {
1429
            if (TYPO3_DLOG) {
1430
                GeneralUtility::devLog(sprintf('Scheme does not match for url "%s"', $url), 'crawler', 4, ['crawlerId' => $crawlerId]);
1431
            }
1432
            return false;
1433
        }
1434
1435
        // direct request
1436
        if ($this->extensionSettings['makeDirectRequests']) {
1437
            $result = $this->sendDirectRequest($originalUrl, $crawlerId);
1438
            return $result;
1439
        }
1440
1441
        $reqHeaders = $this->buildRequestHeaderArray($url, $crawlerId);
1442
1443
        // thanks to Pierrick Caillon for adding proxy support
1444
        $rurl = $url;
1445
1446
        if ($GLOBALS['TYPO3_CONF_VARS']['SYS']['curlUse'] && $GLOBALS['TYPO3_CONF_VARS']['SYS']['curlProxyServer']) {
1447
            $rurl = parse_url($GLOBALS['TYPO3_CONF_VARS']['SYS']['curlProxyServer']);
1448
            $url['path'] = $url['scheme'] . '://' . $url['host'] . ($url['port'] > 0 ? ':' . $url['port'] : '') . $url['path'];
1449
            $reqHeaders = $this->buildRequestHeaderArray($url, $crawlerId);
1450
        }
1451
1452
        $host = $rurl['host'];
1453
1454
        if ($url['scheme'] == 'https') {
1455
            $host = 'ssl://' . $host;
1456
            $port = ($rurl['port'] > 0) ? $rurl['port'] : 443;
1457
        } else {
1458
            $port = ($rurl['port'] > 0) ? $rurl['port'] : 80;
1459
        }
1460
1461
        $startTime = microtime(true);
1462
        $fp = fsockopen($host, $port, $errno, $errstr, $timeout);
1463
1464
        if (!$fp) {
1465
            if (TYPO3_DLOG) {
1466
                GeneralUtility::devLog(sprintf('Error while opening "%s"', $url), 'crawler', 4, ['crawlerId' => $crawlerId]);
1467
            }
1468
            return false;
1469
        } else {
1470
            // Request message:
1471
            $msg = implode("\r\n", $reqHeaders) . "\r\n\r\n";
1472
            fputs($fp, $msg);
1473
1474
            // Read response:
1475
            $d = $this->getHttpResponseFromStream($fp);
1476
            fclose($fp);
1477
1478
            $time = microtime(true) - $startTime;
1479
            $this->log($originalUrl . ' ' . $time);
1480
1481
            // Implode content and headers:
1482
            $result = [
1483
                'request' => $msg,
1484
                'headers' => implode('', $d['headers']),
1485
                'content' => implode('', (array)$d['content'])
1486
            ];
1487
1488
            if (($this->extensionSettings['follow30x']) && ($newUrl = $this->getRequestUrlFrom302Header($d['headers'], $url['user'], $url['pass']))) {
1489
                $result = array_merge(['parentRequest' => $result], $this->requestUrl($newUrl, $crawlerId, $recursion--));
1490
                $newRequestUrl = $this->requestUrl($newUrl, $crawlerId, $timeout, --$recursion);
1491
1492
                if (is_array($newRequestUrl)) {
1493
                    $result = array_merge(['parentRequest' => $result], $newRequestUrl);
1494
                } else {
1495
                    if (TYPO3_DLOG) {
1496
                        GeneralUtility::devLog(sprintf('Error while opening "%s"', $url), 'crawler', 4, ['crawlerId' => $crawlerId]);
1497
                    }
1498
                    return false;
1499
                }
1500
            }
1501
1502
            return $result;
1503
        }
1504
    }
1505
1506
    /**
1507
     * Gets the base path of the website frontend.
1508
     * (e.g. if you call http://mydomain.com/cms/index.php in
1509
     * the browser the base path is "/cms/")
1510
     *
1511
     * @return string Base path of the website frontend
1512
     */
1513
    protected function getFrontendBasePath()
1514
    {
1515
        $frontendBasePath = '/';
1516
1517
        // Get the path from the extension settings:
1518
        if (isset($this->extensionSettings['frontendBasePath']) && $this->extensionSettings['frontendBasePath']) {
1519
            $frontendBasePath = $this->extensionSettings['frontendBasePath'];
1520
            // If empty, try to use config.absRefPrefix:
1521
        } elseif (isset($GLOBALS['TSFE']->absRefPrefix) && !empty($GLOBALS['TSFE']->absRefPrefix)) {
1522
            $frontendBasePath = $GLOBALS['TSFE']->absRefPrefix;
1523
            // If not in CLI mode the base path can be determined from $_SERVER environment:
1524
        } elseif (!defined('TYPO3_REQUESTTYPE_CLI') || !TYPO3_REQUESTTYPE_CLI) {
1525
            $frontendBasePath = GeneralUtility::getIndpEnv('TYPO3_SITE_PATH');
1526
        }
1527
1528
        // Base path must be '/<pathSegements>/':
1529
        if ($frontendBasePath != '/') {
1530
            $frontendBasePath = '/' . ltrim($frontendBasePath, '/');
1531
            $frontendBasePath = rtrim($frontendBasePath, '/') . '/';
1532
        }
1533
1534
        return $frontendBasePath;
1535
    }
1536
1537
    /**
1538
     * Executes a shell command and returns the outputted result.
1539
     *
1540
     * @param string $command Shell command to be executed
1541
     * @return string Outputted result of the command execution
1542
     */
1543
    protected function executeShellCommand($command)
1544
    {
1545
        $result = shell_exec($command);
1546
        return $result;
1547
    }
1548
1549
    /**
1550
     * Reads HTTP response from the given stream.
1551
     *
1552
     * @param  resource $streamPointer  Pointer to connection stream.
1553
     * @return array                    Associative array with the following items:
1554
     *                                  headers <array> Response headers sent by server.
1555
     *                                  content <array> Content, with each line as an array item.
1556
     */
1557
    protected function getHttpResponseFromStream($streamPointer)
1558
    {
1559
        $response = ['headers' => [], 'content' => []];
1560
1561
        if (is_resource($streamPointer)) {
1562
            // read headers
1563
            while ($line = fgets($streamPointer, '2048')) {
1564
                $line = trim($line);
1565
                if ($line !== '') {
1566
                    $response['headers'][] = $line;
1567
                } else {
1568
                    break;
1569
                }
1570
            }
1571
1572
            // read content
1573
            while ($line = fgets($streamPointer, '2048')) {
1574
                $response['content'][] = $line;
1575
            }
1576
        }
1577
1578
        return $response;
1579
    }
1580
1581
    /**
1582
     * @param message
1583
     */
1584
    protected function log($message)
1585
    {
1586
        if (!empty($this->extensionSettings['logFileName'])) {
1587
            @file_put_contents($this->extensionSettings['logFileName'], date('Ymd His') . ' ' . $message . PHP_EOL, FILE_APPEND);
0 ignored issues
show
Security Best Practice introduced by
It seems like you do not handle an error condition here. This can introduce security issues, and is generally not recommended.

If you suppress an error, we recommend checking for the error condition explicitly:

// For example instead of
@mkdir($dir);

// Better use
if (@mkdir($dir) === false) {
    throw new \RuntimeException('The directory '.$dir.' could not be created.');
}
Loading history...
1588
        }
1589
    }
1590
1591
    /**
1592
     * Builds HTTP request headers.
1593
     *
1594
     * @param array $url
1595
     * @param string $crawlerId
1596
     *
1597
     * @return array
1598
     */
1599
    protected function buildRequestHeaderArray(array $url, $crawlerId)
1600
    {
1601
        $reqHeaders = [];
1602
        $reqHeaders[] = 'GET ' . $url['path'] . ($url['query'] ? '?' . $url['query'] : '') . ' HTTP/1.0';
1603
        $reqHeaders[] = 'Host: ' . $url['host'];
1604
        if (stristr($url['query'], 'ADMCMD_previewWS')) {
1605
            $reqHeaders[] = 'Cookie: $Version="1"; be_typo_user="1"; $Path=/';
1606
        }
1607
        $reqHeaders[] = 'Connection: close';
1608
        if ($url['user'] != '') {
1609
            $reqHeaders[] = 'Authorization: Basic ' . base64_encode($url['user'] . ':' . $url['pass']);
1610
        }
1611
        $reqHeaders[] = 'X-T3crawler: ' . $crawlerId;
1612
        $reqHeaders[] = 'User-Agent: TYPO3 crawler';
1613
        return $reqHeaders;
1614
    }
1615
1616
    /**
1617
     * Check if the submitted HTTP-Header contains a redirect location and built new crawler-url
1618
     *
1619
     * @param array $headers HTTP Header
1620
     * @param string $user HTTP Auth. User
1621
     * @param string $pass HTTP Auth. Password
1622
     * @return string
1623
     */
1624
    protected function getRequestUrlFrom302Header($headers, $user = '', $pass = '')
1625
    {
1626
        if (!is_array($headers)) {
1627
            return false;
1628
        }
1629
        if (!(stristr($headers[0], '301 Moved') || stristr($headers[0], '302 Found') || stristr($headers[0], '302 Moved'))) {
1630
            return false;
1631
        }
1632
1633
        foreach ($headers as $hl) {
1634
            $tmp = explode(": ", $hl);
1635
            $header[trim($tmp[0])] = trim($tmp[1]);
0 ignored issues
show
Coding Style Comprehensibility introduced by
$header was never initialized. Although not strictly required by PHP, it is generally a good practice to add $header = array(); before regardless.

Adding an explicit array definition is generally preferable to implicit array definition as it guarantees a stable state of the code.

Let’s take a look at an example:

foreach ($collection as $item) {
    $myArray['foo'] = $item->getFoo();

    if ($item->hasBar()) {
        $myArray['bar'] = $item->getBar();
    }

    // do something with $myArray
}

As you can see in this example, the array $myArray is initialized the first time when the foreach loop is entered. You can also see that the value of the bar key is only written conditionally; thus, its value might result from a previous iteration.

This might or might not be intended. To make your intention clear, your code more readible and to avoid accidental bugs, we recommend to add an explicit initialization $myArray = array() either outside or inside the foreach loop.

Loading history...
1636
            if (trim($tmp[0]) == 'Location') {
1637
                break;
1638
            }
1639
        }
1640
        if (!array_key_exists('Location', $header)) {
0 ignored issues
show
Bug introduced by
The variable $header does not seem to be defined for all execution paths leading up to this point.

If you define a variable conditionally, it can happen that it is not defined for all execution paths.

Let’s take a look at an example:

function myFunction($a) {
    switch ($a) {
        case 'foo':
            $x = 1;
            break;

        case 'bar':
            $x = 2;
            break;
    }

    // $x is potentially undefined here.
    echo $x;
}

In the above example, the variable $x is defined if you pass “foo” or “bar” as argument for $a. However, since the switch statement has no default case statement, if you pass any other value, the variable $x would be undefined.

Available Fixes

  1. Check for existence of the variable explicitly:

    function myFunction($a) {
        switch ($a) {
            case 'foo':
                $x = 1;
                break;
    
            case 'bar':
                $x = 2;
                break;
        }
    
        if (isset($x)) { // Make sure it's always set.
            echo $x;
        }
    }
    
  2. Define a default value for the variable:

    function myFunction($a) {
        $x = ''; // Set a default which gets overridden for certain paths.
        switch ($a) {
            case 'foo':
                $x = 1;
                break;
    
            case 'bar':
                $x = 2;
                break;
        }
    
        echo $x;
    }
    
  3. Add a value for the missing path:

    function myFunction($a) {
        switch ($a) {
            case 'foo':
                $x = 1;
                break;
    
            case 'bar':
                $x = 2;
                break;
    
            // We add support for the missing case.
            default:
                $x = '';
                break;
        }
    
        echo $x;
    }
    
Loading history...
1641
            return false;
1642
        }
1643
1644
        if ($user != '') {
1645
            if (!($tmp = parse_url($header['Location']))) {
1646
                return false;
1647
            }
1648
            $newUrl = $tmp['scheme'] . '://' . $user . ':' . $pass . '@' . $tmp['host'] . $tmp['path'];
1649
            if ($tmp['query'] != '') {
1650
                $newUrl .= '?' . $tmp['query'];
1651
            }
1652
        } else {
1653
            $newUrl = $header['Location'];
1654
        }
1655
        return $newUrl;
1656
    }
1657
1658
    /**************************
1659
     *
1660
     * tslib_fe hooks:
1661
     *
1662
     **************************/
1663
1664
    /**
1665
     * Initialization hook (called after database connection)
1666
     * Takes the "HTTP_X_T3CRAWLER" header and looks up queue record and verifies if the session comes from the system (by comparing hashes)
1667
     *
1668
     * @param array $params Parameters from frontend
1669
     * @param object $ref TSFE object (reference under PHP5)
1670
     * @return void
1671
     */
1672
    public function fe_init(&$params, $ref)
0 ignored issues
show
Unused Code introduced by
The parameter $ref is not used and could be removed.

This check looks from parameters that have been defined for a function or method, but which are not used in the method body.

Loading history...
1673
    {
1674
1675
            // Authenticate crawler request:
1676
        if (isset($_SERVER['HTTP_X_T3CRAWLER'])) {
1677
            list($queueId, $hash) = explode(':', $_SERVER['HTTP_X_T3CRAWLER']);
1678
            list($queueRec) = $this->db->exec_SELECTgetRows('*', 'tx_crawler_queue', 'qid=' . intval($queueId));
1679
1680
            // If a crawler record was found and hash was matching, set it up:
1681
            if (is_array($queueRec) && $hash === md5($queueRec['qid'] . '|' . $queueRec['set_id'] . '|' . $GLOBALS['TYPO3_CONF_VARS']['SYS']['encryptionKey'])) {
1682
                $params['pObj']->applicationData['tx_crawler']['running'] = true;
1683
                $params['pObj']->applicationData['tx_crawler']['parameters'] = unserialize($queueRec['parameters']);
1684
                $params['pObj']->applicationData['tx_crawler']['log'] = [];
1685
            } else {
1686
                die('No crawler entry found!');
1687
            }
1688
        }
1689
    }
1690
1691
    /*****************************
1692
     *
1693
     * Compiling URLs to crawl - tools
1694
     *
1695
     *****************************/
1696
1697
    /**
1698
     * @param integer $id Root page id to start from.
1699
     * @param integer $depth Depth of tree, 0=only id-page, 1= on sublevel, 99 = infinite
1700
     * @param integer $scheduledTime Unix Time when the URL is timed to be visited when put in queue
1701
     * @param integer $reqMinute Number of requests per minute (creates the interleave between requests)
1702
     * @param boolean $submitCrawlUrls If set, submits the URLs to queue in database (real crawling)
1703
     * @param boolean $downloadCrawlUrls If set (and submitcrawlUrls is false) will fill $downloadUrls with entries)
1704
     * @param array $incomingProcInstructions Array of processing instructions
1705
     * @param array $configurationSelection Array of configuration keys
1706
     * @return string
1707
     */
1708
    public function getPageTreeAndUrls(
1709
        $id,
1710
        $depth,
1711
        $scheduledTime,
1712
        $reqMinute,
1713
        $submitCrawlUrls,
1714
        $downloadCrawlUrls,
1715
        array $incomingProcInstructions,
1716
        array $configurationSelection
1717
    ) {
1718
        global $BACK_PATH;
1719
        global $LANG;
1720
        if (!is_object($LANG)) {
1721
            $LANG = GeneralUtility::makeInstance('language');
1722
            $LANG->init(0);
1723
        }
1724
        $this->scheduledTime = $scheduledTime;
0 ignored issues
show
Bug introduced by
The property scheduledTime does not exist. Did you maybe forget to declare it?

In PHP it is possible to write to properties without declaring them. For example, the following is perfectly valid PHP code:

class MyClass { }

$x = new MyClass();
$x->foo = true;

Generally, it is a good practice to explictly declare properties to avoid accidental typos and provide IDE auto-completion:

class MyClass {
    public $foo;
}

$x = new MyClass();
$x->foo = true;
Loading history...
1725
        $this->reqMinute = $reqMinute;
0 ignored issues
show
Bug introduced by
The property reqMinute does not exist. Did you maybe forget to declare it?

In PHP it is possible to write to properties without declaring them. For example, the following is perfectly valid PHP code:

class MyClass { }

$x = new MyClass();
$x->foo = true;

Generally, it is a good practice to explictly declare properties to avoid accidental typos and provide IDE auto-completion:

class MyClass {
    public $foo;
}

$x = new MyClass();
$x->foo = true;
Loading history...
1726
        $this->submitCrawlUrls = $submitCrawlUrls;
0 ignored issues
show
Bug introduced by
The property submitCrawlUrls does not exist. Did you maybe forget to declare it?

In PHP it is possible to write to properties without declaring them. For example, the following is perfectly valid PHP code:

class MyClass { }

$x = new MyClass();
$x->foo = true;

Generally, it is a good practice to explictly declare properties to avoid accidental typos and provide IDE auto-completion:

class MyClass {
    public $foo;
}

$x = new MyClass();
$x->foo = true;
Loading history...
1727
        $this->downloadCrawlUrls = $downloadCrawlUrls;
0 ignored issues
show
Bug introduced by
The property downloadCrawlUrls does not exist. Did you maybe forget to declare it?

In PHP it is possible to write to properties without declaring them. For example, the following is perfectly valid PHP code:

class MyClass { }

$x = new MyClass();
$x->foo = true;

Generally, it is a good practice to explictly declare properties to avoid accidental typos and provide IDE auto-completion:

class MyClass {
    public $foo;
}

$x = new MyClass();
$x->foo = true;
Loading history...
1728
        $this->incomingProcInstructions = $incomingProcInstructions;
1729
        $this->incomingConfigurationSelection = $configurationSelection;
1730
1731
        $this->duplicateTrack = [];
1732
        $this->downloadUrls = [];
1733
1734
        // Drawing tree:
1735
        /* @var PageTreeView $tree */
1736
        $tree = GeneralUtility::makeInstance(PageTreeView::class);
1737
        $perms_clause = $GLOBALS['BE_USER']->getPagePermsClause(1);
1738
        $tree->init('AND ' . $perms_clause);
1739
1740
        $pageinfo = BackendUtility::readPageAccess($id, $perms_clause);
1741
1742
        // Set root row:
1743
        $tree->tree[] = [
1744
            'row' => $pageinfo,
1745
            'HTML' => IconUtility::getIconForRecord('pages', $pageinfo)
0 ignored issues
show
Security Bug introduced by
It seems like $pageinfo defined by \TYPO3\CMS\Backend\Utili...ess($id, $perms_clause) on line 1740 can also be of type false; however, AOE\Crawler\Utility\Icon...ity::getIconForRecord() does only seem to accept array, did you maybe forget to handle an error condition?

This check looks for type mismatches where the missing type is false. This is usually indicative of an error condtion.

Consider the follow example

<?php

function getDate($date)
{
    if ($date !== null) {
        return new DateTime($date);
    }

    return false;
}

This function either returns a new DateTime object or false, if there was an error. This is a typical pattern in PHP programming to show that an error has occurred without raising an exception. The calling code should check for this returned false before passing on the value to another function or method that may not be able to handle a false.

Loading history...
1746
        ];
1747
1748
        // Get branch beneath:
1749
        if ($depth) {
1750
            $tree->getTree($id, $depth, '');
1751
        }
1752
1753
        // Traverse page tree:
1754
        $code = '';
1755
1756
        foreach ($tree->tree as $data) {
1757
            $this->MP = false;
1758
1759
            // recognize mount points
1760
            if ($data['row']['doktype'] == 7) {
1761
                $mountpage = $this->db->exec_SELECTgetRows('*', 'pages', 'uid = ' . $data['row']['uid']);
1762
1763
                // fetch mounted pages
1764
                $this->MP = $mountpage[0]['mount_pid'] . '-' . $data['row']['uid'];
0 ignored issues
show
Documentation Bug introduced by
The property $MP was declared of type boolean, but $mountpage[0]['mount_pid...' . $data['row']['uid'] is of type string. Maybe add a type cast?

This check looks for assignments to scalar types that may be of the wrong type.

To ensure the code behaves as expected, it may be a good idea to add an explicit type cast.

$answer = 42;

$correct = false;

$correct = (bool) $answer;
Loading history...
1765
1766
                $mountTree = GeneralUtility::makeInstance('TYPO3\CMS\Backend\Tree\View\PageTreeView');
1767
                $mountTree->init('AND ' . $perms_clause);
1768
                $mountTree->getTree($mountpage[0]['mount_pid'], $depth, '');
1769
1770
                foreach ($mountTree->tree as $mountData) {
1771
                    $code .= $this->drawURLs_addRowsForPage(
1772
                        $mountData['row'],
1773
                        $mountData['HTML'] . BackendUtility::getRecordTitle('pages', $mountData['row'], true)
1774
                    );
1775
                }
1776
1777
                // replace page when mount_pid_ol is enabled
1778
                if ($mountpage[0]['mount_pid_ol']) {
1779
                    $data['row']['uid'] = $mountpage[0]['mount_pid'];
1780
                } else {
1781
                    // if the mount_pid_ol is not set the MP must not be used for the mountpoint page
1782
                    $this->MP = false;
1783
                }
1784
            }
1785
1786
            $code .= $this->drawURLs_addRowsForPage(
1787
                $data['row'],
0 ignored issues
show
Security Bug introduced by
It seems like $data['row'] can also be of type false; however, AOE\Crawler\Controller\C...awURLs_addRowsForPage() does only seem to accept array, did you maybe forget to handle an error condition?
Loading history...
1788
                $data['HTML'] . BackendUtility::getRecordTitle('pages', $data['row'], true)
0 ignored issues
show
Security Bug introduced by
It seems like $data['row'] can also be of type false; however, TYPO3\CMS\Backend\Utilit...ility::getRecordTitle() does only seem to accept array, did you maybe forget to handle an error condition?
Loading history...
1789
            );
1790
        }
1791
1792
        return $code;
1793
    }
1794
1795
    /**
1796
     * Expands exclude string
1797
     *
1798
     * @param string $excludeString Exclude string
1799
     * @return array
1800
     */
1801
    public function expandExcludeString($excludeString)
1802
    {
1803
        // internal static caches;
1804
        static $expandedExcludeStringCache;
1805
        static $treeCache;
1806
1807
        if (empty($expandedExcludeStringCache[$excludeString])) {
1808
            $pidList = [];
1809
1810
            if (!empty($excludeString)) {
1811
                /* @var $tree \TYPO3\CMS\Backend\Tree\View\PageTreeView */
1812
                $tree = GeneralUtility::makeInstance('TYPO3\CMS\Backend\Tree\View\PageTreeView');
1813
                $tree->init('AND ' . $this->backendUser->getPagePermsClause(1));
1814
1815
                $excludeParts = GeneralUtility::trimExplode(',', $excludeString);
1816
1817
                foreach ($excludeParts as $excludePart) {
1818
                    list($pid, $depth) = GeneralUtility::trimExplode('+', $excludePart);
1819
1820
                    // default is "page only" = "depth=0"
1821
                    if (empty($depth)) {
1822
                        $depth = (stristr($excludePart, '+')) ? 99 : 0;
1823
                    }
1824
1825
                    $pidList[] = $pid;
1826
1827
                    if ($depth > 0) {
1828
                        if (empty($treeCache[$pid][$depth])) {
1829
                            $tree->reset();
1830
                            $tree->getTree($pid, $depth);
1831
                            $treeCache[$pid][$depth] = $tree->tree;
1832
                        }
1833
1834
                        foreach ($treeCache[$pid][$depth] as $data) {
1835
                            $pidList[] = $data['row']['uid'];
1836
                        }
1837
                    }
1838
                }
1839
            }
1840
1841
            $expandedExcludeStringCache[$excludeString] = array_unique($pidList);
1842
        }
1843
1844
        return $expandedExcludeStringCache[$excludeString];
1845
    }
1846
1847
    /**
1848
     * Create the rows for display of the page tree
1849
     * For each page a number of rows are shown displaying GET variable configuration
1850
     *
1851
     * @param    array        Page row
1852
     * @param    string        Page icon and title for row
1853
     * @return    string        HTML <tr> content (one or more)
1854
     */
1855
    public function drawURLs_addRowsForPage(array $pageRow, $pageTitleAndIcon)
1856
    {
1857
        $skipMessage = '';
1858
1859
        // Get list of configurations
1860
        $configurations = $this->getUrlsForPageRow($pageRow, $skipMessage);
1861
1862
        if (count($this->incomingConfigurationSelection) > 0) {
1863
            // remove configuration that does not match the current selection
1864
            foreach ($configurations as $confKey => $confArray) {
1865
                if (!in_array($confKey, $this->incomingConfigurationSelection)) {
1866
                    unset($configurations[$confKey]);
1867
                }
1868
            }
1869
        }
1870
1871
        // Traverse parameter combinations:
1872
        $c = 0;
1873
        $cc = 0;
0 ignored issues
show
Unused Code introduced by
$cc is not used, you could remove the assignment.

This check looks for variable assignements that are either overwritten by other assignments or where the variable is not used subsequently.

$myVar = 'Value';
$higher = false;

if (rand(1, 6) > 3) {
    $higher = true;
} else {
    $higher = false;
}

Both the $myVar assignment in line 1 and the $higher assignment in line 2 are dead. The first because $myVar is never used and the second because $higher is always overwritten for every possible time line.

Loading history...
1874
        $content = '';
1875
        if (count($configurations)) {
1876
            foreach ($configurations as $confKey => $confArray) {
1877
1878
                    // Title column:
1879
                if (!$c) {
1880
                    $titleClm = '<td rowspan="' . count($configurations) . '">' . $pageTitleAndIcon . '</td>';
1881
                } else {
1882
                    $titleClm = '';
1883
                }
1884
1885
                if (!in_array($pageRow['uid'], $this->expandExcludeString($confArray['subCfg']['exclude']))) {
1886
1887
                        // URL list:
1888
                    $urlList = $this->urlListFromUrlArray(
1889
                        $confArray,
1890
                        $pageRow,
1891
                        $this->scheduledTime,
1892
                        $this->reqMinute,
1893
                        $this->submitCrawlUrls,
1894
                        $this->downloadCrawlUrls,
1895
                        $this->duplicateTrack,
1896
                        $this->downloadUrls,
1897
                        $this->incomingProcInstructions // if empty the urls won't be filtered by processing instructions
1898
                    );
1899
1900
                    // Expanded parameters:
1901
                    $paramExpanded = '';
1902
                    $calcAccu = [];
1903
                    $calcRes = 1;
1904
                    foreach ($confArray['paramExpanded'] as $gVar => $gVal) {
1905
                        $paramExpanded .= '
1906
                            <tr>
1907
                                <td class="bgColor4-20">' . htmlspecialchars('&' . $gVar . '=') . '<br/>' .
1908
                                                '(' . count($gVal) . ')' .
1909
                                                '</td>
1910
                                <td class="bgColor4" nowrap="nowrap">' . nl2br(htmlspecialchars(implode(chr(10), $gVal))) . '</td>
1911
                            </tr>
1912
                        ';
1913
                        $calcRes *= count($gVal);
1914
                        $calcAccu[] = count($gVal);
1915
                    }
1916
                    $paramExpanded = '<table class="lrPadding c-list param-expanded">' . $paramExpanded . '</table>';
1917
                    $paramExpanded .= 'Comb: ' . implode('*', $calcAccu) . '=' . $calcRes;
1918
1919
                    // Options
1920
                    $optionValues = '';
1921
                    if ($confArray['subCfg']['userGroups']) {
1922
                        $optionValues .= 'User Groups: ' . $confArray['subCfg']['userGroups'] . '<br/>';
1923
                    }
1924
                    if ($confArray['subCfg']['baseUrl']) {
1925
                        $optionValues .= 'Base Url: ' . $confArray['subCfg']['baseUrl'] . '<br/>';
1926
                    }
1927
                    if ($confArray['subCfg']['procInstrFilter']) {
1928
                        $optionValues .= 'ProcInstr: ' . $confArray['subCfg']['procInstrFilter'] . '<br/>';
1929
                    }
1930
1931
                    // Compile row:
1932
                    $content .= '
1933
                        <tr class="bgColor' . ($c % 2 ? '-20' : '-10') . '">
1934
                            ' . $titleClm . '
1935
                            <td>' . htmlspecialchars($confKey) . '</td>
1936
                            <td>' . nl2br(htmlspecialchars(rawurldecode(trim(str_replace('&', chr(10) . '&', GeneralUtility::implodeArrayForUrl('', $confArray['paramParsed'])))))) . '</td>
1937
                            <td>' . $paramExpanded . '</td>
1938
                            <td nowrap="nowrap">' . $urlList . '</td>
1939
                            <td nowrap="nowrap">' . $optionValues . '</td>
1940
                            <td nowrap="nowrap">' . DebugUtility::viewArray($confArray['subCfg']['procInstrParams.']) . '</td>
1941
                        </tr>';
1942
                } else {
1943
                    $content .= '<tr class="bgColor' . ($c % 2 ? '-20' : '-10') . '">
1944
                            ' . $titleClm . '
1945
                            <td>' . htmlspecialchars($confKey) . '</td>
1946
                            <td colspan="5"><em>No entries</em> (Page is excluded in this configuration)</td>
1947
                        </tr>';
1948
                }
1949
1950
                $c++;
1951
            }
1952
        } else {
1953
            $message = !empty($skipMessage) ? ' (' . $skipMessage . ')' : '';
1954
1955
            // Compile row:
1956
            $content .= '
1957
                <tr class="bgColor-20" style="border-bottom: 1px solid black;">
1958
                    <td>' . $pageTitleAndIcon . '</td>
1959
                    <td colspan="6"><em>No entries</em>' . $message . '</td>
1960
                </tr>';
1961
        }
1962
1963
        return $content;
1964
    }
1965
1966
    /**
1967
     * @return int
1968
     */
1969
    public function getUnprocessedItemsCount()
1970
    {
1971
        $res = $this->db->exec_SELECTquery(
1972
            'count(*) as num',
1973
            'tx_crawler_queue',
1974
            'exec_time=0 AND process_scheduled=0 AND scheduled<=' . $this->getCurrentTime()
1975
        );
1976
1977
        $count = $this->db->sql_fetch_assoc($res);
1978
        return $count['num'];
1979
    }
1980
1981
    /*****************************
1982
     *
1983
     * CLI functions
1984
     *
1985
     *****************************/
1986
1987
    /**
1988
     * Main function for running from Command Line PHP script (cron job)
1989
     * See ext/crawler/cli/crawler_cli.phpsh for details
1990
     *
1991
     * @return int number of remaining items or false if error
1992
     */
1993
    public function CLI_main()
1994
    {
1995
        $this->setAccessMode('cli');
1996
        $result = self::CLI_STATUS_NOTHING_PROCCESSED;
1997
        $cliObj = GeneralUtility::makeInstance(CrawlerCommandLineController::class);
1998
1999
        if (isset($cliObj->cli_args['-h']) || isset($cliObj->cli_args['--help'])) {
2000
            $cliObj->cli_validateArgs();
2001
            $cliObj->cli_help();
2002
            exit;
2003
        }
2004
2005
        if (!$this->getDisabled() && $this->CLI_checkAndAcquireNewProcess($this->CLI_buildProcessId())) {
2006
            $countInARun = $cliObj->cli_argValue('--countInARun') ? intval($cliObj->cli_argValue('--countInARun')) : $this->extensionSettings['countInARun'];
2007
            // Seconds
2008
            $sleepAfterFinish = $cliObj->cli_argValue('--sleepAfterFinish') ? intval($cliObj->cli_argValue('--sleepAfterFinish')) : $this->extensionSettings['sleepAfterFinish'];
2009
            // Milliseconds
2010
            $sleepTime = $cliObj->cli_argValue('--sleepTime') ? intval($cliObj->cli_argValue('--sleepTime')) : $this->extensionSettings['sleepTime'];
2011
2012
            try {
2013
                // Run process:
2014
                $result = $this->CLI_run($countInARun, $sleepTime, $sleepAfterFinish);
2015
            } catch (Exception $e) {
0 ignored issues
show
Bug introduced by
The class AOE\Crawler\Controller\Exception does not exist. Did you forget a USE statement, or did you not list all dependencies?

Scrutinizer analyzes your composer.json/composer.lock file if available to determine the classes, and functions that are defined by your dependencies.

It seems like the listed class was neither found in your dependencies, nor was it found in the analyzed files in your repository. If you are using some other form of dependency management, you might want to disable this analysis.

Loading history...
2016
                $this->CLI_debug(get_class($e) . ': ' . $e->getMessage());
2017
                $result = self::CLI_STATUS_ABORTED;
2018
            }
2019
2020
            // Cleanup
2021
            $this->db->exec_DELETEquery('tx_crawler_process', 'assigned_items_count = 0');
2022
2023
            //TODO can't we do that in a clean way?
2024
            $releaseStatus = $this->CLI_releaseProcesses($this->CLI_buildProcessId());
0 ignored issues
show
Unused Code introduced by
$releaseStatus is not used, you could remove the assignment.

This check looks for variable assignements that are either overwritten by other assignments or where the variable is not used subsequently.

$myVar = 'Value';
$higher = false;

if (rand(1, 6) > 3) {
    $higher = true;
} else {
    $higher = false;
}

Both the $myVar assignment in line 1 and the $higher assignment in line 2 are dead. The first because $myVar is never used and the second because $higher is always overwritten for every possible time line.

Loading history...
2025
2026
            $this->CLI_debug("Unprocessed Items remaining:" . $this->getUnprocessedItemsCount() . " (" . $this->CLI_buildProcessId() . ")");
2027
            $result |= ($this->getUnprocessedItemsCount() > 0 ? self::CLI_STATUS_REMAIN : self::CLI_STATUS_NOTHING_PROCCESSED);
2028
        } else {
2029
            $result |= self::CLI_STATUS_ABORTED;
2030
        }
2031
2032
        return $result;
2033
    }
2034
2035
    /**
2036
     * Function executed by crawler_im.php cli script.
2037
     *
2038
     * @return void
2039
     */
2040
    public function CLI_main_im()
2041
    {
2042
        $this->setAccessMode('cli_im');
2043
2044
        $cliObj = GeneralUtility::makeInstance(QueueCommandLineController::class);
2045
2046
        // Force user to admin state and set workspace to "Live":
2047
        $this->backendUser->user['admin'] = 1;
2048
        $this->backendUser->setWorkspace(0);
2049
2050
        // Print help
2051
        if (!isset($cliObj->cli_args['_DEFAULT'][1])) {
2052
            $cliObj->cli_validateArgs();
2053
            $cliObj->cli_help();
2054
            exit;
2055
        }
2056
2057
        $cliObj->cli_validateArgs();
2058
2059
        if ($cliObj->cli_argValue('-o') === 'exec') {
2060
            $this->registerQueueEntriesInternallyOnly = true;
0 ignored issues
show
Documentation Bug introduced by
It seems like true of type boolean is incompatible with the declared type array of property $registerQueueEntriesInternallyOnly.

Our type inference engine has found an assignment to a property that is incompatible with the declared type of that property.

Either this assignment is in error or the assigned type should be added to the documentation/type hint for that property..

Loading history...
2061
        }
2062
2063
        if (isset($cliObj->cli_args['_DEFAULT'][2])) {
2064
            // Crawler is called over TYPO3 BE
2065
            $pageId = MathUtility::forceIntegerInRange($cliObj->cli_args['_DEFAULT'][2], 0);
2066
        } else {
2067
            // Crawler is called over cli
2068
            $pageId = MathUtility::forceIntegerInRange($cliObj->cli_args['_DEFAULT'][1], 0);
2069
        }
2070
2071
        $configurationKeys = $this->getConfigurationKeys($cliObj);
2072
2073
        if (!is_array($configurationKeys)) {
2074
            $configurations = $this->getUrlsForPageId($pageId);
2075
            if (is_array($configurations)) {
2076
                $configurationKeys = array_keys($configurations);
2077
            } else {
2078
                $configurationKeys = [];
2079
            }
2080
        }
2081
2082
        if ($cliObj->cli_argValue('-o') === 'queue' || $cliObj->cli_argValue('-o') === 'exec') {
2083
            $reason = new Reason();
2084
            $reason->setReason(Reason::REASON_GUI_SUBMIT);
2085
            $reason->setDetailText('The cli script of the crawler added to the queue');
2086
            EventDispatcher::getInstance()->post(
2087
                'invokeQueueChange',
2088
                $this->setID,
2089
                ['reason' => $reason]
2090
            );
2091
        }
2092
2093
        if ($this->extensionSettings['cleanUpOldQueueEntries']) {
2094
            $this->cleanUpOldQueueEntries();
2095
        }
2096
2097
        $this->setID = GeneralUtility::md5int(microtime());
0 ignored issues
show
Documentation Bug introduced by
It seems like \TYPO3\CMS\Core\Utility\...ty::md5int(microtime()) can also be of type double. However, the property $setID is declared as type integer. Maybe add an additional type check?

Our type inference engine has found a suspicous assignment of a value to a property. This check raises an issue when a value that can be of a mixed type is assigned to a property that is type hinted more strictly.

For example, imagine you have a variable $accountId that can either hold an Id object or false (if there is no account id yet). Your code now assigns that value to the id property of an instance of the Account class. This class holds a proper account, so the id value must no longer be false.

Either this assignment is in error or a type check should be added for that assignment.

class Id
{
    public $id;

    public function __construct($id)
    {
        $this->id = $id;
    }

}

class Account
{
    /** @var  Id $id */
    public $id;
}

$account_id = false;

if (starsAreRight()) {
    $account_id = new Id(42);
}

$account = new Account();
if ($account instanceof Id)
{
    $account->id = $account_id;
}
Loading history...
2098
        $this->getPageTreeAndUrls(
2099
            $pageId,
2100
            MathUtility::forceIntegerInRange($cliObj->cli_argValue('-d'), 0, 99),
2101
            $this->getCurrentTime(),
2102
            MathUtility::forceIntegerInRange($cliObj->cli_isArg('-n') ? $cliObj->cli_argValue('-n') : 30, 1, 1000),
2103
            $cliObj->cli_argValue('-o') === 'queue' || $cliObj->cli_argValue('-o') === 'exec',
2104
            $cliObj->cli_argValue('-o') === 'url',
2105
            GeneralUtility::trimExplode(',', $cliObj->cli_argValue('-proc'), 1),
0 ignored issues
show
Documentation introduced by
1 is of type integer, but the function expects a boolean.

It seems like the type of the argument is not accepted by the function/method which you are calling.

In some cases, in particular if PHP’s automatic type-juggling kicks in this might be fine. In other cases, however this might be a bug.

We suggest to add an explicit type cast like in the following example:

function acceptsInteger($int) { }

$x = '123'; // string "123"

// Instead of
acceptsInteger($x);

// we recommend to use
acceptsInteger((integer) $x);
Loading history...
2106
            $configurationKeys
2107
        );
2108
2109
        if ($cliObj->cli_argValue('-o') === 'url') {
2110
            $cliObj->cli_echo(implode(chr(10), $this->downloadUrls) . chr(10), 1);
0 ignored issues
show
Documentation introduced by
1 is of type integer, but the function expects a boolean.

It seems like the type of the argument is not accepted by the function/method which you are calling.

In some cases, in particular if PHP’s automatic type-juggling kicks in this might be fine. In other cases, however this might be a bug.

We suggest to add an explicit type cast like in the following example:

function acceptsInteger($int) { }

$x = '123'; // string "123"

// Instead of
acceptsInteger($x);

// we recommend to use
acceptsInteger((integer) $x);
Loading history...
2111
        } elseif ($cliObj->cli_argValue('-o') === 'exec') {
2112
            $cliObj->cli_echo("Executing " . count($this->urlList) . " requests right away:\n\n");
2113
            $cliObj->cli_echo(implode(chr(10), $this->urlList) . chr(10));
2114
            $cliObj->cli_echo("\nProcessing:\n");
2115
2116
            foreach ($this->queueEntries as $queueRec) {
2117
                $p = unserialize($queueRec['parameters']);
2118
                $cliObj->cli_echo($p['url'] . ' (' . implode(',', $p['procInstructions']) . ') => ');
2119
2120
                $result = $this->readUrlFromArray($queueRec);
2121
2122
                $requestResult = unserialize($result['content']);
2123
                if (is_array($requestResult)) {
2124
                    $resLog = is_array($requestResult['log']) ? chr(10) . chr(9) . chr(9) . implode(chr(10) . chr(9) . chr(9), $requestResult['log']) : '';
2125
                    $cliObj->cli_echo('OK: ' . $resLog . chr(10));
2126
                } else {
2127
                    $cliObj->cli_echo('Error checking Crawler Result: ' . substr(preg_replace('/\s+/', ' ', strip_tags($result['content'])), 0, 30000) . '...' . chr(10));
2128
                }
2129
            }
2130
        } elseif ($cliObj->cli_argValue('-o') === 'queue') {
2131
            $cliObj->cli_echo("Putting " . count($this->urlList) . " entries in queue:\n\n");
2132
            $cliObj->cli_echo(implode(chr(10), $this->urlList) . chr(10));
2133
        } else {
2134
            $cliObj->cli_echo(count($this->urlList) . " entries found for processing. (Use -o to decide action):\n\n", 1);
0 ignored issues
show
Documentation introduced by
1 is of type integer, but the function expects a boolean.

It seems like the type of the argument is not accepted by the function/method which you are calling.

In some cases, in particular if PHP’s automatic type-juggling kicks in this might be fine. In other cases, however this might be a bug.

We suggest to add an explicit type cast like in the following example:

function acceptsInteger($int) { }

$x = '123'; // string "123"

// Instead of
acceptsInteger($x);

// we recommend to use
acceptsInteger((integer) $x);
Loading history...
2135
            $cliObj->cli_echo(implode(chr(10), $this->urlList) . chr(10), 1);
0 ignored issues
show
Documentation introduced by
1 is of type integer, but the function expects a boolean.

It seems like the type of the argument is not accepted by the function/method which you are calling.

In some cases, in particular if PHP’s automatic type-juggling kicks in this might be fine. In other cases, however this might be a bug.

We suggest to add an explicit type cast like in the following example:

function acceptsInteger($int) { }

$x = '123'; // string "123"

// Instead of
acceptsInteger($x);

// we recommend to use
acceptsInteger((integer) $x);
Loading history...
2136
        }
2137
    }
2138
2139
    /**
2140
     * Function executed by crawler_im.php cli script.
2141
     *
2142
     * @return bool
2143
     */
2144
    public function CLI_main_flush()
2145
    {
2146
        $this->setAccessMode('cli_flush');
2147
        $cliObj = GeneralUtility::makeInstance(FlushCommandLineController::class);
2148
2149
        // Force user to admin state and set workspace to "Live":
2150
        $this->backendUser->user['admin'] = 1;
2151
        $this->backendUser->setWorkspace(0);
2152
2153
        // Print help
2154
        if (!isset($cliObj->cli_args['_DEFAULT'][1])) {
2155
            $cliObj->cli_validateArgs();
2156
            $cliObj->cli_help();
2157
            exit;
2158
        }
2159
2160
        $cliObj->cli_validateArgs();
2161
        $pageId = MathUtility::forceIntegerInRange($cliObj->cli_args['_DEFAULT'][1], 0);
2162
        $fullFlush = ($pageId == 0);
2163
2164
        $mode = $cliObj->cli_argValue('-o');
2165
2166
        switch ($mode) {
2167
            case 'all':
2168
                $result = $this->getLogEntriesForPageId($pageId, '', true, $fullFlush);
2169
                break;
2170
            case 'finished':
2171
            case 'pending':
2172
                $result = $this->getLogEntriesForPageId($pageId, $mode, true, $fullFlush);
2173
                break;
2174
            default:
2175
                $cliObj->cli_validateArgs();
2176
                $cliObj->cli_help();
2177
                $result = false;
2178
        }
2179
2180
        return $result !== false;
2181
    }
2182
2183
    /**
2184
     * Obtains configuration keys from the CLI arguments
2185
     *
2186
     * @param  QueueCommandLineController $cliObj    Command line object
2187
     * @return mixed                        Array of keys or null if no keys found
2188
     */
2189
    protected function getConfigurationKeys(QueueCommandLineController &$cliObj)
2190
    {
2191
        $parameter = trim($cliObj->cli_argValue('-conf'));
2192
        return ($parameter != '' ? GeneralUtility::trimExplode(',', $parameter) : []);
2193
    }
2194
2195
    /**
2196
     * Running the functionality of the CLI (crawling URLs from queue)
2197
     *
2198
     * @param int $countInARun
2199
     * @param int $sleepTime
2200
     * @param int $sleepAfterFinish
2201
     * @return string
2202
     */
2203
    public function CLI_run($countInARun, $sleepTime, $sleepAfterFinish)
2204
    {
2205
        $result = 0;
2206
        $counter = 0;
2207
2208
        // First, run hooks:
2209
        $this->CLI_runHooks();
2210
2211
        // Clean up the queue
2212
        if (intval($this->extensionSettings['purgeQueueDays']) > 0) {
2213
            $purgeDate = $this->getCurrentTime() - 24 * 60 * 60 * intval($this->extensionSettings['purgeQueueDays']);
2214
            $del = $this->db->exec_DELETEquery(
0 ignored issues
show
Unused Code introduced by
$del is not used, you could remove the assignment.

This check looks for variable assignements that are either overwritten by other assignments or where the variable is not used subsequently.

$myVar = 'Value';
$higher = false;

if (rand(1, 6) > 3) {
    $higher = true;
} else {
    $higher = false;
}

Both the $myVar assignment in line 1 and the $higher assignment in line 2 are dead. The first because $myVar is never used and the second because $higher is always overwritten for every possible time line.

Loading history...
2215
                'tx_crawler_queue',
2216
                'exec_time!=0 AND exec_time<' . $purgeDate
2217
            );
2218
        }
2219
2220
        // Select entries:
2221
        //TODO Shouldn't this reside within the transaction?
2222
        $rows = $this->db->exec_SELECTgetRows(
2223
            'qid,scheduled',
2224
            'tx_crawler_queue',
2225
            'exec_time=0
2226
                AND process_scheduled= 0
2227
                AND scheduled<=' . $this->getCurrentTime(),
2228
            '',
2229
            'scheduled, qid',
2230
        intval($countInARun)
2231
        );
2232
2233
        if (count($rows) > 0) {
2234
            $quidList = [];
2235
2236
            foreach ($rows as $r) {
0 ignored issues
show
Bug introduced by
The expression $rows of type null|array is not guaranteed to be traversable. How about adding an additional type check?

There are different options of fixing this problem.

  1. If you want to be on the safe side, you can add an additional type-check:

    $collection = json_decode($data, true);
    if ( ! is_array($collection)) {
        throw new \RuntimeException('$collection must be an array.');
    }
    
    foreach ($collection as $item) { /** ... */ }
    
  2. If you are sure that the expression is traversable, you might want to add a doc comment cast to improve IDE auto-completion and static analysis:

    /** @var array $collection */
    $collection = json_decode($data, true);
    
    foreach ($collection as $item) { /** .. */ }
    
  3. Mark the issue as a false-positive: Just hover the remove button, in the top-right corner of this issue for more options.

Loading history...
2237
                $quidList[] = $r['qid'];
2238
            }
2239
2240
            $processId = $this->CLI_buildProcessId();
2241
2242
            //reserve queue entries for process
2243
            $this->db->sql_query('BEGIN');
2244
            //TODO make sure we're not taking assigned queue-entires
2245
            $this->db->exec_UPDATEquery(
2246
                'tx_crawler_queue',
2247
                'qid IN (' . implode(',', $quidList) . ')',
2248
                [
2249
                    'process_scheduled' => intval($this->getCurrentTime()),
2250
                    'process_id' => $processId
2251
                ]
2252
            );
2253
2254
            //save the number of assigned queue entrys to determine who many have been processed later
2255
            $numberOfAffectedRows = $this->db->sql_affected_rows();
2256
            $this->db->exec_UPDATEquery(
2257
                'tx_crawler_process',
2258
                "process_id = '" . $processId . "'",
2259
                [
2260
                    'assigned_items_count' => intval($numberOfAffectedRows)
2261
                ]
2262
            );
2263
2264
            if ($numberOfAffectedRows == count($quidList)) {
2265
                $this->db->sql_query('COMMIT');
2266
            } else {
2267
                $this->db->sql_query('ROLLBACK');
2268
                $this->CLI_debug("Nothing processed due to multi-process collision (" . $this->CLI_buildProcessId() . ")");
2269
                return ($result | self::CLI_STATUS_ABORTED);
2270
            }
2271
2272
            foreach ($rows as $r) {
0 ignored issues
show
Bug introduced by
The expression $rows of type null|array is not guaranteed to be traversable. How about adding an additional type check?

There are different options of fixing this problem.

  1. If you want to be on the safe side, you can add an additional type-check:

    $collection = json_decode($data, true);
    if ( ! is_array($collection)) {
        throw new \RuntimeException('$collection must be an array.');
    }
    
    foreach ($collection as $item) { /** ... */ }
    
  2. If you are sure that the expression is traversable, you might want to add a doc comment cast to improve IDE auto-completion and static analysis:

    /** @var array $collection */
    $collection = json_decode($data, true);
    
    foreach ($collection as $item) { /** .. */ }
    
  3. Mark the issue as a false-positive: Just hover the remove button, in the top-right corner of this issue for more options.

Loading history...
2273
                $result |= $this->readUrl($r['qid']);
2274
2275
                $counter++;
2276
                usleep(intval($sleepTime)); // Just to relax the system
2277
2278
                // if during the start and the current read url the cli has been disable we need to return from the function
2279
                // mark the process NOT as ended.
2280
                if ($this->getDisabled()) {
2281
                    return ($result | self::CLI_STATUS_ABORTED);
2282
                }
2283
2284
                if (!$this->CLI_checkIfProcessIsActive($this->CLI_buildProcessId())) {
2285
                    $this->CLI_debug("conflict / timeout (" . $this->CLI_buildProcessId() . ")");
2286
2287
                    //TODO might need an additional returncode
2288
                    $result |= self::CLI_STATUS_ABORTED;
2289
                    break; //possible timeout
2290
                }
2291
            }
2292
2293
            sleep(intval($sleepAfterFinish));
2294
2295
            $msg = 'Rows: ' . $counter;
2296
            $this->CLI_debug($msg . " (" . $this->CLI_buildProcessId() . ")");
2297
        } else {
2298
            $this->CLI_debug("Nothing within queue which needs to be processed (" . $this->CLI_buildProcessId() . ")");
2299
        }
2300
2301
        if ($counter > 0) {
2302
            $result |= self::CLI_STATUS_PROCESSED;
2303
        }
2304
2305
        return $result;
2306
    }
2307
2308
    /**
2309
     * Activate hooks
2310
     *
2311
     * @return void
2312
     */
2313
    public function CLI_runHooks()
2314
    {
2315
        global $TYPO3_CONF_VARS;
2316
        if (is_array($TYPO3_CONF_VARS['EXTCONF']['crawler']['cli_hooks'])) {
2317
            foreach ($TYPO3_CONF_VARS['EXTCONF']['crawler']['cli_hooks'] as $objRef) {
2318
                $hookObj = &GeneralUtility::getUserObj($objRef);
2319
                if (is_object($hookObj)) {
2320
                    $hookObj->crawler_init($this);
2321
                }
2322
            }
2323
        }
2324
    }
2325
2326
    /**
2327
     * Try to acquire a new process with the given id
2328
     * also performs some auto-cleanup for orphan processes
2329
     * @todo preemption might not be the most elegant way to clean up
2330
     *
2331
     * @param string $id identification string for the process
2332
     * @return boolean
2333
     */
2334
    public function CLI_checkAndAcquireNewProcess($id)
2335
    {
2336
        $ret = true;
2337
2338
        $systemProcessId = getmypid();
2339
        if ($systemProcessId < 1) {
2340
            return false;
2341
        }
2342
2343
        $processCount = 0;
2344
        $orphanProcesses = [];
2345
2346
        $this->db->sql_query('BEGIN');
2347
2348
        $res = $this->db->exec_SELECTquery(
2349
            'process_id,ttl',
2350
            'tx_crawler_process',
2351
            'active=1 AND deleted=0'
2352
            );
2353
2354
        $currentTime = $this->getCurrentTime();
2355
2356
        while ($row = $this->db->sql_fetch_assoc($res)) {
2357
            if ($row['ttl'] < $currentTime) {
2358
                $orphanProcesses[] = $row['process_id'];
2359
            } else {
2360
                $processCount++;
2361
            }
2362
        }
2363
2364
        // if there are less than allowed active processes then add a new one
2365
        if ($processCount < intval($this->extensionSettings['processLimit'])) {
2366
            $this->CLI_debug("add process " . $this->CLI_buildProcessId() . " (" . ($processCount + 1) . "/" . intval($this->extensionSettings['processLimit']) . ")");
2367
2368
            // create new process record
2369
            $this->db->exec_INSERTquery(
2370
                'tx_crawler_process',
2371
                [
2372
                    'process_id' => $id,
2373
                    'active' => '1',
2374
                    'ttl' => ($currentTime + intval($this->extensionSettings['processMaxRunTime'])),
2375
                    'system_process_id' => $systemProcessId
2376
                ]
2377
                );
2378
        } else {
2379
            $this->CLI_debug("Processlimit reached (" . ($processCount) . "/" . intval($this->extensionSettings['processLimit']) . ")");
2380
            $ret = false;
2381
        }
2382
2383
        $this->CLI_releaseProcesses($orphanProcesses, true); // maybe this should be somehow included into the current lock
2384
        $this->CLI_deleteProcessesMarkedDeleted();
2385
2386
        $this->db->sql_query('COMMIT');
2387
2388
        return $ret;
2389
    }
2390
2391
    /**
2392
     * Release a process and the required resources
2393
     *
2394
     * @param  mixed    $releaseIds   string with a single process-id or array with multiple process-ids
2395
     * @param  boolean  $withinLock   show whether the DB-actions are included within an existing lock
2396
     * @return boolean
2397
     */
2398
    public function CLI_releaseProcesses($releaseIds, $withinLock = false)
2399
    {
2400
        if (!is_array($releaseIds)) {
2401
            $releaseIds = [$releaseIds];
2402
        }
2403
2404
        if (!count($releaseIds) > 0) {
2405
            return false;   //nothing to release
2406
        }
2407
2408
        if (!$withinLock) {
2409
            $this->db->sql_query('BEGIN');
2410
        }
2411
2412
        // some kind of 2nd chance algo - this way you need at least 2 processes to have a real cleanup
2413
        // this ensures that a single process can't mess up the entire process table
2414
2415
        // mark all processes as deleted which have no "waiting" queue-entires and which are not active
2416
        $this->db->exec_UPDATEquery(
2417
            'tx_crawler_queue',
2418
            'process_id IN (SELECT process_id FROM tx_crawler_process WHERE active=0 AND deleted=0)',
2419
            [
2420
                'process_scheduled' => 0,
2421
                'process_id' => ''
2422
            ]
2423
        );
2424
        $this->db->exec_UPDATEquery(
2425
            'tx_crawler_process',
2426
            'active=0 AND deleted=0
2427
            AND NOT EXISTS (
2428
                SELECT * FROM tx_crawler_queue
2429
                WHERE tx_crawler_queue.process_id = tx_crawler_process.process_id
2430
                AND tx_crawler_queue.exec_time = 0
2431
            )',
2432
            [
2433
                'deleted' => '1',
2434
                'system_process_id' => 0
2435
            ]
2436
        );
2437
        // mark all requested processes as non-active
2438
        $this->db->exec_UPDATEquery(
2439
            'tx_crawler_process',
2440
            'process_id IN (\'' . implode('\',\'', $releaseIds) . '\') AND deleted=0',
2441
            [
2442
                'active' => '0'
2443
            ]
2444
        );
2445
        $this->db->exec_UPDATEquery(
2446
            'tx_crawler_queue',
2447
            'exec_time=0 AND process_id IN ("' . implode('","', $releaseIds) . '")',
2448
            [
2449
                'process_scheduled' => 0,
2450
                'process_id' => ''
2451
            ]
2452
        );
2453
2454
        if (!$withinLock) {
2455
            $this->db->sql_query('COMMIT');
2456
        }
2457
2458
        return true;
2459
    }
2460
2461
    /**
2462
     * Delete processes marked as deleted
2463
     *
2464
     * @return void
2465
     */
2466
    public function CLI_deleteProcessesMarkedDeleted()
2467
    {
2468
        $this->db->exec_DELETEquery('tx_crawler_process', 'deleted = 1');
2469
    }
2470
2471
    /**
2472
     * Check if there are still resources left for the process with the given id
2473
     * Used to determine timeouts and to ensure a proper cleanup if there's a timeout
2474
     *
2475
     * @param  string  identification string for the process
2476
     * @return boolean determines if the process is still active / has resources
2477
     *
2478
     * FIXME: Please remove Transaction, not needed as only a select query.
2479
     */
2480
    public function CLI_checkIfProcessIsActive($pid)
2481
    {
2482
        $ret = false;
2483
        $this->db->sql_query('BEGIN');
2484
        $res = $this->db->exec_SELECTquery(
2485
            'process_id,active,ttl',
2486
            'tx_crawler_process',
2487
            'process_id = \'' . $pid . '\'  AND deleted=0',
2488
            '',
2489
            'ttl',
2490
            '0,1'
2491
        );
2492
        if ($row = $this->db->sql_fetch_assoc($res)) {
2493
            $ret = intVal($row['active']) == 1;
2494
        }
2495
        $this->db->sql_query('COMMIT');
2496
2497
        return $ret;
2498
    }
2499
2500
    /**
2501
     * Create a unique Id for the current process
2502
     *
2503
     * @return string  the ID
2504
     */
2505
    public function CLI_buildProcessId()
2506
    {
2507
        if (!$this->processID) {
2508
            $this->processID = GeneralUtility::shortMD5($this->microtime(true));
2509
        }
2510
        return $this->processID;
2511
    }
2512
2513
    /**
2514
     * @param bool $get_as_float
2515
     *
2516
     * @return mixed
2517
     */
2518
    protected function microtime($get_as_float = false)
2519
    {
2520
        return microtime($get_as_float);
2521
    }
2522
2523
    /**
2524
     * Prints a message to the stdout (only if debug-mode is enabled)
2525
     *
2526
     * @param  string $msg  the message
2527
     */
2528
    public function CLI_debug($msg)
2529
    {
2530
        if (intval($this->extensionSettings['processDebug'])) {
2531
            echo $msg . "\n";
2532
            flush();
2533
        }
2534
    }
2535
2536
    /**
2537
     * Get URL content by making direct request to TYPO3.
2538
     *
2539
     * @param  string $url          Page URL
2540
     * @param  int    $crawlerId    Crawler-ID
2541
     * @return array
2542
     */
2543
    protected function sendDirectRequest($url, $crawlerId)
2544
    {
2545
        $requestHeaders = $this->buildRequestHeaderArray(parse_url($url), $crawlerId);
0 ignored issues
show
Security Bug introduced by
It seems like parse_url($url) targeting parse_url() can also be of type false; however, AOE\Crawler\Controller\C...ildRequestHeaderArray() does only seem to accept array, did you maybe forget to handle an error condition?
Loading history...
2546
2547
        $cmd = escapeshellcmd($this->extensionSettings['phpPath']);
2548
        $cmd .= ' ';
2549
        $cmd .= escapeshellarg(ExtensionManagementUtility::extPath('crawler') . 'cli/bootstrap.php');
2550
        $cmd .= ' ';
2551
        $cmd .= escapeshellarg($this->getFrontendBasePath());
2552
        $cmd .= ' ';
2553
        $cmd .= escapeshellarg($url);
2554
        $cmd .= ' ';
2555
        $cmd .= escapeshellarg(base64_encode(serialize($requestHeaders)));
2556
2557
        $startTime = microtime(true);
2558
        $content = $this->executeShellCommand($cmd);
2559
        $this->log($url . ' ' . (microtime(true) - $startTime));
2560
2561
        $result = [
2562
            'request' => implode("\r\n", $requestHeaders) . "\r\n\r\n",
2563
            'headers' => '',
2564
            'content' => $content
2565
        ];
2566
2567
        return $result;
2568
    }
2569
2570
    /**
2571
     * Cleans up entries that stayed for too long in the queue. These are:
2572
     * - processed entries that are over 1.5 days in age
2573
     * - scheduled entries that are over 7 days old
2574
     *
2575
     * @return void
2576
     */
2577
    protected function cleanUpOldQueueEntries()
2578
    {
2579
        $processedAgeInSeconds = $this->extensionSettings['cleanUpProcessedAge'] * 86400; // 24*60*60 Seconds in 24 hours
2580
        $scheduledAgeInSeconds = $this->extensionSettings['cleanUpScheduledAge'] * 86400;
2581
2582
        $now = time();
2583
        $condition = '(exec_time<>0 AND exec_time<' . ($now - $processedAgeInSeconds) . ') OR scheduled<=' . ($now - $scheduledAgeInSeconds);
2584
        $this->flushQueue($condition);
2585
    }
2586
2587
    /**
2588
     * Initializes a TypoScript Frontend necessary for using TypoScript and TypoLink functions
2589
     *
2590
     * @param int $id
2591
     * @param int $typeNum
2592
     *
2593
     * @return void
2594
     */
2595
    protected function initTSFE($id = 1, $typeNum = 0)
2596
    {
2597
        EidUtility::initTCA();
2598
        if (!is_object($GLOBALS['TT'])) {
2599
            $GLOBALS['TT'] = new NullTimeTracker();
2600
            $GLOBALS['TT']->start();
2601
        }
2602
2603
        $GLOBALS['TSFE'] = GeneralUtility::makeInstance(TypoScriptFrontendController::class, $GLOBALS['TYPO3_CONF_VARS'], $id, $typeNum);
2604
        $GLOBALS['TSFE']->sys_page = GeneralUtility::makeInstance(PageRepository::class);
2605
        $GLOBALS['TSFE']->sys_page->init(true);
2606
        $GLOBALS['TSFE']->connectToDB();
2607
        $GLOBALS['TSFE']->initFEuser();
2608
        $GLOBALS['TSFE']->determineId();
2609
        $GLOBALS['TSFE']->initTemplate();
2610
        $GLOBALS['TSFE']->rootLine = $GLOBALS['TSFE']->sys_page->getRootLine($id, '');
2611
        $GLOBALS['TSFE']->getConfigArray();
2612
        PageGenerator::pagegenInit();
2613
    }
2614
}
2615