Completed
Push — master ( d63a29...5c0477 )
by Stefan
29:08 queued 02:04
created

CrawlerController   F

Complexity

Total Complexity 353

Size/Duplication

Total Lines 2612
Duplicated Lines 0 %

Coupling/Cohesion

Components 1
Dependencies 18

Test Coverage

Coverage 23.13%

Importance

Changes 0
Metric Value
dl 0
loc 2612
ccs 256
cts 1107
cp 0.2313
rs 0.6314
c 0
b 0
f 0
wmc 353
lcom 1
cbo 18

60 Methods

Rating   Name   Duplication   Size   Complexity  
A setExtensionSettings() 0 4 1
C getConfigurationsForBranch() 0 45 11
A parseParams() 0 15 3
C compileUrls() 0 25 7
B getLogEntriesForPageId() 0 30 6
B getLogEntriesForSetId() 0 29 6
B flushQueue() 0 15 5
A addQueueEntry_callBack() 0 19 3
A getCurrentTime() 0 4 1
C readUrl() 0 80 13
A readUrlFromArray() 0 23 1
B readUrl_exec() 0 29 4
C getFrontendBasePath() 0 23 8
A executeShellCommand() 0 5 1
A fe_init() 0 17 4
C getPageTreeAndUrls() 0 87 8
D drawURLs_addRowsForPage() 0 109 15
A getUnprocessedItemsCount() 0 11 1
D CLI_main() 0 41 10
F CLI_main_im() 0 98 17
B CLI_main_flush() 0 38 5
A getConfigurationKeys() 0 5 2
D CLI_run() 0 107 10
A CLI_runHooks() 0 12 4
B CLI_checkAndAcquireNewProcess() 0 56 5
B CLI_releaseProcesses() 0 62 5
A CLI_deleteProcessesMarkedDeleted() 0 4 1
A CLI_checkIfProcessIsActive() 0 19 2
A microtime() 0 4 1
A CLI_debug() 0 7 2
A cleanUpOldQueueEntries() 0 9 1
A initTSFE() 0 19 2
A getConfigurationHash() 0 5 1
A getAccessMode() 0 4 1
A setAccessMode() 0 4 1
A setDisabled() 0 10 3
A getDisabled() 0 8 2
A setProcessFilename() 0 4 1
A getProcessFilename() 0 4 1
A __construct() 0 19 3
D checkIfPageShouldBeSkipped() 0 57 16
A getUrlsForPageRow() 0 15 3
A noUnprocessedQueueEntriesForPageWithConfigurationHashExist() 0 8 1
D urlListFromUrlArray() 0 115 21
A drawURLs_PIfilter() 0 12 4
B getPageTSconfigForId() 0 22 4
D getUrlsForPageId() 0 134 26
A getBaseUrlForConfigurationRecord() 0 20 4
A hasGroupAccess() 0 12 4
F expandParameters() 0 110 24
B addUrl() 0 67 6
C getDuplicateRowsIfExist() 0 40 7
D requestUrl() 0 93 19
B getHttpResponseFromStream() 0 23 5
A log() 0 9 3
A buildRequestHeaderArray() 0 16 4
C getRequestUrlFrom302Header() 0 34 11
D expandExcludeString() 0 45 9
A CLI_buildProcessId() 0 7 2
B sendDirectRequest() 0 31 2

How to fix   Complexity   

Complex Class

Complex classes like CrawlerController often do a lot of different things. To break such a class down, we need to identify a cohesive component within that class. A common approach to finding such a component is to look for fields/methods that share the same prefix or suffix (in this class, for example, the many `CLI_*` methods). You can also have a look at the cohesion graph to spot any unconnected or weakly connected components.

Once you have determined the fields that belong together, you can apply the Extract Class refactoring. If the component makes sense as a sub-class, Extract Subclass is also a candidate, and is often faster.

While breaking up the class, it is a good idea to analyze how other classes use CrawlerController, and based on these observations, apply Extract Interface, too.

1
<?php
2
namespace AOE\Crawler\Controller;
3
4
/***************************************************************
5
 *  Copyright notice
6
 *
7
 *  (c) 2017 AOE GmbH <[email protected]>
8
 *
9
 *  All rights reserved
10
 *
11
 *  This script is part of the TYPO3 project. The TYPO3 project is
12
 *  free software; you can redistribute it and/or modify
13
 *  it under the terms of the GNU General Public License as published by
14
 *  the Free Software Foundation; either version 3 of the License, or
15
 *  (at your option) any later version.
16
 *
17
 *  The GNU General Public License can be found at
18
 *  http://www.gnu.org/copyleft/gpl.html.
19
 *
20
 *  This script is distributed in the hope that it will be useful,
21
 *  but WITHOUT ANY WARRANTY; without even the implied warranty of
22
 *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
23
 *  GNU General Public License for more details.
24
 *
25
 *  This copyright notice MUST APPEAR in all copies of the script!
26
 ***************************************************************/
27
28
use AOE\Crawler\Command\CrawlerCommandLineController;
29
use AOE\Crawler\Command\FlushCommandLineController;
30
use AOE\Crawler\Command\QueueCommandLineController;
31
use AOE\Crawler\Domain\Model\Reason;
32
use AOE\Crawler\Event\EventDispatcher;
33
use AOE\Crawler\Utility\IconUtility;
34
use AOE\Crawler\Utility\SignalSlotUtility;
35
use TYPO3\CMS\Backend\Utility\BackendUtility;
36
use TYPO3\CMS\Backend\Tree\View\PageTreeView;
37
use TYPO3\CMS\Core\Authentication\BackendUserAuthentication;
38
use TYPO3\CMS\Core\Database\DatabaseConnection;
39
use TYPO3\CMS\Core\Log\LogLevel;
40
use TYPO3\CMS\Core\TimeTracker\NullTimeTracker;
41
use TYPO3\CMS\Core\Utility\DebugUtility;
42
use TYPO3\CMS\Core\Utility\ExtensionManagementUtility;
43
use TYPO3\CMS\Core\Utility\GeneralUtility;
44
use TYPO3\CMS\Core\Utility\MathUtility;
45
use TYPO3\CMS\Frontend\Controller\TypoScriptFrontendController;
46
use TYPO3\CMS\Frontend\Page\PageGenerator;
47
use TYPO3\CMS\Frontend\Page\PageRepository;
48
use TYPO3\CMS\Frontend\Utility\EidUtility;
49
50
/**
51
 * Class CrawlerController
52
 *
53
 * @package AOE\Crawler\Controller
54
 */
55
class CrawlerController
56
{
57
    // CLI status codes (CLI_STATUS_*).
    // NOTE(review): the values 0/1/2/4/8 look like combinable bit flags - confirm against the code that sets them.
    const CLI_STATUS_NOTHING_PROCCESSED = 0;
58
    const CLI_STATUS_REMAIN = 1; // queue not empty
59
    const CLI_STATUS_PROCESSED = 2; // (some) queue items were processed
60
    const CLI_STATUS_ABORTED = 4; // instance didn't finish
61
    const CLI_STATUS_POLLABLE_PROCESSED = 8;
62
63
    /**
64
     * @var integer
65
     */
66
    public $setID = 0;
67
68
    /**
69
     * @var string
70
     */
71
    public $processID = '';
72
73
    /**
74
     * One hour is max stalled time for the CLI
75
     * If the process had the status "start" for 3600 seconds, it will be regarded stalled and a new process is started
76
     *
77
     * @var integer
78
     */
79
    public $max_CLI_exec_time = 3600;
80
81
    /**
82
     * @var array
83
     */
84
    public $duplicateTrack = [];
85
86
    /**
87
     * @var array
88
     */
89
    public $downloadUrls = [];
90
91
    /**
92
     * @var array
93
     */
94
    public $incomingProcInstructions = [];
95
96
    /**
97
     * @var array
98
     */
99
    public $incomingConfigurationSelection = [];
100
101
    /**
102
     * @var bool
103
     */
104
    public $registerQueueEntriesInternallyOnly = false;
105
106
    /**
107
     * @var array
108
     */
109
    public $queueEntries = [];
110
111
    /**
112
     * @var array
113
     */
114
    public $urlList = [];
115
116
    /**
117
     * @var boolean
118
     */
119
    public $debugMode = false;
120
121
    /**
122
     * @var array
123
     */
124
    public $extensionSettings = [];
125
126
    /**
127
     * Mount Point
128
     *
129
     * @var boolean
130
     */
131
    public $MP = false;
132
133
    /**
134
     * @var string
135
     */
136
    protected $processFilename;
137
138
    /**
139
     * Holds the internal access mode can be 'gui','cli' or 'cli_im'
140
     *
141
     * @var string
142
     */
143
    protected $accessMode;
144
145
    /**
146
     * @var DatabaseConnection
147
     */
148
    private $db;
149
150
    /**
151
     * @var BackendUserAuthentication
152
     */
153
    private $backendUser;
154
155
    /**
156
     * @var integer
157
     */
158
    private $scheduledTime = 0;
159
160
    /**
161
     * @var integer
162
     */
163
    private $reqMinute = 0;
164
165
    /**
166
     * @var bool
167
     */
168
    private $submitCrawlUrls = false;
169
170
    /**
171
     * @var bool
172
     */
173
    private $downloadCrawlUrls = false;
174
175
    /**
     * Returns the access mode currently stored on this controller.
     *
     * The property documentation names 'gui', 'cli' and 'cli_im' as the possible values.
     *
     * @return string
     */
    public function getAccessMode()
    {
        return $this->accessMode;
    }
184
185
    /**
     * Stores the access mode on this controller.
     *
     * @param string $accessMode Access mode to remember (the property documentation names 'gui', 'cli' and 'cli_im')
     */
    public function setAccessMode($accessMode)
    {
        $this->accessMode = $accessMode;
    }
192
193
    /**
     * Switches the "disabled" marker on or off.
     *
     * The marker is the file named by $this->processFilename: disabling writes an
     * empty file there, enabling removes the file if it exists.
     *
     * @param  bool $disabled (optional, defaults to true)
     * @return void
     */
    public function setDisabled($disabled = true)
    {
        if ($disabled) {
            GeneralUtility::writeFile($this->processFilename, '');
            return;
        }

        // Enabling again: only remove the marker when it is actually there.
        if (is_file($this->processFilename)) {
            unlink($this->processFilename);
        }
    }
209
210
    /**
     * Tells whether the "disabled" marker file exists.
     *
     * @return bool true if the file named by $this->processFilename exists, false otherwise
     */
    public function getDisabled()
    {
        return is_file($this->processFilename);
    }
223
224
    /**
     * Sets the path of the file that setDisabled()/getDisabled() use as marker.
     *
     * @param string $filenameWithPath File name including its path
     *
     * @return void
     */
    public function setProcessFilename($filenameWithPath)
    {
        $this->processFilename = $filenameWithPath;
    }
233
234
    /**
     * Returns the path of the file that setDisabled()/getDisabled() use as marker.
     *
     * @return string
     */
    public function getProcessFilename()
    {
        return $this->processFilename;
    }
241
242
    /************************************
243
     *
244
     * Getting URLs based on Page TSconfig
245
     *
246
     ************************************/
247
248 5
    /**
     * Wires up the global database connection and backend user, sets the default
     * marker file below typo3temp/ and loads the extension settings.
     *
     * Settings are unserialized from $TYPO3_CONF_VARS['EXT']['extConf']['crawler'];
     * anything that does not unserialize to an array is treated as "no settings".
     * Afterwards 'countInARun' falls back to 100 when it does not convert to a
     * positive integer, and 'processLimit' is forced into the range 1..99.
     */
    public function __construct()
    {
        $this->db = $GLOBALS['TYPO3_DB'];
        $this->backendUser = $GLOBALS['BE_USER'];
        $this->processFilename = PATH_site . 'typo3temp/tx_crawler.proc';

        $extConf = unserialize($GLOBALS['TYPO3_CONF_VARS']['EXT']['extConf']['crawler']);
        if (!is_array($extConf)) {
            $extConf = [];
        }

        // read ext_em_conf_template settings and set
        $this->setExtensionSettings($extConf);

        // set defaults:
        $countInARun = MathUtility::convertToPositiveInteger($this->extensionSettings['countInARun']);
        if ($countInARun == 0) {
            $this->extensionSettings['countInARun'] = 100;
        }

        $this->extensionSettings['processLimit'] = MathUtility::forceIntegerInRange(
            $this->extensionSettings['processLimit'],
            1,
            99,
            1
        );
    }
267
268
    /**
     * Replaces the extension settings held by this controller.
     *
     * The array is the unserialized form of $TYPO3_CONF_VARS['EXT']['extConf']['crawler'].
     *
     * @param array $extensionSettings
     * @return void
     */
    public function setExtensionSettings(array $extensionSettings)
    {
        $this->extensionSettings = $extensionSettings;
    }
278
279
    /**
     * Decides whether a page must be left out of crawling.
     *
     * The checks run in this order and the first one that matches wins:
     *  1. the page is hidden and the 'crawlHiddenPages' setting is off,
     *  2. the doktype is 3 or 4, or is 199 or higher,
     *  3. the doktype is in one of the lists registered under
     *     $TYPO3_CONF_VARS['EXTCONF']['crawler']['excludeDoktype'],
     *  4. one of the hooks registered under
     *     $TYPO3_CONF_VARS['EXTCONF']['crawler']['pageVeto'] returns anything but false.
     *
     * @param array $pageRow Page record; the 'hidden' and 'doktype' fields are read
     * @return false|string false if the page should be crawled, otherwise the reason for skipping it
     */
    public function checkIfPageShouldBeSkipped(array $pageRow)
    {
        // if page is hidden
        if (!$this->extensionSettings['crawlHiddenPages'] && $pageRow['hidden']) {
            return 'Because page is hidden';
        }

        if (GeneralUtility::inList('3,4', $pageRow['doktype']) || $pageRow['doktype'] >= 199) {
            return 'Because doktype is not allowed';
        }

        if (is_array($GLOBALS['TYPO3_CONF_VARS']['EXTCONF']['crawler']['excludeDoktype'])) {
            foreach ($GLOBALS['TYPO3_CONF_VARS']['EXTCONF']['crawler']['excludeDoktype'] as $key => $doktypeList) {
                if (GeneralUtility::inList($doktypeList, $pageRow['doktype'])) {
                    return 'Doktype was excluded by "' . $key . '"';
                }
            }
        }

        // veto hook
        if (is_array($GLOBALS['TYPO3_CONF_VARS']['EXTCONF']['crawler']['pageVeto'])) {
            foreach ($GLOBALS['TYPO3_CONF_VARS']['EXTCONF']['crawler']['pageVeto'] as $key => $func) {
                $params = [
                    'pageRow' => $pageRow
                ];
                // expects "false" if page is ok and "true" or a skipMessage if this page should _not_ be crawled
                $veto = GeneralUtility::callUserFunction($func, $params, $this);
                if ($veto === false) {
                    continue;
                }

                // the first veto ends the search, later hooks are not executed
                return is_string($veto) ? $veto : 'Veto from hook "' . htmlspecialchars($key) . '"';
            }
        }

        return false;
    }
342
343
    /**
     * Wrapper method for getUrlsForPageId()
     * It returns an array of configurations and no urls!
     *
     * The page is first run through checkIfPageShouldBeSkipped(). When it is to be
     * skipped, the reason is written to $skipMessage and an empty array is returned.
     * Otherwise $skipMessage is reset to '' and the configurations are returned.
     *
     * @param array $pageRow Page record with at least dok-type and uid columns.
     * @param string $skipMessage Receives the skip reason (by reference), or '' if the page is not skipped
     * @return array
     * @see getUrlsForPageId()
     */
    public function getUrlsForPageRow(array $pageRow, &$skipMessage = '')
    {
        $message = $this->checkIfPageShouldBeSkipped($pageRow);

        if ($message !== false) {
            $skipMessage = $message;

            return [];
        }

        // A url_scheme of 2 switches on $forceSsl. The value is compared as integer
        // because rows may carry it as a string ('2'), which a strict comparison
        // against the integer 2 would never match; a missing column counts as "no".
        $forceSsl = isset($pageRow['url_scheme']) && (int)$pageRow['url_scheme'] === 2;
        $res = $this->getUrlsForPageId($pageRow['uid'], $forceSsl);
        $skipMessage = '';

        return $res;
    }
367
368
    /**
     * Tells whether tx_crawler_queue holds NO unprocessed entry (exec_time=0) for
     * the given page and configuration hash.
     * If there is none, callers can skip a more detailed check.
     *
     * @param  int $uid Page id; cast to integer before it goes into the query
     * @param  string $configurationHash Configuration hash; quoted before it goes into the query
     * @return boolean true if the count of matching rows is 0
     */
    protected function noUnprocessedQueueEntriesForPageWithConfigurationHashExist($uid, $configurationHash)
    {
        $quotedHash = $this->db->fullQuoteStr($configurationHash, 'tx_crawler_queue');
        $whereClause = 'page_id=' . intval($uid) . ' AND configuration_hash=' . $quotedHash . ' AND exec_time=0';

        $result = $this->db->exec_SELECTquery('count(*) as anz', 'tx_crawler_queue', $whereClause);
        $countRow = $this->db->sql_fetch_assoc($result);

        return $countRow['anz'] == 0;
    }
385
386
    /**
     * Creates a list of URLs from input array (and submits them to queue if asked for)
     * See Web > Info module script + "indexed_search"'s crawler hook-client using this!
     *
     * @param    array    $vv                        Information about URLs from pageRow to crawl ('URLs' and 'subCfg' are read).
     * @param    array    $pageRow                   Page row
     * @param    integer  $scheduledTime             Unix time to schedule indexing to, typically time()
     * @param    integer  $reqMinute                 Number of requests per minute (creates the interleave between requests); 0 means no interleave
     * @param    boolean  $submitCrawlUrls           If set, submits the URLs to queue
     * @param    boolean  $downloadCrawlUrls         If set (and $submitCrawlUrls is false) will fill $downloadUrls with entries
     * @param    array    $duplicateTrack            Passed by reference; contains an id per url to secure we will not crawl duplicates
     * @param    array    $downloadUrls              Passed by reference; filled with URLs for download if flag is set.
     * @param    array    $incomingProcInstructions  Array of processing instructions
     * @return   string   List of URLs (meant for display in backend module)
     */
    public function urlListFromUrlArray(
        array $vv,
        array $pageRow,
        $scheduledTime,
        $reqMinute,
        $submitCrawlUrls,
        $downloadCrawlUrls,
        array &$duplicateTrack,
        array &$downloadUrls,
        array $incomingProcInstructions
    ) {
        $urlList = '';

        // realurl support (thanks to Ingo Renner)
        // The condition is evaluated once and $urlObj always has a value, so the
        // encoding step in the loop below can never hit an undefined variable.
        $useRealUrl = ExtensionManagementUtility::isLoaded('realurl') && $vv['subCfg']['realurl'];
        $urlObj = null;

        if ($useRealUrl) {
            /** @var tx_realurl $urlObj */
            $urlObj = GeneralUtility::makeInstance('tx_realurl');

            if (!empty($vv['subCfg']['baseUrl'])) {
                $urlParts = parse_url($vv['subCfg']['baseUrl']);
                $host = strtolower($urlParts['host']);
                $urlObj->host = $host;

                // First pass, finding configuration OR pointer string:
                $urlObj->extConf = isset($GLOBALS['TYPO3_CONF_VARS']['EXTCONF']['realurl'][$urlObj->host]) ? $GLOBALS['TYPO3_CONF_VARS']['EXTCONF']['realurl'][$urlObj->host] : $GLOBALS['TYPO3_CONF_VARS']['EXTCONF']['realurl']['_DEFAULT'];

                // If it turned out to be a string pointer, then look up the real config:
                if (is_string($urlObj->extConf)) {
                    $urlObj->extConf = is_array($GLOBALS['TYPO3_CONF_VARS']['EXTCONF']['realurl'][$urlObj->extConf]) ? $GLOBALS['TYPO3_CONF_VARS']['EXTCONF']['realurl'][$urlObj->extConf] : $GLOBALS['TYPO3_CONF_VARS']['EXTCONF']['realurl']['_DEFAULT'];
                }
            }

            if (!$GLOBALS['TSFE']->sys_page) {
                $GLOBALS['TSFE']->sys_page = GeneralUtility::makeInstance('TYPO3\CMS\Frontend\Page\PageRepository');
            }
            if (!$GLOBALS['TSFE']->csConvObj) {
                $GLOBALS['TSFE']->csConvObj = GeneralUtility::makeInstance('TYPO3\CMS\Core\Charset\CharsetConverter');
            }
            if (!$GLOBALS['TSFE']->tmpl->rootLine[0]['uid']) {
                $GLOBALS['TSFE']->tmpl->rootLine[0]['uid'] = $urlObj->extConf['pagePath']['rootpage_id'];
            }
        }

        if (!is_array($vv['URLs'])) {
            return 'ERROR - no URL generated';
        }

        $configurationHash = $this->getConfigurationHash($vv);
        $skipInnerCheck = $this->noUnprocessedQueueEntriesForPageWithConfigurationHashExist($pageRow['uid'], $configurationHash);

        // Seconds between two requests. A request rate of 0 would divide by zero,
        // so it is treated as "no interleave" instead.
        $secondsPerRequest = ((float)$reqMinute != 0.0) ? 60 / $reqMinute : 0;

        foreach ($vv['URLs'] as $urlQuery) {
            if (!$this->drawURLs_PIfilter($vv['subCfg']['procInstrFilter'], $incomingProcInstructions)) {
                continue;
            }

            // Calculate cHash:
            if ($vv['subCfg']['cHash']) {
                /* @var $cacheHash \TYPO3\CMS\Frontend\Page\CacheHashCalculator */
                $cacheHash = GeneralUtility::makeInstance('TYPO3\CMS\Frontend\Page\CacheHashCalculator');
                $urlQuery .= '&cHash=' . $cacheHash->generateForParameters($urlQuery);
            }

            // Create key by which to determine unique-ness:
            $uKey = $urlQuery . '|' . $vv['subCfg']['userGroups'] . '|' . $vv['subCfg']['baseUrl'] . '|' . $vv['subCfg']['procInstrFilter'];

            // realurl support (thanks to Ingo Renner)
            $urlQuery = 'index.php' . $urlQuery;
            if ($useRealUrl && $urlObj !== null) {
                $params = [
                    'LD' => [
                        'totalURL' => $urlQuery
                    ],
                    'TCEmainHook' => true
                ];
                $urlObj->encodeSpURL($params);
                $urlQuery = $params['LD']['totalURL'];
            }

            // Scheduled time: offset by the number of URLs tracked so far, rounded down to a full minute
            $schTime = $scheduledTime + round(count($duplicateTrack) * $secondsPerRequest);
            $schTime = floor($schTime / 60) * 60;

            // NOTE(review): both branches below assign $urlList (no append), so the
            // returned string only describes the last URL handled - confirm this is intended.
            if (isset($duplicateTrack[$uKey])) {
                // if the url key is registered just display it and do not resubmit it
                $urlList = '<em><span class="typo3-dimmed">' . htmlspecialchars($urlQuery) . '</span></em><br/>';
            } else {
                $urlList = '[' . date('d.m.y H:i', $schTime) . '] ' . htmlspecialchars($urlQuery);
                $this->urlList[] = '[' . date('d.m.y H:i', $schTime) . '] ' . $urlQuery;

                $theUrl = ($vv['subCfg']['baseUrl'] ? $vv['subCfg']['baseUrl'] : GeneralUtility::getIndpEnv('TYPO3_SITE_URL')) . $urlQuery;

                // Submit for crawling!
                if ($submitCrawlUrls) {
                    $added = $this->addUrl(
                        $pageRow['uid'],
                        $theUrl,
                        $vv['subCfg'],
                        $scheduledTime,
                        $configurationHash,
                        $skipInnerCheck
                    );
                    if ($added === false) {
                        $urlList .= ' (Url already existed)';
                    }
                } elseif ($downloadCrawlUrls) {
                    $downloadUrls[$theUrl] = $theUrl;
                }

                $urlList .= '<br />';
            }
            $duplicateTrack[$uKey] = true;
        }

        return $urlList;
    }
517
518
    /**
     * Returns true if input processing instruction is among registered ones.
     *
     * An empty $incomingProcInstructions array means "no filtering" and always
     * yields true. Otherwise true is returned as soon as one of the incoming
     * instructions is found in the comma separated list $piString.
     *
     * @param string $piString PI to test (comma separated list)
     * @param array $incomingProcInstructions Processing instructions
     * @return boolean
     */
    public function drawURLs_PIfilter($piString, array $incomingProcInstructions)
    {
        if (empty($incomingProcInstructions)) {
            return true;
        }

        foreach ($incomingProcInstructions as $pi) {
            if (GeneralUtility::inList($piString, $pi)) {
                return true;
            }
        }

        // No incoming instruction matched: answer with a real boolean instead of
        // falling off the end of the method (which yields null).
        return false;
    }
537
538 4
    /**
     * Fetches the page TSconfig for a page and lets hooks alter it.
     *
     * When $this->MP is set, the TSconfig is read for the second '-'-separated
     * segment of $this->MP instead of $id. Every function registered under
     * $TYPO3_CONF_VARS['EXTCONF']['crawler']['getPageTSconfigForId'] is then
     * called with 'pageId' and a reference to the TSconfig ('pageTSConfig').
     *
     * @param integer $id Page ID
     * @return mixed Whatever BackendUtility::getPagesTSconfig() returned, possibly changed by the hooks
     */
    public function getPageTSconfigForId($id)
    {
        $configPageId = $id;
        if ($this->MP) {
            list(, $configPageId) = explode('-', $this->MP);
        }
        $pageTSconfig = BackendUtility::getPagesTSconfig($configPageId);

        if (!is_array($GLOBALS['TYPO3_CONF_VARS']['EXTCONF']['crawler']['getPageTSconfigForId'])) {
            return $pageTSconfig;
        }

        // Call a hook to alter configuration
        $params = [
            'pageId' => $id,
            'pageTSConfig' => &$pageTSconfig
        ];
        foreach ($GLOBALS['TYPO3_CONF_VARS']['EXTCONF']['crawler']['getPageTSconfigForId'] as $userFunc) {
            GeneralUtility::callUserFunction($userFunc, $params, $this);
        }

        return $pageTSconfig;
    }
560
561
    /**
     * Collects the crawler configurations that apply to a page.
     * It returns configurations, not plain urls!
     *
     * Two sources are read, in this order:
     *  1. page TSconfig (tx_crawler.crawlerCfg.paramSets.*), entries get origin 'pagets',
     *  2. tx_crawler_configuration records found on the pages of the rootline,
     *     entries get origin 'tx_crawler_configuration_<uid>'. A record never
     *     replaces an entry that already exists under the same name.
     *
     * Each entry holds 'subCfg', 'paramParsed', 'paramExpanded', 'URLs' and 'origin'.
     * Finally the functions registered under
     * $TYPO3_CONF_VARS['EXTCONF']['crawler']['processUrls'] receive the result by
     * reference ('res') and may change it.
     *
     * @param integer $id Page ID
     * @param bool $forceSsl Use https
     * @return array
     */
    protected function getUrlsForPageId($id, $forceSsl = false)
    {
        /**
         * Get configuration from tsConfig
         */
        $pageTSconfig = $this->getPageTSconfigForId($id);

        $configurations = [];

        if (is_array($pageTSconfig) && is_array($pageTSconfig['tx_crawler.']['crawlerCfg.'])) {
            $crawlerCfg = $pageTSconfig['tx_crawler.']['crawlerCfg.'];

            if (is_array($crawlerCfg['paramSets.'])) {
                foreach ($crawlerCfg['paramSets.'] as $setName => $paramString) {
                    // array values are the sub configurations ("name."), they are read below
                    if (is_array($paramString)) {
                        continue;
                    }

                    // Sub configuration for a single configuration string:
                    $subCfg = (array)$crawlerCfg['paramSets.'][$setName . '.'];
                    $subCfg['key'] = $setName;

                    if (strcmp($subCfg['procInstrFilter'], '')) {
                        $subCfg['procInstrFilter'] = implode(',', GeneralUtility::trimExplode(',', $subCfg['procInstrFilter']));
                    }
                    $pidOnlyList = implode(',', GeneralUtility::trimExplode(',', $subCfg['pidsOnly'], true));

                    // skip configurations that are page-specific but not meant for the current page
                    if (strcmp($subCfg['pidsOnly'], '') && !GeneralUtility::inList($pidOnlyList, $id)) {
                        continue;
                    }

                    // add trailing slash if not present
                    if (!empty($subCfg['baseUrl']) && substr($subCfg['baseUrl'], -1) != '/') {
                        $subCfg['baseUrl'] .= '/';
                    }

                    // Explode, process etc.:
                    $entry = ['subCfg' => $subCfg];
                    $entry['paramParsed'] = $this->parseParams($paramString);
                    $entry['paramExpanded'] = $this->expandParameters($entry['paramParsed'], $id);
                    $entry['origin'] = 'pagets';

                    // recognize MP value
                    $baseQuery = '?id=' . $id;
                    if ($this->MP) {
                        $baseQuery .= '&MP=' . $this->MP;
                    }
                    $entry['URLs'] = $this->compileUrls($entry['paramExpanded'], [$baseQuery]);

                    $configurations[$setName] = $entry;
                }
            }
        }

        /**
         * Get configuration from tx_crawler_configuration records
         */

        // get records along the rootline
        $rootLine = BackendUtility::BEgetRootLine($id);

        foreach ($rootLine as $page) {
            $recordsOnPage = BackendUtility::getRecordsByField(
                'tx_crawler_configuration',
                'pid',
                intval($page['uid']),
                BackendUtility::BEenableFields('tx_crawler_configuration') . BackendUtility::deleteClause('tx_crawler_configuration')
            );

            if (!is_array($recordsOnPage)) {
                continue;
            }

            foreach ($recordsOnPage as $record) {
                // check access to the configuration record
                $mayUseRecord = empty($record['begroups'])
                    || $GLOBALS['BE_USER']->isAdmin()
                    || $this->hasGroupAccess($GLOBALS['BE_USER']->user['usergroup_cached_list'], $record['begroups']);
                if (!$mayUseRecord) {
                    continue;
                }

                $pidOnlyList = implode(',', GeneralUtility::trimExplode(',', $record['pidsonly'], true));

                // skip records that are page-specific but not meant for the current page
                if (strcmp($record['pidsonly'], '') && !GeneralUtility::inList($pidOnlyList, $id)) {
                    continue;
                }

                $recordName = $record['name'];

                // don't overwrite previously defined paramSets
                if (isset($configurations[$recordName])) {
                    continue;
                }

                /* @var $tsParser \TYPO3\CMS\Core\TypoScript\Parser\TypoScriptParser */
                $tsParser = GeneralUtility::makeInstance('TYPO3\CMS\Core\TypoScript\Parser\TypoScriptParser');
                $tsParser->parse($record['processing_instruction_parameters_ts']);

                $subCfg = [
                    'procInstrFilter' => $record['processing_instruction_filter'],
                    'procInstrParams.' => $tsParser->setup,
                    'baseUrl' => $this->getBaseUrlForConfigurationRecord(
                        $record['base_url'],
                        $record['sys_domain_base_url'],
                        $forceSsl
                    ),
                    'realurl' => $record['realurl'],
                    'cHash' => $record['chash'],
                    'userGroups' => $record['fegroups'],
                    'exclude' => $record['exclude'],
                    'rootTemplatePid' => (int) $record['root_template_pid'],
                    'key' => $recordName,
                ];

                // add trailing slash if not present
                if (!empty($subCfg['baseUrl']) && substr($subCfg['baseUrl'], -1) != '/') {
                    $subCfg['baseUrl'] .= '/';
                }

                // the record does not apply when the current page is among its excluded pages
                if (in_array($id, $this->expandExcludeString($subCfg['exclude']))) {
                    continue;
                }

                $entry = ['subCfg' => $subCfg];
                $entry['paramParsed'] = $this->parseParams($record['configuration']);
                $entry['paramExpanded'] = $this->expandParameters($entry['paramParsed'], $id);
                $entry['URLs'] = $this->compileUrls($entry['paramExpanded'], ['?id=' . $id]);
                $entry['origin'] = 'tx_crawler_configuration_' . $record['uid'];

                $configurations[$recordName] = $entry;
            }
        }

        // hooks may alter the collected configurations, they get them by reference
        if (is_array($GLOBALS['TYPO3_CONF_VARS']['EXTCONF']['crawler']['processUrls'])) {
            foreach ($GLOBALS['TYPO3_CONF_VARS']['EXTCONF']['crawler']['processUrls'] as $func) {
                $params = [
                    'res' => &$configurations,
                ];
                GeneralUtility::callUserFunction($func, $params, $this);
            }
        }

        return $configurations;
    }
703
704
    /**
     * Resolves the base URL for a crawler configuration record.
     *
     * When $sysDomainUid is greater than 0, the matching sys_domain record is selected
     * (restricted by the backend enable-fields and delete clauses). If that record has a
     * non-empty "domainName", the URL is built from the scheme and that domain name.
     * In every other case the given $baseUrl is returned unchanged.
     *
     * @param string $baseUrl Fallback base URL
     * @param integer $sysDomainUid UID of a sys_domain record; values <= 0 skip the lookup
     * @param bool $ssl "http" is used only if this is exactly FALSE, otherwise "https"
     * @return string
     */
    protected function getBaseUrlForConfigurationRecord($baseUrl, $sysDomainUid, $ssl = false)
    {
        $sysDomainUid = intval($sysDomainUid);
        if ($sysDomainUid <= 0) {
            return $baseUrl;
        }

        $res = $this->db->exec_SELECTquery(
            '*',
            'sys_domain',
            'uid = ' . $sysDomainUid .
            BackendUtility::BEenableFields('sys_domain') .
            BackendUtility::deleteClause('sys_domain')
        );
        $row = $this->db->sql_fetch_assoc($res);
        // Release the result set; only the first row is needed.
        $this->db->sql_free_result($res);

        // $row is not an array when no matching domain record exists.
        if (is_array($row) && (string)$row['domainName'] !== '') {
            $urlScheme = ($ssl === false) ? 'http' : 'https';

            return $urlScheme . '://' . $row['domainName'];
        }

        return $baseUrl;
    }
732
733
    /**
     * Collects the names of the crawler configurations that apply to a page branch.
     *
     * Names come from two sources:
     * - the array-valued entries of tx_crawler.crawlerCfg.paramSets in the page TSconfig
     *   of $rootid (a trailing dot is removed from the key), and
     * - the "name" field of tx_crawler_configuration records stored on a page of the
     *   rootline of $rootid or of the page tree read below $rootid.
     *
     * @param integer $rootid Page ID of the branch root
     * @param integer $depth Depth handed to PageTreeView::getTree()
     * @return array List of configuration names (duplicates are not removed)
     */
    public function getConfigurationsForBranch($rootid, $depth)
    {
        $configurationsForBranch = [];

        $pageTSconfig = $this->getPageTSconfigForId($rootid);
        if (isset($pageTSconfig['tx_crawler.']['crawlerCfg.']['paramSets.'])
            && is_array($pageTSconfig['tx_crawler.']['crawlerCfg.']['paramSets.'])
        ) {
            foreach ($pageTSconfig['tx_crawler.']['crawlerCfg.']['paramSets.'] as $key => $value) {
                // Only sub-arrays ("name.") describe a parameter set
                if (!is_array($value)) {
                    continue;
                }
                $configurationsForBranch[] = substr($key, -1) == '.' ? substr($key, 0, -1) : $key;
            }
        }

        $pids = [];
        $rootLine = BackendUtility::BEgetRootLine($rootid);
        foreach ($rootLine as $node) {
            $pids[] = $node['uid'];
        }
        /* @var PageTreeView $tree */
        $tree = GeneralUtility::makeInstance(PageTreeView::class);
        $perms_clause = $GLOBALS['BE_USER']->getPagePermsClause(1);
        $tree->init('AND ' . $perms_clause);
        $tree->getTree($rootid, $depth, '');
        foreach ($tree->tree as $node) {
            $pids[] = $node['row']['uid'];
        }

        // The list is concatenated into SQL: force integers and never emit "pid IN ()"
        $pids = array_unique(array_map('intval', $pids));
        if (empty($pids)) {
            return $configurationsForBranch;
        }

        $res = $this->db->exec_SELECTquery(
            '*',
            'tx_crawler_configuration',
            'pid IN (' . implode(',', $pids) . ') ' .
            BackendUtility::BEenableFields('tx_crawler_configuration') .
            BackendUtility::deleteClause('tx_crawler_configuration') . ' ' .
            BackendUtility::versioningPlaceholderClause('tx_crawler_configuration') . ' '
        );

        while ($row = $this->db->sql_fetch_assoc($res)) {
            $configurationsForBranch[] = $row['name'];
        }
        $this->db->sql_free_result($res);

        return $configurationsForBranch;
    }
778
779
    /**
     * Tells whether a user's group list grants access to an item.
     * (The group list can e.g. be taken from $GLOBALS['TSFE']->gr_list for the logged-in user.)
     *
     * @see \TYPO3\CMS\Frontend\Page\PageRepository::getMultipleGroupsWhereClause()
     * @param  string $groupList    Comma-separated list of group UIDs of the user
     * @param  string $accessList   Comma-separated list of group UIDs allowed to access the item
     * @return bool                 TRUE if the access list is empty (in the sense of empty()) or contains at least one of the user's group UIDs
     */
    public function hasGroupAccess($groupList, $accessList)
    {
        // An item without access restriction is open to everybody
        $accessGranted = empty($accessList);

        if (!$accessGranted) {
            $userGroupUids = GeneralUtility::intExplode(',', $groupList);
            foreach ($userGroupUids as $userGroupUid) {
                if (GeneralUtility::inList($accessList, $userGroupUid)) {
                    $accessGranted = true;
                    break;
                }
            }
        }

        return $accessGranted;
    }
800
801
    /**
     * Parses a query string into an array of GET parameter name => value pairs.
     *
     * The string is split at "&"; each part is split at its first "=". Names and
     * values are decoded with rawurldecode(). Parts with an empty name are dropped,
     * a part without "=" yields an empty string as value. If a name occurs more than
     * once, the last value wins.
     *
     * @param string $inputQuery Input query string
     * @return array
     */
    public function parseParams($inputQuery)
    {
        $paramKeyValues = [];

        foreach (explode('&', (string)$inputQuery) as $paramAndValue) {
            // Pad to two elements so that a parameter without "=" does not read an undefined offset
            list($name, $value) = array_pad(explode('=', $paramAndValue, 2), 2, '');
            if ($name !== '') {
                $paramKeyValues[rawurldecode($name)] = rawurldecode($value);
            }
        }

        return $paramKeyValues;
    }
822
823
    /**
     * Expands the parameter configuration into the individual values of each parameter.
     *
     * Syntax of a parameter value:
     * - A value wrapped in [...] is expanded as described below, any other value is taken literally
     * - The content of the brackets is split by "|"; the parts are processed individually and their results added together
     * - For each part:
     *         - "[int]-[int]" = Integer range, expanded to all values in between, bounds included, from low to high (max. 1000 values). Example "1-34" or "-40--30"
     *         - "_TABLE:[TCA table name];[_PID:[optional page id, default is current page]];[_ENABLELANG:1]" = Look-up of table records, filtering out deleted records. Example "_TABLE:tt_content; _PID:123"
     *           Optional sub parameters read by the code: _PIDFIELD (default "pid"), _FIELD (default "uid", must be "uid" or a TCA column),
     *           _WHERE and _ADDTABLE (appended verbatim to the WHERE clause / table name)
     *           _ENABLELANG:1 picks only records whose transOrigPointerField is <= 0, i.e. original records without their language overlays
     *         - Default: Literal value
     * - After each part the hooks registered in
     *   $GLOBALS['TYPO3_CONF_VARS']['SC_OPTIONS']['crawler/class.tx_crawler_lib.php']['expandParameters'] are called
     *
     * @param array $paramArray Array with key (GET var name) and values (value of GET var which is configuration for expansion)
     * @param integer $pid Current page ID
     * @return array Same keys, each value replaced by the array of its expanded values
     */
    public function expandParameters($paramArray, $pid)
    {
        global $TCA;

        // Traverse parameter names:
        foreach ($paramArray as $p => $v) {
            $v = trim($v);

            // Only a value encapsulated in square brackets is expanded, everything else is literal
            if (substr($v, 0, 1) !== '[' || substr($v, -1) !== ']') {
                // Set the literal value as only value in array:
                $paramArray[$p] = [$v];
                continue;
            }

            // Take the value inside the brackets and reset the paramArray value as an array.
            $v = substr($v, 1, -1);
            $paramArray[$p] = [];

            // Explode parts and traverse them:
            foreach (explode('|', $v) as $pV) {
                // Look for integer range: (fx. 1-34 or -40--30 // reads minus 40 to minus 30)
                if (preg_match('/^(-?[0-9]+)\s*-\s*(-?[0-9]+)$/', trim($pV), $reg)) {
                    // Swap if first is larger than last:
                    if ($reg[1] > $reg[2]) {
                        $temp = $reg[2];
                        $reg[2] = $reg[1];
                        $reg[1] = $temp;
                    }

                    // Traverse range, add values:
                    $runAwayBrake = 1000; // Limit to size of range!
                    for ($a = $reg[1]; $a <= $reg[2]; $a++) {
                        $paramArray[$p][] = $a;
                        $runAwayBrake--;
                        if ($runAwayBrake <= 0) {
                            break;
                        }
                    }
                } elseif (substr(trim($pV), 0, 7) == '_TABLE:') {
                    // Parse the "key:value" sub parameters:
                    $subpartParams = [];
                    foreach (GeneralUtility::trimExplode(';', $pV) as $spV) {
                        // Pad so that a sub parameter without ":" gets a NULL value instead of reading an undefined offset
                        list($pKey, $pVal) = array_pad(GeneralUtility::trimExplode(':', $spV), 2, null);
                        $subpartParams[$pKey] = $pVal;
                    }
                    $table = $subpartParams['_TABLE'];

                    // Table exists:
                    if (isset($TCA[$table])) {
                        $lookUpPid = isset($subpartParams['_PID']) ? intval($subpartParams['_PID']) : $pid;
                        $pidField = isset($subpartParams['_PIDFIELD']) ? trim($subpartParams['_PIDFIELD']) : 'pid';
                        $where = isset($subpartParams['_WHERE']) ? $subpartParams['_WHERE'] : '';
                        $addTable = isset($subpartParams['_ADDTABLE']) ? $subpartParams['_ADDTABLE'] : '';

                        $fieldName = !empty($subpartParams['_FIELD']) ? $subpartParams['_FIELD'] : 'uid';
                        if ($fieldName === 'uid' || !empty($TCA[$table]['columns'][$fieldName])) {
                            $andWhereLanguage = '';
                            $transOrigPointerField = isset($TCA[$table]['ctrl']['transOrigPointerField'])
                                ? $TCA[$table]['ctrl']['transOrigPointerField']
                                : '';

                            if (!empty($subpartParams['_ENABLELANG']) && $transOrigPointerField) {
                                $andWhereLanguage = ' AND ' . $this->db->quoteStr($transOrigPointerField, $table) . ' <= 0 ';
                            }

                            $where = $this->db->quoteStr($pidField, $table) . '=' . intval($lookUpPid) . ' ' .
                                $andWhereLanguage . $where;

                            // Rows are indexed by $fieldName, so the array keys are the values to add
                            $rows = $this->db->exec_SELECTgetRows(
                                $fieldName,
                                $table . $addTable,
                                $where . BackendUtility::deleteClause($table),
                                '',
                                '',
                                '',
                                $fieldName
                            );

                            if (is_array($rows)) {
                                $paramArray[$p] = array_merge($paramArray[$p], array_keys($rows));
                            }
                        }
                    }
                } else { // Just add value:
                    $paramArray[$p][] = $pV;
                }

                // Hook for processing own expandParameters place holder
                if (isset($GLOBALS['TYPO3_CONF_VARS']['SC_OPTIONS']['crawler/class.tx_crawler_lib.php']['expandParameters'])
                    && is_array($GLOBALS['TYPO3_CONF_VARS']['SC_OPTIONS']['crawler/class.tx_crawler_lib.php']['expandParameters'])
                ) {
                    $_params = [
                        'pObj' => &$this,
                        'paramArray' => &$paramArray,
                        'currentKey' => $p,
                        'currentValue' => $pV,
                        'pid' => $pid
                    ];
                    foreach ($GLOBALS['TYPO3_CONF_VARS']['SC_OPTIONS']['crawler/class.tx_crawler_lib.php']['expandParameters'] as $_funcRef) {
                        GeneralUtility::callUserFunction($_funcRef, $_params, $this);
                    }
                }
            }

            // Make unique set of values and sort the parameter array by key:
            $paramArray[$p] = array_unique($paramArray[$p]);
            ksort($paramArray);
        }

        return $paramArray;
    }
948
949
    /**
     * Compiles URLs from a parameter array (output of expandParameters()).
     *
     * Every URL in $urls is combined with every value of the first parameter, then the
     * method recurses with the remaining parameters. Without the limit the number of URLs
     * is the product of the number of values of each parameter. A value that is an empty
     * string adds nothing to the URL.
     *
     * Compilation of one parameter stops as soon as the number of URLs exceeds the
     * extension setting "maxCompileUrls" (forced into 1..1000000000, default 10000).
     *
     * @param array $paramArray Output of expandParameters(): Array with keys (GET var names) and for each an array of values
     * @param array $urls URLs accumulated in this array (for recursion)
     * @return array
     */
    public function compileUrls($paramArray, $urls = [])
    {
        if (!count($paramArray) || !is_array($urls)) {
            return $urls;
        }

        // Loop-invariant, so computed once per recursion level
        $maxUrls = MathUtility::forceIntegerInRange($this->extensionSettings['maxCompileUrls'], 1, 1000000000, 10000);

        // Take the first parameter off the stack; unset() instead of array_shift()
        // keeps numeric GET var names of the remaining parameters from being renumbered
        reset($paramArray);
        $varName = key($paramArray);
        $valueSet = $paramArray[$varName];
        unset($paramArray[$varName]);

        // Traverse value set:
        $newUrls = [];
        foreach ($urls as $url) {
            foreach ($valueSet as $val) {
                $newUrls[] = $url . ((string)$val !== '' ? '&' . rawurlencode((string)$varName) . '=' . rawurlencode((string)$val) : '');

                if (count($newUrls) > $maxUrls) {
                    // Leave both loops, otherwise every further base URL would still add one entry
                    break 2;
                }
            }
        }

        return $this->compileUrls($paramArray, $newUrls);
    }
982
983
    /************************************
984
     *
985
     * Crawler log
986
     *
987
     ************************************/
988
989
    /**
     * Returns (or flushes) the crawler queue records of a page.
     *
     * @param integer $id Page ID for which to look up log entries.
     * @param string $filter Filter: "pending" => entries not yet run (exec_time=0), "finished" => completed entries (exec_time>0), anything else => all entries
     * @param boolean $doFlush If TRUE, the matching entries are DELETED(!) instead of selected and an empty array is returned
     * @param boolean $doFullFlush If TRUE (together with $doFlush), the deletion is not restricted to the page; the filter still applies
     * @param integer $itemsPerPage Maximum number of selected entries, default is 10; values <= 0 disable the limit
     * @return array
     */
    public function getLogEntriesForPageId($id, $filter = '', $doFlush = false, $doFullFlush = false, $itemsPerPage = 10)
    {
        // FIXME: Write Unit tests for Filters
        if ($filter == 'pending') {
            $addWhere = ' AND exec_time=0';
        } elseif ($filter == 'finished') {
            $addWhere = ' AND exec_time>0';
        } else {
            $addWhere = '';
        }

        $pageCondition = 'page_id=' . intval($id);

        if (!$doFlush) {
            $limit = intval($itemsPerPage);

            return $this->db->exec_SELECTgetRows(
                '*',
                'tx_crawler_queue',
                $pageCondition . $addWhere,
                '',
                'scheduled DESC',
                ($limit > 0 ? $limit : '')
            );
        }

        // FIXME: Write unit test that ensures that the right records are deleted.
        $scope = $doFullFlush ? '1=1' : $pageCondition;
        $this->flushQueue($scope . $addWhere);

        return [];
    }
1029
1030
    /**
     * Returns (or flushes) the crawler queue records of a set.
     *
     * @param integer $set_id Set ID for which to look up log entries.
     * @param string $filter Filter: "pending" => entries not yet run (exec_time=0), "finished" => completed entries (exec_time>0), anything else => all entries
     * @param boolean $doFlush If TRUE, the matching entries are DELETED(!) instead of selected and an empty array is returned
     * @param boolean $doFullFlush If TRUE (together with $doFlush), the deletion is not restricted to the set; the filter still applies
     * @param integer $itemsPerPage Maximum number of selected entries, default is 10; values <= 0 disable the limit
     * @return array
     */
    public function getLogEntriesForSetId($set_id, $filter = '', $doFlush = false, $doFullFlush = false, $itemsPerPage = 10)
    {
        // FIXME: Write Unit tests for Filters
        switch ($filter) {
            case 'pending':
                $addWhere = ' AND exec_time=0';
                break;
            case 'finished':
                $addWhere = ' AND exec_time>0';
                break;
            default:
                $addWhere = '';
                break;
        }

        // FIXME: Write unit test that ensures that the right records are deleted.
        if ($doFlush) {
            // Keep the filter on a full flush as well, exactly like getLogEntriesForPageId() does
            $this->flushQueue(($doFullFlush ? '1=1' : ('set_id=' . intval($set_id))) . $addWhere);

            return [];
        }

        return $this->db->exec_SELECTgetRows(
            '*',
            'tx_crawler_queue',
            'set_id=' . intval($set_id) . $addWhere,
            '',
            'scheduled DESC',
            (intval($itemsPerPage) > 0 ? intval($itemsPerPage) : '')
        );
    }
1068
1069
    /**
     * Removes queue entries.
     *
     * If an observer is registered for the "queueEntryFlush" event, the event is posted
     * once per affected set_id (with the uid/set_id rows about to be deleted) before the
     * entries are removed.
     *
     * @param string $where SQL condition selecting the entries to remove; an empty string removes all entries
     * @return void
     */
    protected function flushQueue($where = '')
    {
        $realWhere = strlen($where) > 0 ? $where : '1=1';

        if (EventDispatcher::getInstance()->hasObserver('queueEntryFlush')) {
            $groups = $this->db->exec_SELECTgetRows('DISTINCT set_id', 'tx_crawler_queue', $realWhere);
            if (is_array($groups)) {
                foreach ($groups as $group) {
                    // Parenthesise the caller's condition so that an OR in it cannot escape the set_id
                    // restriction, and compare set_id as an integer instead of a double-quoted string
                    $groupWhere = '(' . $realWhere . ') AND set_id=' . intval($group['set_id']);
                    EventDispatcher::getInstance()->post(
                        'queueEntryFlush',
                        $group['set_id'],
                        $this->db->exec_SELECTgetRows('uid, set_id', 'tx_crawler_queue', $groupWhere)
                    );
                }
            }
        }

        $this->db->exec_DELETEquery('tx_crawler_queue', $realWhere);
    }
1090
1091
    /**
     * Adds a call back entry to the queue (typically called from hooks, see indexed search class "class.crawler.php").
     *
     * The call back reference is stored in the serialized parameters under the key "_CALLBACKOBJ".
     *
     * @param integer $setId Set ID
     * @param array $params Parameters to pass to the call back function; a non-array is replaced by an empty array
     * @param string $callBack Call back object reference, eg. 'EXT:indexed_search/class.crawler.php:&tx_indexedsearch_crawler'
     * @param integer $page_id Page ID to attach the entry to
     * @param integer $schedule Time at which to activate; 0 means the current time
     * @return void
     */
    public function addQueueEntry_callBack($setId, $params, $callBack, $page_id = 0, $schedule = 0)
    {
        $callBackParameters = is_array($params) ? $params : [];
        $callBackParameters['_CALLBACKOBJ'] = $callBack;

        $scheduledTime = intval($schedule);
        if (!$scheduledTime) {
            $scheduledTime = $this->getCurrentTime();
        }

        $this->db->exec_INSERTquery(
            'tx_crawler_queue',
            [
                'page_id' => intval($page_id),
                'parameters' => serialize($callBackParameters),
                'scheduled' => $scheduledTime,
                'exec_time' => 0,
                'set_id' => intval($setId),
                'result_data' => '',
            ]
        );
    }
1120
1121
    /************************************
1122
     *
1123
     * URL setting
1124
     *
1125
     ************************************/
1126
1127
    /**
     * Puts a URL into the crawler queue.
     *
     * Depending on $this->registerQueueEntriesInternallyOnly the entry is either only
     * collected in $this->queueEntries or written to tx_crawler_queue. Before writing,
     * an equal entry is looked up (unless skipped); if one exists, nothing is inserted and
     * the "duplicateUrlInQueue" event is posted, otherwise "urlAddedToQueue" is posted.
     *
     * @param integer $id Page ID
     * @param string $url Complete URL
     * @param array $subCfg Sub configuration array (from TS config)
     * @param integer $tstamp Scheduled-time
     * @param string $configurationHash (optional) configuration hash
     * @param bool $skipInnerDuplicationCheck (optional) skip inner duplication check
     * @return bool TRUE only if a new row was inserted into the database
     */
    public function addUrl(
        $id,
        $url,
        array $subCfg,
        $tstamp,
        $configurationHash = '',
        $skipInnerDuplicationCheck = false
    ) {
        // The order of the keys matters: the serialized array is hashed below
        $parameters = ['url' => $url];

        // fe user group simulation:
        $feGroupList = implode(',', array_unique(GeneralUtility::intExplode(',', $subCfg['userGroups'], true)));
        if ($feGroupList) {
            $parameters['feUserGroupList'] = $feGroupList;
        }

        // Processing instructions and their parameters
        $parameters['procInstructions'] = GeneralUtility::trimExplode(',', $subCfg['procInstrFilter']);
        if (is_array($subCfg['procInstrParams.'])) {
            $parameters['procInstrParams'] = $subCfg['procInstrParams.'];
        }

        // Possible TypoScript Template Parents
        $parameters['rootTemplatePid'] = $subCfg['rootTemplatePid'];

        $serializedParameters = serialize($parameters);
        $queueEntry = [
            'page_id' => intval($id),
            'parameters' => $serializedParameters,
            'parameters_hash' => GeneralUtility::shortMD5($serializedParameters),
            'configuration_hash' => $configurationHash,
            'scheduled' => $tstamp,
            'exec_time' => 0,
            'set_id' => intval($this->setID),
            'result_data' => '',
            'configuration' => $subCfg['key'],
        ];

        if ($this->registerQueueEntriesInternallyOnly) {
            // the entry is only registered, not stored to the database
            $this->queueEntries[] = $queueEntry;

            return false;
        }

        // check if there is already an equal entry
        $duplicates = $skipInnerDuplicationCheck ? [] : $this->getDuplicateRowsIfExist($tstamp, $queueEntry);
        if (count($duplicates) != 0) {
            EventDispatcher::getInstance()->post('duplicateUrlInQueue', $this->setID, ['rows' => $duplicates, 'fieldArray' => $queueEntry]);

            return false;
        }

        $this->db->exec_INSERTquery('tx_crawler_queue', $queueEntry);
        $uid = $this->db->sql_insert_id();
        EventDispatcher::getInstance()->post('urlAddedToQueue', $this->setID, ['uid' => $uid, 'fieldArray' => $queueEntry]);

        return true;
    }
1205
1206
    /**
     * Finds unprocessed queue entries that duplicate the given entry (same page and same parameters hash).
     *
     * If the timestamp is not in the future, any unprocessed entry scheduled up to now counts
     * as a duplicate (with the "enableTimeslot" extension setting additionally entries
     * scheduled within +/- 100 seconds around now).
     * If the timestamp is in the future, the queued entry needs exactly the same timestamp.
     *
     * @param int $tstamp
     * @param array $fieldArray Queue field array; "page_id" and "parameters_hash" are read
     *
     * @return array List of the qid values of the duplicates
     */
    protected function getDuplicateRowsIfExist($tstamp, $fieldArray)
    {
        // Both values are concatenated into SQL below, so force them to integers
        $tstamp = intval($tstamp);
        $currentTime = intval($this->getCurrentTime());

        if ($tstamp > $currentTime) {
            // an entry with a timestamp in the future needs to have the same schedule time
            $where = 'scheduled = ' . $tstamp;
        } elseif (!empty($this->extensionSettings['enableTimeslot'])) {
            // the entry is scheduled with "now"
            $timeBegin = $currentTime - 100;
            $timeEnd = $currentTime + 100;
            $where = ' ((scheduled BETWEEN ' . $timeBegin . ' AND ' . $timeEnd . ' ) OR scheduled <= ' . $currentTime . ') ';
        } else {
            $where = 'scheduled <= ' . $currentTime;
        }

        $result = $this->db->exec_SELECTgetRows(
            'qid',
            'tx_crawler_queue',
            $where .
            ' AND NOT exec_time' .
            ' AND NOT process_id ' .
            ' AND page_id=' . intval($fieldArray['page_id']) .
            ' AND parameters_hash = ' . $this->db->fullQuoteStr($fieldArray['parameters_hash'], 'tx_crawler_queue')
        );

        $rows = [];
        if (is_array($result)) {
            foreach ($result as $value) {
                $rows[] = $value['qid'];
            }
        }

        return $rows;
    }
1256
1257
    /**
     * Gives the current system time as reported by PHP's time().
     *
     * @return int Unix timestamp
     */
    public function getCurrentTime()
    {
        $now = time();

        return $now;
    }
1266
1267
    /************************************
1268
     *
1269
     * URL reading
1270
     *
1271
     ************************************/
1272
1273
    /**
     * Reads the URL of a single queue entry and stores the result in the entry.
     *
     * The entry is locked by writing exec_time before it is processed; the serialized
     * result is written to result_data afterwards. The signals QUEUEITEM_PREPROCESS and
     * QUEUEITEM_POSTPROCESS are emitted around the processing.
     *
     * @param integer $queueId
     * @param boolean $force If set, will process even if exec_time has been set!
     * @return integer 0, or a value with the CLI_STATUS_POLLABLE_PROCESSED bit set if a pollable processing instruction reported success. 0 is also returned if no matching queue entry exists.
     */
    public function readUrl($queueId, $force = false)
    {
        $ret = 0;
        if ($this->debugMode) {
            GeneralUtility::devlog('crawler-readurl start ' . microtime(true), __FUNCTION__);
        }

        // Get entry:
        $queueRows = $this->db->exec_SELECTgetRows(
            '*',
            'tx_crawler_queue',
            'qid=' . intval($queueId) . ($force ? '' : ' AND exec_time=0 AND process_scheduled > 0')
        );
        $queueRec = (is_array($queueRows) && count($queueRows) > 0) ? reset($queueRows) : null;

        if (!is_array($queueRec)) {
            // Nothing to process; return the documented integer instead of NULL
            return $ret;
        }

        // NOTE(review): unserialize() is applied to data stored in the queue table - confirm that it cannot be written by untrusted parties
        $parameters = unserialize($queueRec['parameters']);
        if (is_array($parameters) && !empty($parameters['rootTemplatePid'])) {
            $this->initTSFE((int)$parameters['rootTemplatePid']);
        } else {
            GeneralUtility::sysLog(
                'Page with (' . $queueRec['page_id'] . ') could not be crawled, please check your crawler configuration. Perhaps no Root Template Pid is set',
                'crawler',
                GeneralUtility::SYSLOG_SEVERITY_WARNING
            );
        }

        SignalSlotUtility::emitSignal(
            __CLASS__,
            SignalSlotUtility::SIGNNAL_QUEUEITEM_PREPROCESS,
            [$queueId, &$queueRec]
        );

        // Set exec_time to lock record:
        $field_array = ['exec_time' => $this->getCurrentTime()];

        if (isset($this->processID)) {
            // if multiprocessing is used we need to store the id of the process which has handled this entry
            $field_array['process_id_completed'] = $this->processID;
        }
        $this->db->exec_UPDATEquery('tx_crawler_queue', 'qid=' . intval($queueId), $field_array);

        $result = $this->readUrl_exec($queueRec);

        // readUrl_exec() may hand back a plain string instead of an array with a "content" key
        // NOTE(review): the content may originate from a fetched page; unserialize() on it should be reviewed
        $resultData = (is_array($result) && isset($result['content'])) ? unserialize($result['content']) : false;

        // atm there's no need to point to specific pollable extensions
        if (is_array($resultData)
            && isset($resultData['parameters']['procInstructions'])
            && is_array($resultData['parameters']['procInstructions'])
            && isset($GLOBALS['TYPO3_CONF_VARS']['EXTCONF']['crawler']['pollSuccess'])
            && is_array($GLOBALS['TYPO3_CONF_VARS']['EXTCONF']['crawler']['pollSuccess'])
        ) {
            foreach ($GLOBALS['TYPO3_CONF_VARS']['EXTCONF']['crawler']['pollSuccess'] as $pollable) {
                // only check the success value if the instruction is running
                // it is important to name the pollSuccess key same as the procInstructions key
                if (in_array($pollable, $resultData['parameters']['procInstructions'])
                    && !empty($resultData['success'][$pollable])
                ) {
                    $ret |= self::CLI_STATUS_POLLABLE_PROCESSED;
                }
            }
        }

        // Set result in log which also denotes the end of the processing of this entry.
        $field_array = ['result_data' => serialize($result)];

        SignalSlotUtility::emitSignal(
            __CLASS__,
            SignalSlotUtility::SIGNNAL_QUEUEITEM_POSTPROCESS,
            [$queueId, &$field_array]
        );

        $this->db->exec_UPDATEquery('tx_crawler_queue', 'qid=' . intval($queueId), $field_array);

        if ($this->debugMode) {
            GeneralUtility::devlog('crawler-readurl stop ' . microtime(true), __FUNCTION__);
        }

        return $ret;
    }
1360
1361
    /**
     * Reads the URL for a log entry that has not been inserted yet.
     *
     * The entry is inserted into tx_crawler_queue with exec_time set (which locks it),
     * processed through readUrl_exec(), and its serialized result is written back to
     * result_data after the QUEUEITEM_POSTPROCESS signal has been emitted.
     *
     * @param array $field_array Queue field array,
     *
     * @return string Whatever readUrl_exec() returned for the entry
     */
    public function readUrlFromArray($field_array)
    {
        // Set exec_time to lock record:
        $queueRecord = $field_array;
        $queueRecord['exec_time'] = $this->getCurrentTime();
        $this->db->exec_INSERTquery('tx_crawler_queue', $queueRecord);

        $queueId = $this->db->sql_insert_id();
        $queueRecord['qid'] = $queueId;

        $result = $this->readUrl_exec($queueRecord);

        // Set result in log which also denotes the end of the processing of this entry.
        $updateFields = ['result_data' => serialize($result)];

        SignalSlotUtility::emitSignal(
            __CLASS__,
            SignalSlotUtility::SIGNNAL_QUEUEITEM_POSTPROCESS,
            [$queueId, &$updateFields]
        );

        $this->db->exec_UPDATEquery('tx_crawler_queue', 'qid=' . intval($queueId), $updateFields);

        return $result;
    }
1391
1392
    /**
     * Read URL for a queue record
     *
     * Decodes the serialized "parameters" field of the record. If it names a
     * callback object ("_CALLBACKOBJ"), that object's crawler_execute() is called;
     * otherwise a regular frontend request for parameters['url'] is performed and
     * the "urlCrawled" event is posted.
     *
     * @param array $queueRec Queue record
     * @return string|array|bool 'ERROR' if the parameters could not be decoded,
     *                           an array with the key "content" for callbacks,
     *                           otherwise the return value of requestUrl()
     */
    public function readUrl_exec($queueRec)
    {
        // Decode parameters:
        $parameters = unserialize($queueRec['parameters']);
        if (!is_array($parameters)) {
            return 'ERROR';
        }

        // Calling object:
        if (!empty($parameters['_CALLBACKOBJ'])) {
            $objRef = $parameters['_CALLBACKOBJ'];
            // Objects are handles; assigning the return value by reference is
            // unnecessary and raises "Only variables should be assigned by reference".
            $callBackObj = GeneralUtility::getUserObj($objRef);
            if (!is_object($callBackObj)) {
                return ['content' => 'No object: ' . $objRef];
            }

            unset($parameters['_CALLBACKOBJ']);
            return ['content' => serialize($callBackObj->crawler_execute($parameters, $this))];
        }

        // Regular FE request:
        // The crawler id carries the queue id plus a hash over qid, set_id and the encryption key.
        $crawlerId = $queueRec['qid'] . ':' . md5($queueRec['qid'] . '|' . $queueRec['set_id'] . '|' . $GLOBALS['TYPO3_CONF_VARS']['SYS']['encryptionKey']);

        // Get result:
        $result = $this->requestUrl($parameters['url'], $crawlerId);

        EventDispatcher::getInstance()->post('urlCrawled', $queueRec['set_id'], ['url' => $parameters['url'], 'result' => $result]);

        return $result;
    }
1427
1428
    /**
     * Gets the content of a URL.
     *
     * Either delegates to sendDirectRequest() (extension setting "makeDirectRequests")
     * or opens a socket, sends a HTTP/1.0 GET request and reads the response. If the
     * extension setting "follow30x" is set and the response is a redirect, the target
     * is requested recursively and the redirecting response is kept under "parentRequest".
     *
     * @param string $originalUrl URL to read
     * @param string $crawlerId Crawler ID string (qid + hash to verify)
     * @param integer $timeout Connection timeout, passed on to fsockopen()
     * @param integer $recursion Recursion limiter for 302 redirects
     * @return array|bool Array with the keys "request", "headers" and "content"
     *                    (plus "parentRequest" after a redirect), false on failure.
     *                    For direct requests: whatever sendDirectRequest() returns.
     */
    public function requestUrl($originalUrl, $crawlerId, $timeout = 2, $recursion = 10)
    {
        if (!$recursion) {
            return false;
        }

        // Parse URL, checking for scheme:
        $url = parse_url($originalUrl);

        if ($url === false) {
            if (TYPO3_DLOG) {
                GeneralUtility::devLog(sprintf('Could not parse_url() for string "%s"', $originalUrl), 'crawler', 4, ['crawlerId' => $crawlerId]);
            }
            return false;
        }

        $scheme = isset($url['scheme']) ? $url['scheme'] : '';
        if (!in_array($scheme, ['', 'http', 'https'], true)) {
            if (TYPO3_DLOG) {
                GeneralUtility::devLog(sprintf('Scheme does not match for url "%s"', $originalUrl), 'crawler', 4, ['crawlerId' => $crawlerId]);
            }
            return false;
        }

        // direct request
        if (!empty($this->extensionSettings['makeDirectRequests'])) {
            return $this->sendDirectRequest($originalUrl, $crawlerId);
        }

        // An URL without path (e.g. "http://example.com") must still produce a valid request line.
        if (!isset($url['path']) || $url['path'] === '') {
            $url['path'] = '/';
        }

        $reqHeaders = $this->buildRequestHeaderArray($url, $crawlerId);

        // thanks to Pierrick Caillon for adding proxy support
        $rurl = $url;

        if (!empty($GLOBALS['TYPO3_CONF_VARS']['SYS']['curlUse']) && !empty($GLOBALS['TYPO3_CONF_VARS']['SYS']['curlProxyServer'])) {
            $rurl = parse_url($GLOBALS['TYPO3_CONF_VARS']['SYS']['curlProxyServer']);
            if (!is_array($rurl)) {
                $rurl = [];
            }
            // The proxy is sent the absolute URL in the request line.
            $targetHost = isset($url['host']) ? $url['host'] : '';
            $targetPort = (isset($url['port']) && $url['port'] > 0) ? ':' . $url['port'] : '';
            $url['path'] = $scheme . '://' . $targetHost . $targetPort . $url['path'];
            $reqHeaders = $this->buildRequestHeaderArray($url, $crawlerId);
        }

        $host = isset($rurl['host']) ? $rurl['host'] : '';
        $explicitPort = (isset($rurl['port']) && $rurl['port'] > 0) ? $rurl['port'] : 0;

        if ($scheme == 'https') {
            $host = 'ssl://' . $host;
            $port = $explicitPort ?: 443;
        } else {
            $port = $explicitPort ?: 80;
        }

        $startTime = microtime(true);
        $fp = fsockopen($host, $port, $errno, $errstr, $timeout);

        if (!$fp) {
            if (TYPO3_DLOG) {
                GeneralUtility::devLog(sprintf('Error while opening "%s"', $originalUrl), 'crawler', 4, ['crawlerId' => $crawlerId]);
            }
            return false;
        }

        // Request message:
        $msg = implode("\r\n", $reqHeaders) . "\r\n\r\n";
        fputs($fp, $msg);

        // Read response:
        $d = $this->getHttpResponseFromStream($fp);
        fclose($fp);

        $time = microtime(true) - $startTime;
        $this->log($originalUrl . ' ' . $time);

        // Implode content and headers:
        $result = [
            'request' => $msg,
            'headers' => implode('', $d['headers']),
            'content' => implode('', (array)$d['content'])
        ];

        if (!empty($this->extensionSettings['follow30x'])) {
            $user = isset($url['user']) ? $url['user'] : '';
            $pass = isset($url['pass']) ? $url['pass'] : '';
            $newUrl = $this->getRequestUrlFrom302Header($d['headers'], $user, $pass);

            if ($newUrl) {
                // Follow the redirect exactly once per hop, keeping the timeout and
                // decreasing the recursion budget.
                $newRequestUrl = $this->requestUrl($newUrl, $crawlerId, $timeout, $recursion - 1);

                if (!is_array($newRequestUrl)) {
                    if (TYPO3_DLOG) {
                        GeneralUtility::devLog(sprintf('Error while opening "%s"', $newUrl), 'crawler', 4, ['crawlerId' => $crawlerId]);
                    }
                    return false;
                }

                $result = array_merge(['parentRequest' => $result], $newRequestUrl);
            }
        }

        return $result;
    }
1530
1531
    /**
     * Determines the base path of the website frontend
     * (for http://mydomain.com/cms/index.php this is "/cms/").
     *
     * Sources, in order of precedence: the extension setting "frontendBasePath",
     * $GLOBALS['TSFE']->absRefPrefix, and - outside of CLI mode - the
     * TYPO3_SITE_PATH environment value. Without any of them "/" is used.
     *
     * @return string Base path of the website frontend
     */
    protected function getFrontendBasePath()
    {
        if (!empty($this->extensionSettings['frontendBasePath'])) {
            // Explicitly configured in the extension settings
            $basePath = $this->extensionSettings['frontendBasePath'];
        } elseif (!empty($GLOBALS['TSFE']->absRefPrefix)) {
            // Fall back to config.absRefPrefix
            $basePath = $GLOBALS['TSFE']->absRefPrefix;
        } elseif (!(defined('TYPO3_REQUESTTYPE_CLI') && TYPO3_REQUESTTYPE_CLI)) {
            // Not in CLI mode: the path can be taken from the $_SERVER environment
            $basePath = GeneralUtility::getIndpEnv('TYPO3_SITE_PATH');
        } else {
            $basePath = '/';
        }

        if ($basePath == '/') {
            return $basePath;
        }

        // Base path must be '/<pathSegments>/':
        $withLeadingSlash = '/' . ltrim($basePath, '/');

        return rtrim($withLeadingSlash, '/') . '/';
    }
1561
1562
    /**
     * Runs the given command through shell_exec() and hands back its output.
     *
     * @param string $command Shell command to be executed
     * @return string Outputted result of the command execution
     */
    protected function executeShellCommand($command)
    {
        return shell_exec($command);
    }
1573
1574
    /**
     * Reads HTTP response from the given stream.
     *
     * Header lines are trimmed and collected until the first empty line; everything
     * after that is collected unmodified as content.
     *
     * @param  resource $streamPointer  Pointer to connection stream.
     * @return array                    Associative array with the following items:
     *                                  headers <array> Response headers sent by server.
     *                                  content <array> Content, with each line as an array item.
     */
    protected function getHttpResponseFromStream($streamPointer)
    {
        $response = ['headers' => [], 'content' => []];

        if (!is_resource($streamPointer)) {
            return $response;
        }

        // read headers
        // Compare against false explicitly: a chunk like "0" is falsy but valid data.
        while (($line = fgets($streamPointer, 2048)) !== false) {
            $line = trim($line);
            if ($line === '') {
                // empty line separates headers from content
                break;
            }
            $response['headers'][] = $line;
        }

        // read content
        while (($line = fgets($streamPointer, 2048)) !== false) {
            $response['content'][] = $line;
        }

        return $response;
    }
1605
1606
    /**
     * Appends a timestamped line to the file named by the extension setting
     * "logFileName". Does nothing if that setting is empty; reports to the
     * devLog if the file could not be written.
     *
     * @param string $message Text to append to the log file
     * @return void
     */
    protected function log($message)
    {
        if (empty($this->extensionSettings['logFileName'])) {
            return;
        }

        $logFile = $this->extensionSettings['logFileName'];
        $logLine = date('Ymd His') . ' ' . $message . PHP_EOL;

        // Warnings are suppressed here; a failed write is reported below instead.
        if (@file_put_contents($logFile, $logLine, FILE_APPEND)) {
            return;
        }

        GeneralUtility::devLog('File "' . $logFile . '" could not be written, please check file permissions.', 'crawler', LogLevel::INFO);
    }
1618
1619
    /**
     * Builds HTTP request headers.
     *
     * Produces the request line and header lines of a HTTP/1.0 GET request.
     * Missing URL components are treated as empty; a missing path becomes "/"
     * so that the request line is always valid.
     *
     * @param array $url URL components as returned by parse_url()
     * @param string $crawlerId Value for the "X-T3crawler" header
     *
     * @return array Request line followed by the header lines (without line breaks)
     */
    protected function buildRequestHeaderArray(array $url, $crawlerId)
    {
        $path = (isset($url['path']) && $url['path'] !== '') ? $url['path'] : '/';
        $query = isset($url['query']) ? (string)$url['query'] : '';
        $host = isset($url['host']) ? $url['host'] : '';
        $user = isset($url['user']) ? (string)$url['user'] : '';
        $pass = isset($url['pass']) ? $url['pass'] : '';

        $reqHeaders = [];
        // Compare against '' so that a query string of "0" is not dropped.
        $reqHeaders[] = 'GET ' . $path . ($query !== '' ? '?' . $query : '') . ' HTTP/1.0';
        $reqHeaders[] = 'Host: ' . $host;
        if (stripos($query, 'ADMCMD_previewWS') !== false) {
            $reqHeaders[] = 'Cookie: $Version="1"; be_typo_user="1"; $Path=/';
        }
        $reqHeaders[] = 'Connection: close';
        if ($user !== '') {
            $reqHeaders[] = 'Authorization: Basic ' . base64_encode($user . ':' . $pass);
        }
        $reqHeaders[] = 'X-T3crawler: ' . $crawlerId;
        $reqHeaders[] = 'User-Agent: TYPO3 crawler';

        return $reqHeaders;
    }
1643
1644
    /**
     * Check if the submitted HTTP-Header contains a redirect location and built new crawler-url
     *
     * Only responses whose status line contains "301 Moved", "302 Found" or
     * "302 Moved" are considered. If a user is given, the credentials are inserted
     * into the redirect target so that the follow-up request authenticates as well.
     *
     * @param array $headers HTTP Header lines, status line first
     * @param string $user HTTP Auth. User
     * @param string $pass HTTP Auth. Password
     * @return bool|string URL to request next, false if there is no usable redirect
     */
    protected function getRequestUrlFrom302Header($headers, $user = '', $pass = '')
    {
        if (!is_array($headers) || !isset($headers[0])) {
            return false;
        }

        $statusLine = $headers[0];
        $isRedirect = stripos($statusLine, '301 Moved') !== false
            || stripos($statusLine, '302 Found') !== false
            || stripos($statusLine, '302 Moved') !== false;
        if (!$isRedirect) {
            return false;
        }

        // Find the Location header. Header names are case-insensitive and the value
        // may itself contain colons, so split on the first colon only.
        $location = '';
        foreach ($headers as $headerLine) {
            $parts = explode(':', $headerLine, 2);
            if (count($parts) === 2 && strcasecmp(trim($parts[0]), 'Location') === 0) {
                $location = trim($parts[1]);
                break;
            }
        }
        if ($location === '') {
            return false;
        }

        if ($user == '') {
            return $location;
        }

        $target = parse_url($location);
        if (!$target) {
            return false;
        }
        if (!isset($target['scheme']) || !isset($target['host'])) {
            // Relative redirect target: there is no authority part to put credentials into.
            return $location;
        }

        $newUrl = $target['scheme'] . '://' . $user . ':' . $pass . '@' . $target['host'];
        if (isset($target['port'])) {
            $newUrl .= ':' . $target['port'];
        }
        if (isset($target['path'])) {
            $newUrl .= $target['path'];
        }
        if (isset($target['query']) && $target['query'] !== '') {
            $newUrl .= '?' . $target['query'];
        }

        return $newUrl;
    }
1686
1687
    /**************************
1688
     *
1689
     * tslib_fe hooks:
1690
     *
1691
     **************************/
1692
1693
    /**
     * Initialization hook (called after database connection)
     * Takes the "HTTP_X_T3CRAWLER" header and looks up queue record and verifies if the session comes from the system (by comparing hashes)
     *
     * The header value has the form "<qid>:<hash>". On success the crawler state is
     * stored in $params['pObj']->applicationData['tx_crawler']; otherwise the request
     * is terminated. Requests without the header are left untouched.
     *
     * @param array $params Parameters from frontend
     * @param object $ref TSFE object (reference under PHP5), not used here
     * @return void
     *
     * FIXME: Look like this is not used, in commit 9910d3f40cce15f4e9b7bcd0488bf21f31d53ebc it's added as public,
     * FIXME: I think this can be removed. (TNM)
     */
    public function fe_init(&$params, $ref)
    {
        if (!isset($_SERVER['HTTP_X_T3CRAWLER'])) {
            return;
        }

        // Authenticate crawler request:
        // Pad the parts so that a header without ":" does not raise an undefined offset.
        list($queueId, $hash) = array_pad(explode(':', $_SERVER['HTTP_X_T3CRAWLER']), 2, '');

        // exec_SELECTgetSingleRow() hands back the row itself, so it must not be
        // destructured with list().
        $queueRec = $this->db->exec_SELECTgetSingleRow('*', 'tx_crawler_queue', 'qid=' . intval($queueId));

        // If a crawler record was found and hash was matching, set it up:
        if (is_array($queueRec) && $hash === md5($queueRec['qid'] . '|' . $queueRec['set_id'] . '|' . $GLOBALS['TYPO3_CONF_VARS']['SYS']['encryptionKey'])) {
            $params['pObj']->applicationData['tx_crawler']['running'] = true;
            $params['pObj']->applicationData['tx_crawler']['parameters'] = unserialize($queueRec['parameters']);
            $params['pObj']->applicationData['tx_crawler']['log'] = [];
        } else {
            die('No crawler entry found!');
        }
    }
1721
1722
    /*****************************
1723
     *
1724
     * Compiling URLs to crawl - tools
1725
     *
1726
     *****************************/
1727
1728
    /**
     * Builds the table rows for a page tree and the URLs compiled for each page.
     *
     * Stores the given options on the controller, loads the page tree below $id
     * (resolving mount points, doktype 7) and renders every page through
     * drawURLs_addRowsForPage().
     *
     * @param integer $id Root page id to start from.
     * @param integer $depth Depth of tree, 0=only id-page, 1= on sublevel, 99 = infinite
     * @param integer $scheduledTime Unix Time when the URL is timed to be visited when put in queue
     * @param integer $reqMinute Number of requests per minute (creates the interleave between requests)
     * @param boolean $submitCrawlUrls If set, submits the URLs to queue in database (real crawling)
     * @param boolean $downloadCrawlUrls If set (and submitcrawlUrls is false) will fill $downloadUrls with entries)
     * @param array $incomingProcInstructions Array of processing instructions
     * @param array $configurationSelection Array of configuration keys
     * @return string Concatenated HTML rows of all pages in the tree
     */
    public function getPageTreeAndUrls(
        $id,
        $depth,
        $scheduledTime,
        $reqMinute,
        $submitCrawlUrls,
        $downloadCrawlUrls,
        array $incomingProcInstructions,
        array $configurationSelection
    ) {
        // NOTE(review): $BACK_PATH is not read in this method; kept because legacy
        // backend rendering helpers may read the global - confirm before removing.
        global $BACK_PATH;
        global $LANG;
        if (!is_object($LANG)) {
            $LANG = GeneralUtility::makeInstance('language');
            $LANG->init(0);
        }
        $this->scheduledTime = $scheduledTime;
        $this->reqMinute = $reqMinute;
        $this->submitCrawlUrls = $submitCrawlUrls;
        $this->downloadCrawlUrls = $downloadCrawlUrls;
        $this->incomingProcInstructions = $incomingProcInstructions;
        $this->incomingConfigurationSelection = $configurationSelection;

        $this->duplicateTrack = [];
        $this->downloadUrls = [];

        // Drawing tree:
        /* @var PageTreeView $tree */
        $tree = GeneralUtility::makeInstance(PageTreeView::class);
        $perms_clause = $GLOBALS['BE_USER']->getPagePermsClause(1);
        $tree->init('AND ' . $perms_clause);

        $pageInfo = BackendUtility::readPageAccess($id, $perms_clause);
        if (is_array($pageInfo)) {
            // Set root row:
            $tree->tree[] = [
                'row' => $pageInfo,
                'HTML' => IconUtility::getIconForRecord('pages', $pageInfo)
            ];
        }

        // Get branch beneath:
        if ($depth) {
            $tree->getTree($id, $depth, '');
        }

        // Traverse page tree:
        $code = '';

        foreach ($tree->tree as $data) {
            $this->MP = false;

            // recognize mount points
            if ($data['row']['doktype'] == 7) {
                // Cast the uid like the other queries of this class do before it enters SQL.
                $mountpage = $this->db->exec_SELECTgetRows('*', 'pages', 'uid = ' . intval($data['row']['uid']));

                // fetch mounted pages
                $this->MP = $mountpage[0]['mount_pid'] . '-' . $data['row']['uid'];

                $mountTree = GeneralUtility::makeInstance(PageTreeView::class);
                $mountTree->init('AND ' . $perms_clause);
                $mountTree->getTree($mountpage[0]['mount_pid'], $depth, '');

                foreach ($mountTree->tree as $mountData) {
                    $code .= $this->drawURLs_addRowsForPage(
                        $mountData['row'],
                        $mountData['HTML'] . BackendUtility::getRecordTitle('pages', $mountData['row'], true)
                    );
                }

                // replace page when mount_pid_ol is enabled
                if ($mountpage[0]['mount_pid_ol']) {
                    $data['row']['uid'] = $mountpage[0]['mount_pid'];
                } else {
                    // if the mount_pid_ol is not set the MP must not be used for the mountpoint page
                    $this->MP = false;
                }
            }

            $code .= $this->drawURLs_addRowsForPage(
                $data['row'],
                $data['HTML'] . BackendUtility::getRecordTitle('pages', $data['row'], true)
            );
        }

        return $code;
    }
1826
1827
    /**
     * Expands exclude string
     *
     * The string is a comma separated list of "<pid>" or "<pid>+<depth>" parts.
     * A part without depth excludes only the page itself; "<pid>+" without a
     * number is expanded with depth 99. Results are cached per exclude string.
     *
     * @param string $excludeString Exclude string
     * @return array List of unique page ids covered by the exclude string
     */
    public function expandExcludeString($excludeString)
    {
        // internal static caches;
        static $expandedExcludeStringCache;
        static $treeCache;

        // isset() instead of empty(): an empty result is a valid cache entry, too.
        if (!isset($expandedExcludeStringCache[$excludeString])) {
            $pidList = [];

            if (!empty($excludeString)) {
                /** @var PageTreeView $tree */
                $tree = GeneralUtility::makeInstance(PageTreeView::class);
                $tree->init('AND ' . $this->backendUser->getPagePermsClause(1));

                $excludeParts = GeneralUtility::trimExplode(',', $excludeString);

                foreach ($excludeParts as $excludePart) {
                    // Pad to two elements so that a part without "+" does not
                    // raise an undefined offset for $depth.
                    list($pid, $depth) = array_pad(GeneralUtility::trimExplode('+', $excludePart), 2, '');

                    // default is "page only" = "depth=0"
                    if (empty($depth)) {
                        $depth = (stristr($excludePart, '+')) ? 99 : 0;
                    }

                    $pidList[] = $pid;

                    if ($depth > 0) {
                        if (empty($treeCache[$pid][$depth])) {
                            $tree->reset();
                            $tree->getTree($pid, $depth);
                            $treeCache[$pid][$depth] = $tree->tree;
                        }

                        foreach ($treeCache[$pid][$depth] as $data) {
                            $pidList[] = $data['row']['uid'];
                        }
                    }
                }
            }

            $expandedExcludeStringCache[$excludeString] = array_unique($pidList);
        }

        return $expandedExcludeStringCache[$excludeString];
    }
1878
1879
    /**
     * Create the rows for display of the page tree
     * For each page a number of rows are shown displaying GET variable configuration
     *
     * Every configuration returned by getUrlsForPageRow() (narrowed down to
     * $this->incomingConfigurationSelection if that is not empty) yields one <tr>;
     * a page without any configuration yields a single "No entries" row.
     *
     * @param array $pageRow Page row
     * @param string $pageTitleAndIcon Page icon and title for row
     * @return string HTML <tr> content (one or more)
     */
    public function drawURLs_addRowsForPage(array $pageRow, $pageTitleAndIcon)
    {
        // NOTE(review): presumably filled by getUrlsForPageRow() when the page is
        // skipped - confirm against that method.
        $skipMessage = '';

        // Get list of configurations
        $configurations = $this->getUrlsForPageRow($pageRow, $skipMessage);

        if (count($this->incomingConfigurationSelection) > 0) {
            // remove configuration that does not match the current selection
            foreach ($configurations as $confKey => $confArray) {
                if (!in_array($confKey, $this->incomingConfigurationSelection)) {
                    unset($configurations[$confKey]);
                }
            }
        }

        // Traverse parameter combinations:
        // $c counts the rows rendered so far for this page
        $c = 0;
        $content = '';
        if (count($configurations)) {
            foreach ($configurations as $confKey => $confArray) {

                // Title column:
                // only the first row carries the title cell, spanning all configuration rows
                if (!$c) {
                    $titleClm = '<td rowspan="' . count($configurations) . '">' . $pageTitleAndIcon . '</td>';
                } else {
                    $titleClm = '';
                }

                // Pages listed by the configuration's "exclude" setting only get a notice row
                if (!in_array($pageRow['uid'], $this->expandExcludeString($confArray['subCfg']['exclude']))) {

                    // URL list:
                    $urlList = $this->urlListFromUrlArray(
                        $confArray,
                        $pageRow,
                        $this->scheduledTime,
                        $this->reqMinute,
                        $this->submitCrawlUrls,
                        $this->downloadCrawlUrls,
                        $this->duplicateTrack,
                        $this->downloadUrls,
                        $this->incomingProcInstructions // if empty the urls won't be filtered by processing instructions
                    );

                    // Expanded parameters:
                    // $calcAccu collects the number of values per GET variable,
                    // $calcRes is the product of these numbers
                    $paramExpanded = '';
                    $calcAccu = [];
                    $calcRes = 1;
                    foreach ($confArray['paramExpanded'] as $gVar => $gVal) {
                        $paramExpanded .= '
                            <tr>
                                <td class="bgColor4-20">' . htmlspecialchars('&' . $gVar . '=') . '<br/>' .
                                                '(' . count($gVal) . ')' .
                                                '</td>
                                <td class="bgColor4" nowrap="nowrap">' . nl2br(htmlspecialchars(implode(chr(10), $gVal))) . '</td>
                            </tr>
                        ';
                        $calcRes *= count($gVal);
                        $calcAccu[] = count($gVal);
                    }
                    $paramExpanded = '<table class="lrPadding c-list param-expanded">' . $paramExpanded . '</table>';
                    $paramExpanded .= 'Comb: ' . implode('*', $calcAccu) . '=' . $calcRes;

                    // Options
                    // NOTE(review): these sub-configuration values are emitted without
                    // htmlspecialchars() - confirm they are trusted.
                    $optionValues = '';
                    if ($confArray['subCfg']['userGroups']) {
                        $optionValues .= 'User Groups: ' . $confArray['subCfg']['userGroups'] . '<br/>';
                    }
                    if ($confArray['subCfg']['baseUrl']) {
                        $optionValues .= 'Base Url: ' . $confArray['subCfg']['baseUrl'] . '<br/>';
                    }
                    if ($confArray['subCfg']['procInstrFilter']) {
                        $optionValues .= 'ProcInstr: ' . $confArray['subCfg']['procInstrFilter'] . '<br/>';
                    }

                    // Compile row:
                    $content .= '
                        <tr class="bgColor' . ($c % 2 ? '-20' : '-10') . '">
                            ' . $titleClm . '
                            <td>' . htmlspecialchars($confKey) . '</td>
                            <td>' . nl2br(htmlspecialchars(rawurldecode(trim(str_replace('&', chr(10) . '&', GeneralUtility::implodeArrayForUrl('', $confArray['paramParsed'])))))) . '</td>
                            <td>' . $paramExpanded . '</td>
                            <td nowrap="nowrap">' . $urlList . '</td>
                            <td nowrap="nowrap">' . $optionValues . '</td>
                            <td nowrap="nowrap">' . DebugUtility::viewArray($confArray['subCfg']['procInstrParams.']) . '</td>
                        </tr>';
                } else {
                    $content .= '<tr class="bgColor' . ($c % 2 ? '-20' : '-10') . '">
                            ' . $titleClm . '
                            <td>' . htmlspecialchars($confKey) . '</td>
                            <td colspan="5"><em>No entries</em> (Page is excluded in this configuration)</td>
                        </tr>';
                }

                $c++;
            }
        } else {
            // No configuration left for this page: show the skip message, if there is one
            $message = !empty($skipMessage) ? ' (' . $skipMessage . ')' : '';

            // Compile row:
            $content .= '
                <tr class="bgColor-20" style="border-bottom: 1px solid black;">
                    <td>' . $pageTitleAndIcon . '</td>
                    <td colspan="6"><em>No entries</em>' . $message . '</td>
                </tr>';
        }

        return $content;
    }
1996
1997
    /**
     * Counts the queue entries that are due (scheduled <= now) but have neither
     * been executed (exec_time=0) nor assigned to a process (process_scheduled=0).
     *
     * @return int Number of unprocessed queue entries, 0 if no count row could be fetched
     */
    public function getUnprocessedItemsCount()
    {
        $res = $this->db->exec_SELECTquery(
            'count(*) as num',
            'tx_crawler_queue',
            'exec_time=0 AND process_scheduled=0 AND scheduled<=' . $this->getCurrentTime()
        );

        $count = $this->db->sql_fetch_assoc($res);

        // The database hands back the count as a string; return the documented int.
        return is_array($count) ? (int)$count['num'] : 0;
    }
2011
2012
    /*****************************
2013
     *
2014
     * CLI functions
2015
     *
2016
     *****************************/
2017
2018
    /**
     * Main function for running from Command Line PHP script (cron job)
     * See ext/crawler/cli/crawler_cli.phpsh for details
     *
     * Prints the help text and exits when -h or --help is given. Otherwise it tries
     * to register a new crawler process, lets CLI_run() work on one batch of queue
     * entries and releases the process afterwards.
     *
     * @return int Bitmask built from the self::CLI_STATUS_* constants
     */
    public function CLI_main()
    {
        $this->setAccessMode('cli');
        $result = self::CLI_STATUS_NOTHING_PROCCESSED;
        $cliObj = GeneralUtility::makeInstance(CrawlerCommandLineController::class);

        if (isset($cliObj->cli_args['-h']) || isset($cliObj->cli_args['--help'])) {
            $cliObj->cli_validateArgs();
            $cliObj->cli_help();
            exit;
        }

        if (!$this->getDisabled() && $this->CLI_checkAndAcquireNewProcess($this->CLI_buildProcessId())) {
            // Command line values take precedence over the extension settings.
            $countInARun = $cliObj->cli_argValue('--countInARun') ? intval($cliObj->cli_argValue('--countInARun')) : $this->extensionSettings['countInARun'];
            // Seconds
            $sleepAfterFinish = $cliObj->cli_argValue('--sleepAfterFinish') ? intval($cliObj->cli_argValue('--sleepAfterFinish')) : $this->extensionSettings['sleepAfterFinish'];
            // Milliseconds
            $sleepTime = $cliObj->cli_argValue('--sleepTime') ? intval($cliObj->cli_argValue('--sleepTime')) : $this->extensionSettings['sleepTime'];

            try {
                // Run process:
                $result = $this->CLI_run($countInARun, $sleepTime, $sleepAfterFinish);
            } catch (\Exception $e) {
                $this->CLI_debug(get_class($e) . ': ' . $e->getMessage());
                $result = self::CLI_STATUS_ABORTED;
            }

            // Cleanup
            $this->db->exec_DELETEquery('tx_crawler_process', 'assigned_items_count = 0');

            //TODO can't we do that in a clean way?
            $this->CLI_releaseProcesses($this->CLI_buildProcessId());

            // Query the remaining amount once so the debug output and the status flag agree.
            $remainingItems = $this->getUnprocessedItemsCount();
            $this->CLI_debug("Unprocessed Items remaining:" . $remainingItems . " (" . $this->CLI_buildProcessId() . ")");
            $result |= ($remainingItems > 0 ? self::CLI_STATUS_REMAIN : self::CLI_STATUS_NOTHING_PROCCESSED);
        } else {
            // Crawler disabled or no process slot available.
            $result |= self::CLI_STATUS_ABORTED;
        }

        return $result;
    }
2065
2066
    /**
     * Function executed by crawler_im.php cli script.
     *
     * Collects the URLs of a page tree and, depending on the "-o" option,
     * prints the download URLs ("url"), puts the entries into the queue ("queue"),
     * executes them right away ("exec") or only reports what was found (anything else).
     * Prints the help text and exits when no page id argument is given.
     *
     * @return void
     */
    public function CLI_main_im()
    {
        $this->setAccessMode('cli_im');

        $cliObj = GeneralUtility::makeInstance(QueueCommandLineController::class);

        // Force user to admin state and set workspace to "Live":
        $this->backendUser->user['admin'] = 1;
        $this->backendUser->setWorkspace(0);

        // Print help
        if (!isset($cliObj->cli_args['_DEFAULT'][1])) {
            $cliObj->cli_validateArgs();
            $cliObj->cli_help();
            exit;
        }

        $cliObj->cli_validateArgs();

        // The output mode drives several decisions below, so read it only once.
        $outputMode = $cliObj->cli_argValue('-o');
        $submitToQueue = ($outputMode === 'queue' || $outputMode === 'exec');

        if ($outputMode === 'exec') {
            $this->registerQueueEntriesInternallyOnly = true;
        }

        if (isset($cliObj->cli_args['_DEFAULT'][2])) {
            // Crawler is called over TYPO3 BE
            $pageId = MathUtility::forceIntegerInRange($cliObj->cli_args['_DEFAULT'][2], 0);
        } else {
            // Crawler is called over cli
            $pageId = MathUtility::forceIntegerInRange($cliObj->cli_args['_DEFAULT'][1], 0);
        }

        $configurationKeys = $this->getConfigurationKeys($cliObj);

        // NOTE(review): getConfigurationKeys() in this file always returns an array,
        // so this fallback looks unreachable - confirm before relying on it.
        if (!is_array($configurationKeys)) {
            $configurations = $this->getUrlsForPageId($pageId);
            if (is_array($configurations)) {
                $configurationKeys = array_keys($configurations);
            } else {
                $configurationKeys = [];
            }
        }

        if ($submitToQueue) {
            $reason = new Reason();
            $reason->setReason(Reason::REASON_GUI_SUBMIT);
            $reason->setDetailText('The cli script of the crawler added to the queue');
            // NOTE(review): the event carries the setID from before the new one is
            // generated further down - confirm this is intended.
            EventDispatcher::getInstance()->post(
                'invokeQueueChange',
                $this->setID,
                ['reason' => $reason]
            );
        }

        if ($this->extensionSettings['cleanUpOldQueueEntries']) {
            $this->cleanUpOldQueueEntries();
        }

        $this->setID = (int) GeneralUtility::md5int(microtime());
        $this->getPageTreeAndUrls(
            $pageId,
            MathUtility::forceIntegerInRange($cliObj->cli_argValue('-d'), 0, 99),
            $this->getCurrentTime(),
            MathUtility::forceIntegerInRange($cliObj->cli_isArg('-n') ? $cliObj->cli_argValue('-n') : 30, 1, 1000),
            $submitToQueue,
            $outputMode === 'url',
            GeneralUtility::trimExplode(',', $cliObj->cli_argValue('-proc'), true),
            $configurationKeys
        );

        if ($outputMode === 'url') {
            $cliObj->cli_echo(implode(chr(10), $this->downloadUrls) . chr(10), true);
        } elseif ($outputMode === 'exec') {
            $cliObj->cli_echo("Executing " . count($this->urlList) . " requests right away:\n\n");
            $cliObj->cli_echo(implode(chr(10), $this->urlList) . chr(10));
            $cliObj->cli_echo("\nProcessing:\n");

            foreach ($this->queueEntries as $queueRec) {
                // NOTE(review): unserialize() on stored queue data - confirm this column
                // can never contain attacker-controlled content.
                $p = unserialize($queueRec['parameters']);

                // Malformed parameters must not break the output of the whole run.
                $url = (is_array($p) && isset($p['url'])) ? $p['url'] : '';
                $procInstructions = (is_array($p) && isset($p['procInstructions']) && is_array($p['procInstructions']))
                    ? $p['procInstructions']
                    : [];
                $cliObj->cli_echo($url . ' (' . implode(',', $procInstructions) . ') => ');

                $result = $this->readUrlFromArray($queueRec);

                $requestResult = unserialize($result['content']);
                if (is_array($requestResult)) {
                    // Log lines are printed one per line, indented by two tabs.
                    $resLog = (isset($requestResult['log']) && is_array($requestResult['log']))
                        ? chr(10) . chr(9) . chr(9) . implode(chr(10) . chr(9) . chr(9), $requestResult['log'])
                        : '';
                    $cliObj->cli_echo('OK: ' . $resLog . chr(10));
                } else {
                    $cliObj->cli_echo('Error checking Crawler Result: ' . substr(preg_replace('/\s+/', ' ', strip_tags($result['content'])), 0, 30000) . '...' . chr(10));
                }
            }
        } elseif ($outputMode === 'queue') {
            $cliObj->cli_echo("Putting " . count($this->urlList) . " entries in queue:\n\n");
            $cliObj->cli_echo(implode(chr(10), $this->urlList) . chr(10));
        } else {
            $cliObj->cli_echo(count($this->urlList) . " entries found for processing. (Use -o to decide action):\n\n", true);
            $cliObj->cli_echo(implode(chr(10), $this->urlList) . chr(10), true);
        }
    }
2169
2170
    /**
     * CLI entry point for flushing queue/log entries.
     *
     * Expects a page id as first argument (0 requests a full flush) and a mode in
     * the "-o" option: "all", "finished" or "pending". Any other mode prints the help.
     * Prints the help text and exits when the page id argument is missing.
     *
     * @return bool false if the mode is unknown or getLogEntriesForPageId() returned false
     */
    public function CLI_main_flush()
    {
        $this->setAccessMode('cli_flush');
        $cliObj = GeneralUtility::makeInstance(FlushCommandLineController::class);

        // Force user to admin state and set workspace to "Live":
        $this->backendUser->user['admin'] = 1;
        $this->backendUser->setWorkspace(0);

        // Without a page id argument there is nothing to do but show the help.
        if (!isset($cliObj->cli_args['_DEFAULT'][1])) {
            $cliObj->cli_validateArgs();
            $cliObj->cli_help();
            exit;
        }

        $cliObj->cli_validateArgs();
        $pageId = MathUtility::forceIntegerInRange($cliObj->cli_args['_DEFAULT'][1], 0);
        $fullFlush = ($pageId == 0);

        $mode = $cliObj->cli_argValue('-o');

        // Translate the mode into the filter handed to getLogEntriesForPageId().
        switch ($mode) {
            case 'all':
                $filter = '';
                break;
            case 'finished':
            case 'pending':
                $filter = $mode;
                break;
            default:
                $cliObj->cli_validateArgs();
                $cliObj->cli_help();
                return false;
        }

        $result = $this->getLogEntriesForPageId($pageId, $filter, true, $fullFlush);

        return $result !== false;
    }
2213
2214
    /**
     * Obtains configuration keys from the CLI arguments
     *
     * Reads the comma separated value of the "-conf" option and splits it into
     * trimmed keys.
     *
     * @param  QueueCommandLineController $cliObj    Command line object
     * @return array                        List of keys, empty array if "-conf" is blank
     */
    protected function getConfigurationKeys(QueueCommandLineController &$cliObj)
    {
        // Cast before trimming: cli_argValue() presumably does not return a string
        // when the option is absent - TODO confirm.
        $parameter = trim((string)$cliObj->cli_argValue('-conf'));

        if ($parameter === '') {
            return [];
        }

        return GeneralUtility::trimExplode(',', $parameter);
    }
2225
2226
    /**
     * Running the functionality of the CLI (crawling URLs from queue)
     *
     * Runs the CLI hooks, optionally purges old executed queue entries, reserves
     * up to $countInARun due queue entries for the current process and hands them
     * one by one to readUrl().
     *
     * @param int $countInARun Maximum number of queue entries selected for this run
     * @param int $sleepTime Pause after each entry, passed to usleep()
     * @param int $sleepAfterFinish Pause after the batch, passed to sleep()
     * @return int Bitmask combining the readUrl() return values with self::CLI_STATUS_* flags
     */
    public function CLI_run($countInARun, $sleepTime, $sleepAfterFinish)
    {
        $result = 0;
        $counter = 0;

        // First, run hooks:
        $this->CLI_runHooks();

        // Clean up the queue
        $purgeQueueDays = intval($this->extensionSettings['purgeQueueDays']);
        if ($purgeQueueDays > 0) {
            $purgeDate = $this->getCurrentTime() - 24 * 60 * 60 * $purgeQueueDays;
            $del = $this->db->exec_DELETEquery(
                'tx_crawler_queue',
                'exec_time!=0 AND exec_time<' . $purgeDate
            );
            if (false == $del) {
                GeneralUtility::devLog('Records could not be deleted.', 'crawler', LogLevel::INFO);
            }
        }

        // Select entries:
        //TODO Shouldn't this reside within the transaction?
        $rows = $this->db->exec_SELECTgetRows(
            'qid,scheduled',
            'tx_crawler_queue',
            'exec_time=0
                AND process_scheduled= 0
                AND scheduled<=' . $this->getCurrentTime(),
            '',
            'scheduled, qid',
            intval($countInARun)
        );

        // The row fetch may not return an array (e.g. on a failed query); treat that
        // like an empty queue instead of counting a non-array.
        if (!is_array($rows) || count($rows) === 0) {
            $this->CLI_debug("Nothing within queue which needs to be processed (" . $this->CLI_buildProcessId() . ")");
            return $result;
        }

        // Queue ids end up in an SQL IN() list, so force them to integers.
        $quidList = [];
        foreach ($rows as $r) {
            $quidList[] = (int)$r['qid'];
        }

        $processId = $this->CLI_buildProcessId();

        //reserve queue entries for process
        $this->db->sql_query('BEGIN');
        //TODO make sure we're not taking assigned queue-entires
        $this->db->exec_UPDATEquery(
            'tx_crawler_queue',
            'qid IN (' . implode(',', $quidList) . ')',
            [
                'process_scheduled' => intval($this->getCurrentTime()),
                'process_id' => $processId
            ]
        );

        //save the number of assigned queue entrys to determine who many have been processed later
        $numberOfAffectedRows = $this->db->sql_affected_rows();
        $this->db->exec_UPDATEquery(
            'tx_crawler_process',
            "process_id = '" . $processId . "'",
            [
                'assigned_items_count' => intval($numberOfAffectedRows)
            ]
        );

        // Fewer updated rows than selected ones means the reservation did not
        // cover the whole selection; give up on this run.
        if ($numberOfAffectedRows == count($quidList)) {
            $this->db->sql_query('COMMIT');
        } else {
            $this->db->sql_query('ROLLBACK');
            $this->CLI_debug("Nothing processed due to multi-process collision (" . $this->CLI_buildProcessId() . ")");
            return ($result | self::CLI_STATUS_ABORTED);
        }

        foreach ($rows as $r) {
            $result |= $this->readUrl($r['qid']);

            $counter++;
            // Just to relax the system.
            // NOTE(review): usleep() expects microseconds, while CLI_main() comments
            // this value as milliseconds - confirm the intended unit.
            usleep(intval($sleepTime));

            // if during the start and the current read url the cli has been disable we need to return from the function
            // mark the process NOT as ended.
            if ($this->getDisabled()) {
                return ($result | self::CLI_STATUS_ABORTED);
            }

            if (!$this->CLI_checkIfProcessIsActive($this->CLI_buildProcessId())) {
                $this->CLI_debug("conflict / timeout (" . $this->CLI_buildProcessId() . ")");

                //TODO might need an additional returncode
                $result |= self::CLI_STATUS_ABORTED;
                break; //possible timeout
            }
        }

        sleep(intval($sleepAfterFinish));

        $msg = 'Rows: ' . $counter;
        $this->CLI_debug($msg . " (" . $this->CLI_buildProcessId() . ")");

        if ($counter > 0) {
            $result |= self::CLI_STATUS_PROCESSED;
        }

        return $result;
    }
2341
2342
    /**
     * Activate hooks
     *
     * Instantiates every object reference registered in
     * $GLOBALS['TYPO3_CONF_VARS']['EXTCONF']['crawler']['cli_hooks'] and calls its
     * crawler_init() method with this controller as argument.
     *
     * @return void
     */
    public function CLI_runHooks()
    {
        // No hook registration at all is a valid setup; avoid reading an undefined index.
        if (!isset($GLOBALS['TYPO3_CONF_VARS']['EXTCONF']['crawler']['cli_hooks'])
            || !is_array($GLOBALS['TYPO3_CONF_VARS']['EXTCONF']['crawler']['cli_hooks'])
        ) {
            return;
        }

        foreach ($GLOBALS['TYPO3_CONF_VARS']['EXTCONF']['crawler']['cli_hooks'] as $objRef) {
            // Plain assignment: a function's return value must not be bound by reference.
            $hookObj = GeneralUtility::getUserObj($objRef);
            if (is_object($hookObj)) {
                $hookObj->crawler_init($this);
            }
        }
    }
2359
2360
    /**
     * Tries to register a new process record with the given id.
     *
     * Active processes whose ttl has passed are treated as orphans: they do not
     * count towards the process limit and are released afterwards. Process records
     * marked as deleted are removed at the end as well.
     * @todo preemption might not be the most elegant way to clean up
     *
     * @param string $id identification string for the process
     * @return boolean true if the process record was inserted, false if the system
     *                 process id is unavailable or the process limit is reached
     */
    public function CLI_checkAndAcquireNewProcess($id)
    {
        $systemProcessId = getmypid();
        if ($systemProcessId < 1) {
            return false;
        }

        $runningProcesses = 0;
        $expiredProcessIds = [];

        $this->db->sql_query('BEGIN');

        $res = $this->db->exec_SELECTquery(
            'process_id,ttl',
            'tx_crawler_process',
            'active=1 AND deleted=0'
        );

        $currentTime = $this->getCurrentTime();

        // Split the active processes into expired ones and those still within their ttl.
        while ($row = $this->db->sql_fetch_assoc($res)) {
            if ($row['ttl'] < $currentTime) {
                $expiredProcessIds[] = $row['process_id'];
                continue;
            }
            $runningProcesses++;
        }

        // Only add a new process while the number of running ones is below the limit.
        $acquired = $runningProcesses < intval($this->extensionSettings['processLimit']);

        if ($acquired) {
            $this->CLI_debug("add process " . $this->CLI_buildProcessId() . " (" . ($runningProcesses + 1) . "/" . intval($this->extensionSettings['processLimit']) . ")");

            // create new process record
            $processRecord = [
                'process_id' => $id,
                'active' => '1',
                'ttl' => ($currentTime + intval($this->extensionSettings['processMaxRunTime'])),
                'system_process_id' => $systemProcessId
            ];
            $this->db->exec_INSERTquery('tx_crawler_process', $processRecord);
        } else {
            $this->CLI_debug("Processlimit reached (" . ($runningProcesses) . "/" . intval($this->extensionSettings['processLimit']) . ")");
        }

        // maybe this should be somehow included into the current lock
        $this->CLI_releaseProcesses($expiredProcessIds, true);
        $this->CLI_deleteProcessesMarkedDeleted();

        $this->db->sql_query('COMMIT');

        return $acquired;
    }
2424
2425
    /**
     * Release a process and the required resources
     *
     * Marks the given processes as inactive and detaches their unprocessed queue
     * entries. Before that, processes that were already inactive get their queue
     * entries detached and are marked as deleted.
     *
     * @param  mixed    $releaseIds   string with a single process-id or array with multiple process-ids
     * @param  boolean  $withinLock   show whether the DB-actions are included within an existing lock
     * @return boolean  false if there was nothing to release, true otherwise
     */
    public function CLI_releaseProcesses($releaseIds, $withinLock = false)
    {
        if (!is_array($releaseIds)) {
            $releaseIds = [$releaseIds];
        }

        if (count($releaseIds) === 0) {
            return false;   //nothing to release
        }

        // Quote every id once through the database layer so both queries below use
        // the same, properly escaped list.
        $quotedIds = [];
        foreach ($releaseIds as $releaseId) {
            $quotedIds[] = $this->db->fullQuoteStr($releaseId, 'tx_crawler_process');
        }
        $quotedIdList = implode(',', $quotedIds);

        if (!$withinLock) {
            $this->db->sql_query('BEGIN');
        }

        // some kind of 2nd chance algo - this way you need at least 2 processes to have a real cleanup
        // this ensures that a single process can't mess up the entire process table

        // detach queue entries from processes that are already inactive
        $this->db->exec_UPDATEquery(
            'tx_crawler_queue',
            'process_id IN (SELECT process_id FROM tx_crawler_process WHERE active=0 AND deleted=0)',
            [
                'process_scheduled' => 0,
                'process_id' => ''
            ]
        );
        // mark all processes as deleted which have no "waiting" queue-entires and which are not active
        $this->db->exec_UPDATEquery(
            'tx_crawler_process',
            'active=0 AND deleted=0
            AND NOT EXISTS (
                SELECT * FROM tx_crawler_queue
                WHERE tx_crawler_queue.process_id = tx_crawler_process.process_id
                AND tx_crawler_queue.exec_time = 0
            )',
            [
                'deleted' => '1',
                'system_process_id' => 0
            ]
        );
        // mark all requested processes as non-active
        $this->db->exec_UPDATEquery(
            'tx_crawler_process',
            'process_id IN (' . $quotedIdList . ') AND deleted=0',
            [
                'active' => '0'
            ]
        );
        // detach the not yet executed queue entries of the requested processes
        $this->db->exec_UPDATEquery(
            'tx_crawler_queue',
            'exec_time=0 AND process_id IN (' . $quotedIdList . ')',
            [
                'process_scheduled' => 0,
                'process_id' => ''
            ]
        );

        if (!$withinLock) {
            $this->db->sql_query('COMMIT');
        }

        return true;
    }
2494
2495
    /**
     * Removes all process records that are flagged as deleted.
     *
     * @return void
     */
    public function CLI_deleteProcessesMarkedDeleted()
    {
        $table = 'tx_crawler_process';
        $where = 'deleted = 1';

        $this->db->exec_DELETEquery($table, $where);
    }
2504
2505
    /**
     * Check if there are still resources left for the process with the given id
     * Used to determine timeouts and to ensure a proper cleanup if there's a timeout
     *
     * Looks up the non-deleted process record with the given id and reports
     * whether its "active" flag is set.
     *
     * @param  string  $pid identification string for the process
     * @return boolean determines if the process is still active / has resources
     */
    public function CLI_checkIfProcessIsActive($pid)
    {
        // A single SELECT needs no explicit transaction.
        $res = $this->db->exec_SELECTquery(
            'process_id,active,ttl',
            'tx_crawler_process',
            'process_id = \'' . $pid . '\'  AND deleted=0',
            '',
            'ttl',
            '0,1'
        );

        // A failed query means the process cannot be confirmed as active.
        if (!$res) {
            return false;
        }

        $row = $this->db->sql_fetch_assoc($res);

        return is_array($row) && intval($row['active']) == 1;
    }
2533
2534
    /**
     * Returns the id of the current process, generating it on first use.
     *
     * The id is a short MD5 hash of the current microtime and is kept in
     * $this->processID for all later calls.
     *
     * @return string  the ID
     */
    public function CLI_buildProcessId()
    {
        if ($this->processID) {
            return $this->processID;
        }

        $this->processID = GeneralUtility::shortMD5($this->microtime(true));

        return $this->processID;
    }
2546
2547
    /**
     * Thin wrapper around PHP's native microtime().
     *
     * @param bool $get_as_float Passed through unchanged to microtime()
     *
     * @return mixed Whatever the native microtime() returns for the given flag
     */
    protected function microtime($get_as_float = false)
    {
        $currentMicrotime = microtime($get_as_float);

        return $currentMicrotime;
    }
2556
2557
    /**
     * Echoes a message followed by a newline and flushes the output.
     * Does nothing unless the "processDebug" extension setting is enabled.
     *
     * @param  string $msg  the message
     */
    public function CLI_debug($msg)
    {
        if (!intval($this->extensionSettings['processDebug'])) {
            return;
        }

        echo $msg . "\n";
        flush();
    }
2569
2570
    /**
     * Fetches the content of a URL by running the crawler's cli/bootstrap.php
     * through the configured PHP binary instead of issuing an HTTP request.
     *
     * The bootstrap script receives the frontend base path, the URL and the
     * base64 encoded, serialized request headers as shell arguments. The URL and
     * the elapsed time are logged afterwards.
     *
     * @param  string $url          Page URL
     * @param  int    $crawlerId    Crawler-ID
     * @return array  Empty array if the URL cannot be parsed, otherwise an array
     *                with the keys "request", "headers" and "content"
     */
    protected function sendDirectRequest($url, $crawlerId)
    {
        $urlParts = parse_url($url);
        if (!is_array($urlParts)) {
            return [];
        }

        $headers = $this->buildRequestHeaderArray($urlParts, $crawlerId);

        // Assemble the shell command; every argument is escaped on its own.
        $commandParts = [
            escapeshellcmd($this->extensionSettings['phpPath']),
            escapeshellarg(ExtensionManagementUtility::extPath('crawler') . 'cli/bootstrap.php'),
            escapeshellarg($this->getFrontendBasePath()),
            escapeshellarg($url),
            escapeshellarg(base64_encode(serialize($headers)))
        ];
        $command = implode(' ', $commandParts);

        $startedAt = microtime(true);
        $output = $this->executeShellCommand($command);
        $this->log($url . ' ' . (microtime(true) - $startedAt));

        return [
            'request' => implode("\r\n", $headers) . "\r\n\r\n",
            'headers' => '',
            'content' => $output
        ];
    }
2608
2609
    /**
     * Cleans up entries that stayed for too long in the queue. These are:
     * - processed entries older than the "cleanUpProcessedAge" setting (in days)
     * - entries scheduled longer ago than the "cleanUpScheduledAge" setting (in days)
     *
     * @return void
     */
    protected function cleanUpOldQueueEntries()
    {
        // 86400 = 24*60*60 seconds per day. The settings may hold fractional days,
        // so cast to whole seconds to keep the SQL condition free of float formatting.
        $processedAgeInSeconds = (int)($this->extensionSettings['cleanUpProcessedAge'] * 86400);
        $scheduledAgeInSeconds = (int)($this->extensionSettings['cleanUpScheduledAge'] * 86400);

        // Use the same time source as the other queue queries in this class.
        $now = $this->getCurrentTime();
        $condition = '(exec_time<>0 AND exec_time<' . ($now - $processedAgeInSeconds) . ') OR scheduled<=' . ($now - $scheduledAgeInSeconds);
        $this->flushQueue($condition);
    }
2625
2626
    /**
     * Sets up $GLOBALS['TSFE'] so that TypoScript and TypoLink functions can be used.
     *
     * Also installs a NullTimeTracker in $GLOBALS['TT'] if no time tracker object
     * is present yet.
     *
     * @param int $id Page id handed to the frontend controller and used for the root line
     * @param int $typeNum Type number handed to the frontend controller
     *
     * @return void
     */
    protected function initTSFE($id = 1, $typeNum = 0)
    {
        EidUtility::initTCA();

        if (!is_object($GLOBALS['TT'])) {
            $GLOBALS['TT'] = new NullTimeTracker();
            $GLOBALS['TT']->start();
        }

        $tsfe = GeneralUtility::makeInstance(TypoScriptFrontendController::class, $GLOBALS['TYPO3_CONF_VARS'], $id, $typeNum);
        // Publish the controller globally before initialising it step by step.
        $GLOBALS['TSFE'] = $tsfe;

        $tsfe->sys_page = GeneralUtility::makeInstance(PageRepository::class);
        $tsfe->sys_page->init(true);
        $tsfe->connectToDB();
        $tsfe->initFEuser();
        $tsfe->determineId();
        $tsfe->initTemplate();
        $tsfe->rootLine = $tsfe->sys_page->getRootLine($id, '');
        $tsfe->getConfigArray();

        PageGenerator::pagegenInit();
    }
2653
2654
    /**
     * Builds an md5 hash of the serialized configuration array.
     *
     * The keys "paramExpanded" and "URLs" are left out of the hash.
     *
     * @param array $configuration
     *
     * @return string
     */
    protected function getConfigurationHash(array $configuration)
    {
        $ignoredKeys = ['paramExpanded' => true, 'URLs' => true];
        $relevantConfiguration = array_diff_key($configuration, $ignoredKeys);

        return md5(serialize($relevantConfiguration));
    }
2666
}
2667