Completed
Push — Testing/fix ( a7e6fd...8c2e07 )
by Tomas Norre
08:15
created

CrawlerController::getDuplicateRowsIfExist()   B

Complexity

Conditions 5
Paths 8

Size

Total Lines 49

Duplication

Lines 0
Ratio 0 %

Code Coverage

Tests 17
CRAP Score 6.2704

Importance

Changes 0
Metric Value
cc 5
nc 8
nop 2
dl 0
loc 49
ccs 17
cts 27
cp 0.6296
crap 6.2704
rs 8.8016
c 0
b 0
f 0
1
<?php
2
namespace AOE\Crawler\Controller;
3
4
/***************************************************************
5
 *  Copyright notice
6
 *
7
 *  (c) 2019 AOE GmbH <[email protected]>
8
 *
9
 *  All rights reserved
10
 *
11
 *  This script is part of the TYPO3 project. The TYPO3 project is
12
 *  free software; you can redistribute it and/or modify
13
 *  it under the terms of the GNU General Public License as published by
14
 *  the Free Software Foundation; either version 3 of the License, or
15
 *  (at your option) any later version.
16
 *
17
 *  The GNU General Public License can be found at
18
 *  http://www.gnu.org/copyleft/gpl.html.
19
 *
20
 *  This script is distributed in the hope that it will be useful,
21
 *  but WITHOUT ANY WARRANTY; without even the implied warranty of
22
 *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
23
 *  GNU General Public License for more details.
24
 *
25
 *  This copyright notice MUST APPEAR in all copies of the script!
26
 ***************************************************************/
27
28
use AOE\Crawler\Command\CrawlerCommandLineController;
29
use AOE\Crawler\Command\FlushCommandLineController;
30
use AOE\Crawler\Command\QueueCommandLineController;
31
use AOE\Crawler\Domain\Model\Reason;
32
use AOE\Crawler\Domain\Repository\ProcessRepository;
33
use AOE\Crawler\Domain\Repository\QueueRepository;
34
use AOE\Crawler\Event\EventDispatcher;
35
use AOE\Crawler\Utility\IconUtility;
36
use AOE\Crawler\Utility\SignalSlotUtility;
37
use TYPO3\CMS\Backend\Utility\BackendUtility;
38
use TYPO3\CMS\Backend\Tree\View\PageTreeView;
39
use TYPO3\CMS\Core\Authentication\BackendUserAuthentication;
40
use TYPO3\CMS\Core\Database\Connection;
41
use TYPO3\CMS\Core\Database\ConnectionPool;
42
use TYPO3\CMS\Core\Database\DatabaseConnection;
43
use TYPO3\CMS\Core\Database\Query\QueryBuilder;
44
use TYPO3\CMS\Core\Log\Logger;
45
use TYPO3\CMS\Core\Log\LogLevel;
46
use TYPO3\CMS\Core\TimeTracker\NullTimeTracker;
47
use TYPO3\CMS\Core\TimeTracker\TimeTracker;
48
use TYPO3\CMS\Core\Utility\DebugUtility;
49
use TYPO3\CMS\Core\Utility\ExtensionManagementUtility;
50
use TYPO3\CMS\Core\Utility\GeneralUtility;
51
use TYPO3\CMS\Core\Utility\MathUtility;
52
use TYPO3\CMS\Extbase\Object\ObjectManager;
53
use TYPO3\CMS\Frontend\Controller\TypoScriptFrontendController;
54
use TYPO3\CMS\Frontend\Page\PageGenerator;
55
use TYPO3\CMS\Frontend\Page\PageRepository;
56
use TYPO3\CMS\Frontend\Utility\EidUtility;
57
use TYPO3\CMS\Lang\LanguageService;
58
use TYPO3\CMS\Core\Database\Query\Restriction\DeletedRestriction;
59
use TYPO3\CMS\Core\Database\Query\Restriction\StartTimeRestriction;
60
use TYPO3\CMS\Core\Database\Query\Restriction\EndTimeRestriction;
61
use TYPO3\CMS\Core\TypoScript\Parser\TypoScriptParser;
62
63
/**
64
 * Class CrawlerController
65
 *
66
 * @package AOE\Crawler\Controller
67
 */
68
class CrawlerController
69
{
70
    const CLI_STATUS_NOTHING_PROCCESSED = 0;
71
    const CLI_STATUS_REMAIN = 1; //queue not empty
72
    const CLI_STATUS_PROCESSED = 2; //(some) queue items were processed
73
    const CLI_STATUS_ABORTED = 4; //instance didn't finish
74
    const CLI_STATUS_POLLABLE_PROCESSED = 8;
75
76
    /**
77
     * @var integer
78
     */
79
    public $setID = 0;
80
81
    /**
82
     * @var string
83
     */
84
    public $processID = '';
85
86
    /**
87
     * One hour is max stalled time for the CLI
88
     * If the process had the status "start" for 3600 seconds, it will be regarded stalled and a new process is started
89
     *
90
     * @var integer
91
     */
92
    public $max_CLI_exec_time = 3600;
93
94
    /**
95
     * @var array
96
     */
97
    public $duplicateTrack = [];
98
99
    /**
100
     * @var array
101
     */
102
    public $downloadUrls = [];
103
104
    /**
105
     * @var array
106
     */
107
    public $incomingProcInstructions = [];
108
109
    /**
110
     * @var array
111
     */
112
    public $incomingConfigurationSelection = [];
113
114
    /**
115
     * @var bool
116
     */
117
    public $registerQueueEntriesInternallyOnly = false;
118
119
    /**
120
     * @var array
121
     */
122
    public $queueEntries = [];
123
124
    /**
125
     * @var array
126
     */
127
    public $urlList = [];
128
129
    /**
130
     * @var boolean
131
     */
132
    public $debugMode = false;
133
134
    /**
135
     * @var array
136
     */
137
    public $extensionSettings = [];
138
139
    /**
140
     * Mount Point
141
     *
142
     * @var boolean
143
     */
144
    public $MP = false;
145
146
    /**
147
     * @var string
148
     */
149
    protected $processFilename;
150
151
    /**
152
     * Holds the internal access mode can be 'gui','cli' or 'cli_im'
153
     *
154
     * @var string
155
     */
156
    protected $accessMode;
157
158
    /**
159
     * @var BackendUserAuthentication
160
     */
161
    private $backendUser;
162
163
    /**
164
     * @var integer
165
     */
166
    private $scheduledTime = 0;
167
168
    /**
169
     * @var integer
170
     */
171
    private $reqMinute = 0;
172
173
    /**
174
     * @var bool
175
     */
176
    private $submitCrawlUrls = false;
177
178
    /**
179
     * @var bool
180
     */
181
    private $downloadCrawlUrls = false;
182
183
    /**
184
     * @var QueueRepository
185
     */
186
    protected  $queueRepository;
187
188
    /**
189
     * @var ProcessRepository
190
     */
191
    protected $processRepository;
192
193
    /**
194
     * @var string
195
     */
196
    protected $tableName = 'tx_crawler_queue';
197
198
    /**
199
     * @var array
200
     */
201
    private $cliArgs;
202
203
204
    /**
205
     * @var Logger
206
     */
207
    private $logger;
208
209
    /**
     * Returns the internal access mode currently stored on the controller.
     *
     * @return string the stored access mode (the property is documented as 'gui', 'cli' or 'cli_im')
     */
    public function getAccessMode()
    {
        $currentAccessMode = $this->accessMode;

        return $currentAccessMode;
    }
218
219
    /**
     * Stores the internal access mode on the controller.
     *
     * @param string $accessMode access mode to remember (the property is documented as 'gui', 'cli' or 'cli_im')
     * @return void
     */
    public function setAccessMode($accessMode)
    {
        // The value is stored as given; no validation is performed here.
        $this->accessMode = $accessMode;
    }
226
227
    /**
     * Switches the "disabled" marker file on or off.
     *
     * The disabled state is represented by the existence of the file named by
     * $this->processFilename: disabling writes an empty file, enabling removes it.
     *
     * @param bool $disabled (optional, defaults to true) true creates the marker file, false deletes it
     * @return void
     */
    public function setDisabled($disabled = true)
    {
        $markerFile = $this->processFilename;

        if ($disabled) {
            GeneralUtility::writeFile($markerFile, '');
            return;
        }

        // Only attempt to delete the marker when it is actually present.
        if (is_file($markerFile)) {
            unlink($markerFile);
        }
    }
243
244
    /**
     * Reports whether the "disabled" marker file exists.
     *
     * @return bool true if the file named by $this->processFilename exists (i.e. disabled)
     */
    public function getDisabled()
    {
        $markerExists = is_file($this->processFilename);

        return $markerExists;
    }
253
254
    /**
     * Overrides the path of the marker file used by setDisabled() / getDisabled().
     *
     * @param string $filenameWithPath file name including its path
     *
     * @return void
     */
    public function setProcessFilename($filenameWithPath)
    {
        // Stored verbatim; the path is not checked for existence or writability.
        $this->processFilename = $filenameWithPath;
    }
263
264
    /**
     * Returns the path of the marker file used by setDisabled() / getDisabled().
     *
     * @return string file name including its path
     */
    public function getProcessFilename()
    {
        $markerFile = $this->processFilename;

        return $markerFile;
    }
271
272
    /**
     * Returns the logger for this class, creating it on first use.
     *
     * @return Logger
     */
    private function getLogger(): Logger
    {
        // Reuse the instance obtained on an earlier call.
        if ($this->logger !== null) {
            return $this->logger;
        }

        $logManager = GeneralUtility::makeInstance(\TYPO3\CMS\Core\Log\LogManager::class);
        $this->logger = $logManager->getLogger(__CLASS__);

        return $this->logger;
    }
281
282
    /************************************
283
     *
284
     * Getting URLs based on Page TSconfig
285
     *
286
     ************************************/
287
288 34
    /**
     * Wires up the repositories, remembers the backend user, sets the default
     * marker file and loads the extension settings with sane defaults.
     *
     * Missing configuration keys are treated as "not configured" instead of
     * triggering undefined-index notices.
     */
    public function __construct()
    {
        $objectManager = GeneralUtility::makeInstance(ObjectManager::class);
        $this->queueRepository = $objectManager->get(QueueRepository::class);
        $this->processRepository = $objectManager->get(ProcessRepository::class);

        $this->backendUser = $GLOBALS['BE_USER'] ?? null;
        $this->processFilename = PATH_site . 'typo3temp/tx_crawler.proc';

        // The extension configuration is stored as a serialized array; anything
        // that is absent or not a string is handled like an empty configuration.
        $serializedSettings = $GLOBALS['TYPO3_CONF_VARS']['EXT']['extConf']['crawler'] ?? '';
        $settings = is_string($serializedSettings) ? unserialize($serializedSettings) : false;
        $settings = is_array($settings) ? $settings : [];

        // read ext_em_conf_template settings and set
        $this->setExtensionSettings($settings);

        // set defaults:
        if (MathUtility::convertToPositiveInteger($this->extensionSettings['countInARun'] ?? 0) === 0) {
            $this->extensionSettings['countInARun'] = 100;
        }

        // Clamp the process limit to 1..99, falling back to 1 when unset or zero.
        $this->extensionSettings['processLimit'] = MathUtility::forceIntegerInRange(
            $this->extensionSettings['processLimit'] ?? 1,
            1,
            99,
            1
        );
    }
310
311
    /**
     * Replaces the extension settings held by the controller.
     *
     * The array corresponds to the unserialized value of
     * $TYPO3_CONF_VARS['EXT']['extConf']['crawler'].
     *
     * @param array $extensionSettings complete settings array; it replaces (does not merge with) the current one
     * @return void
     */
    public function setExtensionSettings(array $extensionSettings)
    {
        // Whole-array replacement: previously stored keys are discarded.
        $this->extensionSettings = $extensionSettings;
    }
321
322
    /**
     * Decides whether the given page must be left out of crawling.
     *
     * The checks run in a fixed order and the first one that matches wins:
     * hidden page, built-in doktype restrictions, configured doktype
     * exclusions, and finally the registered "pageVeto" hooks.
     *
     * @param array $pageRow page record; the 'hidden' and 'doktype' columns are read
     * @return false|string false if the page should be crawled, otherwise the message explaining why it is skipped
     */
    public function checkIfPageShouldBeSkipped(array $pageRow)
    {
        // Hidden pages are skipped unless the "crawlHiddenPages" setting is enabled.
        if (!$this->extensionSettings['crawlHiddenPages'] && $pageRow['hidden']) {
            return 'Because page is hidden';
        }

        // Doktypes 3 and 4 as well as everything from 199 upwards are never crawled.
        if (GeneralUtility::inList('3,4', $pageRow['doktype']) || $pageRow['doktype'] >= 199) {
            return 'Because doktype is not allowed';
        }

        // Doktype lists registered by other extensions.
        $excludedDoktypes = $GLOBALS['TYPO3_CONF_VARS']['EXTCONF']['crawler']['excludeDoktype'];
        if (is_array($excludedDoktypes)) {
            foreach ($excludedDoktypes as $registrationKey => $doktypeList) {
                if (GeneralUtility::inList($doktypeList, $pageRow['doktype'])) {
                    return 'Doktype was excluded by "' . $registrationKey . '"';
                }
            }
        }

        // Veto hooks: each returns false to accept the page, anything else to reject it.
        $vetoHooks = $GLOBALS['TYPO3_CONF_VARS']['EXTCONF']['crawler']['pageVeto'];
        if (is_array($vetoHooks)) {
            foreach ($vetoHooks as $hookKey => $hookFunction) {
                $hookParameters = [
                    'pageRow' => $pageRow
                ];
                $verdict = GeneralUtility::callUserFunction($hookFunction, $hookParameters, $this);
                if ($verdict === false) {
                    continue;
                }

                // The first veto ends the evaluation; a string verdict is used as the message.
                return is_string($verdict)
                    ? $verdict
                    : 'Veto from hook "' . htmlspecialchars($hookKey) . '"';
            }
        }

        return false;
    }
385
386
    /**
     * Wrapper method for getUrlsForPageId().
     * It returns an array of configurations and no urls!
     *
     * The page is first run through checkIfPageShouldBeSkipped(); when it is
     * rejected, an empty array is returned and the reason is written to
     * $skipMessage.
     *
     * @param array $pageRow Page record with at least dok-type and uid columns.
     * @param string $skipMessage receives the skip reason, or '' when the page is crawled
     * @return array
     * @see getUrlsForPageId()
     */
    public function getUrlsForPageRow(array $pageRow, &$skipMessage = '')
    {
        $message = $this->checkIfPageShouldBeSkipped($pageRow);

        if ($message !== false) {
            $skipMessage = $message;

            return [];
        }

        // Compare as integer so that a string value '2' from the row also
        // matches; a strict comparison against int 2 alone would miss it.
        $forceSsl = (int)($pageRow['url_scheme'] ?? 0) === 2;
        $res = $this->getUrlsForPageId($pageRow['uid'], $forceSsl);
        $skipMessage = '';

        return $res;
    }
410
411
    /**
     * Tells whether the queue holds NO unprocessed entry (exec_time = 0) for the
     * given page and configuration hash.
     * If there is none, the caller can skip the more detailed inner check.
     *
     * @param  int $uid page id the queue entries belong to
     * @param  string $configurationHash hash identifying the crawler configuration
     * @return boolean true when no matching unprocessed entry exists, false otherwise
     */
    protected function noUnprocessedQueueEntriesForPageWithConfigurationHashExist($uid, $configurationHash)
    {
        $queryBuilder = GeneralUtility::makeInstance(ConnectionPool::class)->getQueryBuilderForTable($this->tableName);

        $constraints = [
            $queryBuilder->expr()->eq('page_id', intval($uid)),
            $queryBuilder->expr()->eq('configuration_hash', $queryBuilder->createNamedParameter($configurationHash)),
            $queryBuilder->expr()->eq('exec_time', 0),
        ];

        $statement = $queryBuilder
            ->count('*')
            ->from($this->tableName)
            ->where(...$constraints)
            ->execute();
        $pendingEntries = $statement->fetchColumn();

        // Any truthy count means at least one entry is still waiting.
        return $pendingEntries ? false : true;
    }
442
443
    /**
     * Creates a list of URLs from input array (and submits them to queue if asked for)
     * See Web > Info module script + "indexed_search"'s crawler hook-client using this!
     *
     * @param array $vv Information about URLs from pageRow to crawl ('URLs' and 'subCfg' are read).
     * @param array $pageRow Page row
     * @param integer $scheduledTime Unix time to schedule indexing to, typically time()
     * @param integer $reqMinute Number of requests per minute (creates the interleave between requests);
     *                           values that are not greater than zero disable the interleave
     * @param boolean $submitCrawlUrls If set, submits the URLs to queue
     * @param boolean $downloadCrawlUrls If set (and $submitCrawlUrls is false) will fill $downloadUrls with entries
     * @param array $duplicateTrack Passed by reference; holds one key per url so duplicates are not crawled twice
     * @param array $downloadUrls Passed by reference; filled with URLs for download if the flag is set
     * @param array $incomingProcInstructions Array of processing instructions
     * @return string List of URLs (meant for display in backend module)
     */
    public function urlListFromUrlArray(
        array $vv,
        array $pageRow,
        $scheduledTime,
        $reqMinute,
        $submitCrawlUrls,
        $downloadCrawlUrls,
        array &$duplicateTrack,
        array &$downloadUrls,
        array $incomingProcInstructions
    ) {
        $urlList = '';

        // realurl support (thanks to Ingo Renner)
        // The encoder stays null unless realurl is loaded AND enabled for this
        // configuration, so the loop below can never touch an undefined variable.
        $urlObj = null;
        if (ExtensionManagementUtility::isLoaded('realurl') && $vv['subCfg']['realurl']) {
            /** @var tx_realurl $urlObj */
            $urlObj = GeneralUtility::makeInstance('tx_realurl');

            if (!empty($vv['subCfg']['baseUrl'])) {
                $urlParts = parse_url($vv['subCfg']['baseUrl']);
                $host = strtolower($urlParts['host']);
                $urlObj->host = $host;

                // First pass, finding configuration OR pointer string:
                $urlObj->extConf = isset($GLOBALS['TYPO3_CONF_VARS']['EXTCONF']['realurl'][$urlObj->host]) ? $GLOBALS['TYPO3_CONF_VARS']['EXTCONF']['realurl'][$urlObj->host] : $GLOBALS['TYPO3_CONF_VARS']['EXTCONF']['realurl']['_DEFAULT'];

                // If it turned out to be a string pointer, then look up the real config:
                if (is_string($urlObj->extConf)) {
                    $urlObj->extConf = is_array($GLOBALS['TYPO3_CONF_VARS']['EXTCONF']['realurl'][$urlObj->extConf]) ? $GLOBALS['TYPO3_CONF_VARS']['EXTCONF']['realurl'][$urlObj->extConf] : $GLOBALS['TYPO3_CONF_VARS']['EXTCONF']['realurl']['_DEFAULT'];
                }
            }

            if (!$GLOBALS['TSFE']->sys_page) {
                $GLOBALS['TSFE']->sys_page = GeneralUtility::makeInstance('TYPO3\CMS\Frontend\Page\PageRepository');
            }

            if (!$GLOBALS['TSFE']->tmpl->rootLine[0]['uid']) {
                $GLOBALS['TSFE']->tmpl->rootLine[0]['uid'] = $urlObj->extConf['pagePath']['rootpage_id'];
            }
        }

        if (!is_array($vv['URLs'])) {
            return 'ERROR - no URL generated';
        }

        $configurationHash = $this->getConfigurationHash($vv);
        $skipInnerCheck = $this->noUnprocessedQueueEntriesForPageWithConfigurationHashExist($pageRow['uid'], $configurationHash);

        // Seconds between two scheduled requests. A rate that is not positive
        // would otherwise cause a division by zero, so it disables the interleave.
        $secondsPerRequest = $reqMinute > 0 ? 60 / $reqMinute : 0;

        foreach ($vv['URLs'] as $urlQuery) {
            if (!$this->drawURLs_PIfilter($vv['subCfg']['procInstrFilter'], $incomingProcInstructions)) {
                continue;
            }

            // Calculate cHash:
            if ($vv['subCfg']['cHash']) {
                /* @var $cacheHash \TYPO3\CMS\Frontend\Page\CacheHashCalculator */
                $cacheHash = GeneralUtility::makeInstance('TYPO3\CMS\Frontend\Page\CacheHashCalculator');
                $urlQuery .= '&cHash=' . $cacheHash->generateForParameters($urlQuery);
            }

            // Create key by which to determine unique-ness:
            $uKey = $urlQuery . '|' . $vv['subCfg']['userGroups'] . '|' . $vv['subCfg']['baseUrl'] . '|' . $vv['subCfg']['procInstrFilter'];

            // realurl support (thanks to Ingo Renner)
            $urlQuery = 'index.php' . $urlQuery;
            if ($urlObj !== null) {
                $params = [
                    'LD' => [
                        'totalURL' => $urlQuery
                    ],
                    'TCEmainHook' => true
                ];
                $urlObj->encodeSpURL($params);
                $urlQuery = $params['LD']['totalURL'];
            }

            // Scheduled time, rounded down to the full minute:
            $schTime = $scheduledTime + round(count($duplicateTrack) * $secondsPerRequest);
            $schTime = (int)(floor($schTime / 60) * 60);

            // NOTE(review): $urlList is overwritten (not appended to) on every
            // iteration, so only the last URL is returned - kept as in the
            // original; confirm whether callers rely on this.
            if (isset($duplicateTrack[$uKey])) {
                // if the url key is registered just display it and do not resubmit it
                $urlList = '<em><span class="typo3-dimmed">' . htmlspecialchars($urlQuery) . '</span></em><br/>';
            } else {
                $urlList = '[' . date('d.m.y H:i', $schTime) . '] ' . htmlspecialchars($urlQuery);
                $this->urlList[] = '[' . date('d.m.y H:i', $schTime) . '] ' . $urlQuery;

                $theUrl = ($vv['subCfg']['baseUrl'] ? $vv['subCfg']['baseUrl'] : GeneralUtility::getIndpEnv('TYPO3_SITE_URL')) . $urlQuery;

                // Submit for crawling!
                if ($submitCrawlUrls) {
                    $added = $this->addUrl(
                        $pageRow['uid'],
                        $theUrl,
                        $vv['subCfg'],
                        $scheduledTime,
                        $configurationHash,
                        $skipInnerCheck
                    );
                    if ($added === false) {
                        $urlList .= ' (Url already existed)';
                    }
                } elseif ($downloadCrawlUrls) {
                    $downloadUrls[$theUrl] = $theUrl;
                }

                $urlList .= '<br />';
            }
            $duplicateTrack[$uKey] = true;
        }

        return $urlList;
    }
572
573
    /**
     * Returns true if input processing instruction is among registered ones.
     *
     * An empty list of incoming processing instructions means "no filtering",
     * in which case every input is accepted.
     *
     * @param string $piString PI to test (comma separated list that is searched)
     * @param array $incomingProcInstructions Processing instructions
     * @return boolean true when no filter is given or at least one instruction is found in $piString, false otherwise
     */
    public function drawURLs_PIfilter($piString, array $incomingProcInstructions)
    {
        if (empty($incomingProcInstructions)) {
            return true;
        }

        foreach ($incomingProcInstructions as $pi) {
            if (GeneralUtility::inList($piString, $pi)) {
                return true;
            }
        }

        // Explicit result for "nothing matched"; previously the method fell off
        // the end and returned null despite being documented as boolean.
        return false;
    }
592
593 2
    /**
     * Loads the page TSconfig for a page and lets registered hooks alter it.
     *
     * When a mount point is set in $this->MP, the TSconfig is read for the second
     * '-'-separated segment of that value instead of $id.
     *
     * @param integer $id Page ID
     * @return array page TSconfig as returned by BackendUtility::getPagesTSconfig(), possibly modified by hooks
     */
    public function getPageTSconfigForId($id)
    {
        if ($this->MP) {
            $mountPointParts = explode('-', $this->MP);
            $mountPointId = $mountPointParts[1];
            $pageTSconfig = BackendUtility::getPagesTSconfig($mountPointId);
        } else {
            $pageTSconfig = BackendUtility::getPagesTSconfig($id);
        }

        // Call a hook to alter configuration
        $hooks = $GLOBALS['TYPO3_CONF_VARS']['EXTCONF']['crawler']['getPageTSconfigForId'];
        if (!is_array($hooks)) {
            return $pageTSconfig;
        }

        // The TSconfig is handed over by reference so hooks can change it in place.
        $params = [
            'pageId' => $id,
            'pageTSConfig' => &$pageTSconfig
        ];
        foreach ($hooks as $userFunc) {
            GeneralUtility::callUserFunction($userFunc, $params, $this);
        }

        return $pageTSconfig;
    }
615
616
    /**
617
     * This methods returns an array of configurations.
618
     * And no urls!
619
     *
620
     * @param integer $id Page ID
621
     * @param bool $forceSsl Use https
622
     * @return array
623
     */
624 2
    public function getUrlsForPageId($id, $forceSsl = false)
625
    {
626
627
        /**
628
         * Get configuration from tsConfig
629
         */
630
631
        // Get page TSconfig for page ID:
632 2
        $pageTSconfig = $this->getPageTSconfigForId($id);
633
634 2
        $res = [];
635
636 2
        if (is_array($pageTSconfig) && is_array($pageTSconfig['tx_crawler.']['crawlerCfg.'])) {
637 1
            $crawlerCfg = $pageTSconfig['tx_crawler.']['crawlerCfg.'];
638
639 1
            if (is_array($crawlerCfg['paramSets.'])) {
640 1
                foreach ($crawlerCfg['paramSets.'] as $key => $values) {
641 1
                    if (is_array($values)) {
642 1
                        $key = str_replace('.', '', $key);
643
                        // Sub configuration for a single configuration string:
644 1
                        $subCfg = (array)$crawlerCfg['paramSets.'][$key . '.'];
645 1
                        $subCfg['key'] = $key;
646
647 1
                        if (strcmp($subCfg['procInstrFilter'], '')) {
648 1
                            $subCfg['procInstrFilter'] = implode(',', GeneralUtility::trimExplode(',', $subCfg['procInstrFilter']));
649
                        }
650 1
                        $pidOnlyList = implode(',', GeneralUtility::trimExplode(',', $subCfg['pidsOnly'], true));
651
652
                        // process configuration if it is not page-specific or if the specific page is the current page:
653 1
                        if (!strcmp($subCfg['pidsOnly'], '') || GeneralUtility::inList($pidOnlyList, $id)) {
654
655
                                // add trailing slash if not present
656 1
                            if (!empty($subCfg['baseUrl']) && substr($subCfg['baseUrl'], -1) != '/') {
657
                                $subCfg['baseUrl'] .= '/';
658
                            }
659
660
                            // Explode, process etc.:
661 1
                            $res[$key] = [];
662 1
                            $res[$key]['subCfg'] = $subCfg;
663 1
                            $res[$key]['paramParsed'] = $this->parseParams($crawlerCfg['paramSets.'][$key]);
664 1
                            $res[$key]['paramExpanded'] = $this->expandParameters($res[$key]['paramParsed'], $id);
665 1
                            $res[$key]['origin'] = 'pagets';
666
667
                            // recognize MP value
668 1
                            if (!$this->MP) {
669 1
                                $res[$key]['URLs'] = $this->compileUrls($res[$key]['paramExpanded'], ['?id=' . $id]);
670
                            } else {
671
                                $res[$key]['URLs'] = $this->compileUrls($res[$key]['paramExpanded'], ['?id=' . $id . '&MP=' . $this->MP]);
672
                            }
673
                        }
674
                    }
675
                }
676
            }
677
        }
678
679
        /**
680
         * Get configuration from tx_crawler_configuration records
681
         */
682
683
        // get records along the rootline
684 2
        $rootLine = BackendUtility::BEgetRootLine($id);
685
686
687 2
        $queryBuilder = GeneralUtility::makeInstance(ConnectionPool::class)->getQueryBuilderForTable('tx_crawler_configuration');
688 2
        $queryBuilder->getRestrictions()->removeAll()->add(GeneralUtility::makeInstance(DeletedRestriction::class));
689
690 2
        foreach ($rootLine as $page) {
691
            $configurationRecordsForCurrentPage = $queryBuilder
692 2
                ->select('*')
693 2
                ->from('tx_crawler_configuration')
694 2
                ->where(
695 2
                    $queryBuilder->expr()->eq('pid', $page['uid']),
696 2
                    substr(BackendUtility::BEenableFields('tx_crawler_configuration'), 4) . BackendUtility::deleteClause('tx_crawler_configuration')
0 ignored issues
show
Deprecated Code introduced by
The method TYPO3\CMS\Backend\Utilit...Utility::deleteClause() has been deprecated with message: since TYPO3 v9, will be removed in TYPO3 v10.0, the DeletedRestriction functionality should be used instead.

This method has been deprecated. The supplier of the class has supplied an explanatory message.

The explanatory message should give you some clue as to whether and when the method will be removed from the class and what other method or class to use instead.

Loading history...
697
                )
698 2
                ->execute()
699 2
                ->fetchAll();
700
701 2
            if (is_array($configurationRecordsForCurrentPage)) {
702 2
                foreach ($configurationRecordsForCurrentPage as $configurationRecord) {
703
704
                        // check access to the configuration record
705 1
                    if (empty($configurationRecord['begroups']) || $GLOBALS['BE_USER']->isAdmin() || $this->hasGroupAccess($GLOBALS['BE_USER']->user['usergroup_cached_list'], $configurationRecord['begroups'])) {
706 1
                        $pidOnlyList = implode(',', GeneralUtility::trimExplode(',', $configurationRecord['pidsonly'], true));
707
708
                        // process configuration if it is not page-specific or if the specific page is the current page:
709 1
                        if (!strcmp($configurationRecord['pidsonly'], '') || GeneralUtility::inList($pidOnlyList, $id)) {
710 1
                            $key = $configurationRecord['name'];
711
712
                            // don't overwrite previously defined paramSets
713 1
                            if (!isset($res[$key])) {
714
715
                                    /* @var $TSparserObject \TYPO3\CMS\Core\TypoScript\Parser\TypoScriptParser */
716 1
                                $TSparserObject = GeneralUtility::makeInstance(TypoScriptParser::class);
717 1
                                $TSparserObject->parse($configurationRecord['processing_instruction_parameters_ts']);
718
719 1
                                $isCrawlingProtocolHttps = $this->isCrawlingProtocolHttps($configurationRecord['force_ssl'], $forceSsl);
720
721
                                $subCfg = [
722 1
                                    'procInstrFilter' => $configurationRecord['processing_instruction_filter'],
723 1
                                    'procInstrParams.' => $TSparserObject->setup,
724 1
                                    'baseUrl' => $this->getBaseUrlForConfigurationRecord(
725 1
                                        $configurationRecord['base_url'],
726 1
                                        $configurationRecord['sys_domain_base_url'],
727 1
                                        $isCrawlingProtocolHttps
728
                                    ),
729 1
                                    'realurl' => $configurationRecord['realurl'],
730 1
                                    'cHash' => $configurationRecord['chash'],
731 1
                                    'userGroups' => $configurationRecord['fegroups'],
732 1
                                    'exclude' => $configurationRecord['exclude'],
733 1
                                    'rootTemplatePid' => (int) $configurationRecord['root_template_pid'],
734 1
                                    'key' => $key
735
                                ];
736
737
                                // add trailing slash if not present
738 1
                                if (!empty($subCfg['baseUrl']) && substr($subCfg['baseUrl'], -1) != '/') {
739
                                    $subCfg['baseUrl'] .= '/';
740
                                }
741 1
                                if (!in_array($id, $this->expandExcludeString($subCfg['exclude']))) {
742 1
                                    $res[$key] = [];
743 1
                                    $res[$key]['subCfg'] = $subCfg;
744 1
                                    $res[$key]['paramParsed'] = $this->parseParams($configurationRecord['configuration']);
745 1
                                    $res[$key]['paramExpanded'] = $this->expandParameters($res[$key]['paramParsed'], $id);
746 1
                                    $res[$key]['URLs'] = $this->compileUrls($res[$key]['paramExpanded'], ['?id=' . $id]);
747 1
                                    $res[$key]['origin'] = 'tx_crawler_configuration_' . $configurationRecord['uid'];
748
                                }
749
                            }
750
                        }
751
                    }
752
                }
753
            }
754
        }
755
756 2
        if (is_array($GLOBALS['TYPO3_CONF_VARS']['EXTCONF']['crawler']['processUrls'])) {
757
            foreach ($GLOBALS['TYPO3_CONF_VARS']['EXTCONF']['crawler']['processUrls'] as $func) {
758
                $params = [
759
                    'res' => &$res,
760
                ];
761
                GeneralUtility::callUserFunction($func, $params, $this);
762
            }
763
        }
764
765 2
        return $res;
766
    }
767
768
    /**
     * Resolves the base URL for a crawler configuration record.
     *
     * If $sysDomainUid points to a sys_domain record with a non-empty domainName, the base URL is built
     * from the requested scheme and that domain name. Otherwise the given $baseUrl is returned unchanged.
     *
     * @param string $baseUrl Fallback base URL
     * @param integer $sysDomainUid UID of the sys_domain record to look up; values <= 0 skip the lookup
     * @param bool $ssl Only a strict FALSE selects "http", every other value selects "https"
     * @return string
     */
    protected function getBaseUrlForConfigurationRecord($baseUrl, $sysDomainUid, $ssl = false)
    {
        $sysDomainUid = intval($sysDomainUid);
        if ($sysDomainUid <= 0) {
            // No domain record referenced: no database access needed.
            return $baseUrl;
        }

        // The query runs against sys_domain, so the builder is requested for that table.
        $queryBuilder = GeneralUtility::makeInstance(ConnectionPool::class)->getQueryBuilderForTable('sys_domain');
        $row = $queryBuilder
            ->select('*')
            ->from('sys_domain')
            ->where(
                $queryBuilder->expr()->eq('uid', $queryBuilder->createNamedParameter($sysDomainUid, \PDO::PARAM_INT))
            )
            ->execute()
            ->fetch();

        // fetch() yields FALSE when no record matched; only a real row can carry a domain name.
        if (is_array($row) && isset($row['domainName']) && (string)$row['domainName'] !== '') {
            $urlScheme = ($ssl === false) ? 'http' : 'https';
            return $urlScheme . '://' . $row['domainName'];
        }

        return $baseUrl;
    }
798
799
    /**
     * Collects the names of all crawler configurations that apply to a page branch.
     *
     * Two sources are combined: the keys of "tx_crawler.crawlerCfg.paramSets." from the page TSconfig of
     * the root page, and the names of tx_crawler_configuration records stored on pages of the rootline
     * of $rootid or in the page tree below it.
     *
     * @param integer $rootid Page ID the branch starts at
     * @param integer $depth Depth passed to the page tree when collecting sub pages
     * @return array List of configuration names (may contain duplicates)
     *
     * TODO: Write Functional Tests
     */
    public function getConfigurationsForBranch($rootid, $depth)
    {
        $configurationsForBranch = [];

        // Configurations defined via page TSconfig:
        $pageTSconfig = $this->getPageTSconfigForId($rootid);
        $sets = is_array($pageTSconfig)
            ? ($pageTSconfig['tx_crawler.']['crawlerCfg.']['paramSets.'] ?? null)
            : null;
        if (is_array($sets)) {
            foreach ($sets as $key => $value) {
                if (!is_array($value)) {
                    continue;
                }
                // Strip the trailing dot of the TypoScript key to get the plain configuration name.
                $configurationsForBranch[] = substr($key, -1) == '.' ? substr($key, 0, -1) : $key;
            }
        }

        // Page IDs of the rootline ...
        $pids = [];
        $rootLine = BackendUtility::BEgetRootLine($rootid);
        foreach ($rootLine as $node) {
            $pids[] = $node['uid'];
        }

        // ... and of the pages below the root page that the backend user may see:
        /* @var PageTreeView $tree */
        $tree = GeneralUtility::makeInstance(PageTreeView::class);
        $perms_clause = $GLOBALS['BE_USER']->getPagePermsClause(1);
        $tree->init('AND ' . $perms_clause);
        $tree->getTree($rootid, $depth, '');
        foreach ($tree->tree as $node) {
            $pids[] = $node['row']['uid'];
        }

        // Configuration records stored on any of the collected pages:
        $queryBuilder = $this->getQueryBuilder('tx_crawler_configuration');
        $queryBuilder->getRestrictions()
            ->removeAll()
            ->add(GeneralUtility::makeInstance(DeletedRestriction::class))
            ->add(GeneralUtility::makeInstance(StartTimeRestriction::class))
            ->add(GeneralUtility::makeInstance(EndTimeRestriction::class));

        $statement = $queryBuilder
            ->select('name')
            ->from('tx_crawler_configuration')
            ->where(
                $queryBuilder->expr()->in('pid', $queryBuilder->createNamedParameter($pids, Connection::PARAM_INT_ARRAY))
            )
            ->execute();

        while ($row = $statement->fetch()) {
            $configurationsForBranch[] = $row['name'];
        }

        return $configurationsForBranch;
    }
858
859
    /**
     * Creates a query builder on the connection that is responsible for the given table.
     *
     * @param string $table Name of the table the connection is looked up for
     * @return \TYPO3\CMS\Core\Database\Query\QueryBuilder
     */
    private function getQueryBuilder(string $table)
    {
        $connectionPool = GeneralUtility::makeInstance(ConnectionPool::class);
        $connection = $connectionPool->getConnectionForTable($table);

        return $connection->createQueryBuilder();
    }
871
872
    /**
     * Checks whether a user may access an item, based on group membership.
     *
     * @see \TYPO3\CMS\Frontend\Page\PageRepository::getMultipleGroupsWhereClause()
     * @param  string $groupList    Comma-separated list of group UIDs of the user
     * @param  string $accessList   Comma-separated list of group UIDs that may access the item
     * @return bool                 TRUE if the access list is empty or at least one group UID of the user is in the access list
     */
    public function hasGroupAccess($groupList, $accessList)
    {
        // An item without access restriction is available to everybody.
        if (empty($accessList)) {
            return true;
        }

        $accessGranted = false;
        $userGroupUids = GeneralUtility::intExplode(',', $groupList);
        foreach ($userGroupUids as $userGroupUid) {
            if (GeneralUtility::inList($accessList, $userGroupUid)) {
                // One matching group is enough.
                $accessGranted = true;
                break;
            }
        }

        return $accessGranted;
    }
893
894
    /**
     * Parses the GET vars of a query string into an array of key => value pairs.
     *
     * Keys and values are raw-URL-decoded. Parameters with an empty name are skipped;
     * a parameter without "=" gets an empty string as value.
     *
     * @param string $inputQuery Input query string, e.g. "&L=[0-2]&tx_ext[uid]=5"
     * @return array
     */
    public function parseParams($inputQuery)
    {
        $paramKeyValues = [];

        foreach (explode('&', $inputQuery) as $paramAndValue) {
            // Pad to two elements so that a parameter without "=" does not read an undefined offset.
            list($p, $v) = array_pad(explode('=', $paramAndValue, 2), 2, '');
            if (strlen($p)) {
                $paramKeyValues[rawurldecode($p)] = rawurldecode($v);
            }
        }

        return $paramKeyValues;
    }
916
917
    /**
     * Will expand the parameters configuration to individual values. This follows a certain syntax of the value of each parameter.
     * Syntax of values:
     * - Basically: If the value is wrapped in [...] it will be expanded according to the following syntax, otherwise the value is taken literally
     * - Configuration is split by "|" and the parts are processed individually and finally added together
     * - For each configuration part:
     *         - "[int]-[int]" = Integer range, will be expanded to all values in between, values included, starting from low to high (max. 1000). Example "1-34" or "-40--30"
     *         - "_TABLE:[TCA table name];[_PID:[optional page id, default is current page]];[_ENABLELANG:1]" = Look up of table records from PID, filtering out deleted records. Example "_TABLE:tt_content; _PID:123"
     *        _ENABLELANG:1 picks only original records without their language overlays
     *        Further optional sub-parameters read by the lookup: _PIDFIELD (default "pid"), _FIELD (default "uid"), _WHERE and _ADDTABLE
     *         - Default: Literal value
     *
     * Hooks registered under $GLOBALS['TYPO3_CONF_VARS']['SC_OPTIONS']['crawler/class.tx_crawler_lib.php']['expandParameters']
     * are called once for every configuration part.
     *
     * @param array $paramArray Array with key (GET var name) and values (value of GET var which is configuration for expansion)
     * @param integer $pid Current page ID
     * @return array Array with the same keys, each holding the array of expanded values
     *
     * TODO: Write Functional Tests
     */
    public function expandParameters($paramArray, $pid)
    {
        global $TCA;

        // Traverse parameter names:
        foreach ($paramArray as $p => $v) {
            $v = trim($v);

            // A value that is not encapsulated in square brackets is literal and becomes the only value in the array:
            if (substr($v, 0, 1) !== '[' || substr($v, -1) !== ']') {
                $paramArray[$p] = [$v];
                continue;
            }

            // Otherwise take the value inside the brackets and reset the paramArray value as an array.
            $v = substr($v, 1, -1);
            $paramArray[$p] = [];

            // Explode parts and traverse them:
            $parts = explode('|', $v);
            foreach ($parts as $pV) {
                // Look for integer range: (fx. 1-34 or -40--30 // reads minus 40 to minus 30)
                if (preg_match('/^(-?[0-9]+)\s*-\s*(-?[0-9]+)$/', trim($pV), $reg)) {
                    // Swap if first is larger than last:
                    if ($reg[1] > $reg[2]) {
                        $temp = $reg[2];
                        $reg[2] = $reg[1];
                        $reg[1] = $temp;
                    }

                    // Traverse range, add values:
                    $runAwayBrake = 1000; // Limit to size of range!
                    for ($a = $reg[1]; $a <= $reg[2]; $a++) {
                        $paramArray[$p][] = $a;
                        $runAwayBrake--;
                        if ($runAwayBrake <= 0) {
                            break;
                        }
                    }
                } elseif (substr(trim($pV), 0, 7) == '_TABLE:') {
                    // Parse parameters ("_KEY:value" pairs separated by ";"):
                    $subparts = GeneralUtility::trimExplode(';', $pV);
                    $subpartParams = [];
                    foreach ($subparts as $spV) {
                        // Pad so that a sub-parameter without ":" does not read an undefined offset.
                        list($pKey, $pVal) = array_pad(GeneralUtility::trimExplode(':', $spV), 2, null);
                        $subpartParams[$pKey] = $pVal;
                    }

                    $table = $subpartParams['_TABLE'];

                    // Table exists:
                    if (isset($TCA[$table])) {
                        $lookUpPid = isset($subpartParams['_PID']) ? intval($subpartParams['_PID']) : $pid;
                        $pidField = isset($subpartParams['_PIDFIELD']) ? trim($subpartParams['_PIDFIELD']) : 'pid';
                        $where = isset($subpartParams['_WHERE']) ? $subpartParams['_WHERE'] : '';
                        $addTable = isset($subpartParams['_ADDTABLE']) ? $subpartParams['_ADDTABLE'] : '';

                        $fieldName = !empty($subpartParams['_FIELD']) ? $subpartParams['_FIELD'] : 'uid';
                        if ($fieldName === 'uid' || !empty($TCA[$table]['columns'][$fieldName])) {
                            $queryBuilder = $this->getQueryBuilder($table);

                            $queryBuilder->getRestrictions()
                                ->removeAll()
                                ->add(GeneralUtility::makeInstance(DeletedRestriction::class));

                            $queryBuilder
                                ->select($fieldName)
                                ->from($table)
                                ->where(
                                    $queryBuilder->expr()->eq($queryBuilder->quoteIdentifier($pidField), $queryBuilder->createNamedParameter($lookUpPid, \PDO::PARAM_INT)),
                                    $where
                                );

                            // Only touch the FROM part when an additional table was actually configured.
                            if (!empty($addTable)) {
                                // TODO: Check if this works as intended!
                                // NOTE(review): add() without $append presumably replaces the FROM part - verify.
                                $queryBuilder->add('from', $addTable);
                            }

                            $transOrigPointerField = $TCA[$table]['ctrl']['transOrigPointerField'] ?? null;

                            if (!empty($subpartParams['_ENABLELANG']) && $transOrigPointerField) {
                                $queryBuilder->andWhere(
                                    $queryBuilder->expr()->lte(
                                        $queryBuilder->quoteIdentifier($transOrigPointerField),
                                        0
                                    )
                                );
                            }

                            $statement = $queryBuilder->execute();

                            // Index the rows by the selected field so that the keys are the values to expand to.
                            $rows = [];
                            while ($row = $statement->fetch()) {
                                $rows[$row[$fieldName]] = $row;
                            }

                            $paramArray[$p] = array_merge($paramArray[$p], array_keys($rows));
                        }
                    }
                } else {
                    // Just add value:
                    $paramArray[$p][] = $pV;
                }

                // Hook for processing own expandParameters place holder
                $hooks = $GLOBALS['TYPO3_CONF_VARS']['SC_OPTIONS']['crawler/class.tx_crawler_lib.php']['expandParameters'] ?? null;
                if (is_array($hooks)) {
                    $_params = [
                        'pObj' => &$this,
                        'paramArray' => &$paramArray,
                        'currentKey' => $p,
                        'currentValue' => $pV,
                        'pid' => $pid
                    ];
                    foreach ($hooks as $_funcRef) {
                        GeneralUtility::callUserFunction($_funcRef, $_params, $this);
                    }
                }
            }

            // Make unique set of values and sort array by key:
            $paramArray[$p] = array_unique($paramArray[$p]);
            ksort($paramArray);
        }

        return $paramArray;
    }
1056
1057
    /**
     * Compiles URLs from a parameter array (output of expandParameters()).
     * Every already accumulated URL is combined with every value of the first parameter; the remaining
     * parameters are then handled recursively, so the number of URLs is the multiplication of the number
     * of parameter values for each key.
     *
     * @param array $paramArray Output of expandParameters(): Array with keys (GET var names) and for each an array of values
     * @param array $urls URLs accumulated in this array (for recursion)
     * @return array
     */
    public function compileUrls($paramArray, $urls = [])
    {
        // End of recursion: no parameter left, or nothing valid to append to.
        if (!count($paramArray) || !is_array($urls)) {
            return $urls;
        }

        // Take the first parameter (name and value set) off the stack:
        reset($paramArray);
        $varName = key($paramArray);
        $valueSet = array_shift($paramArray);

        // Combine every URL with every value of the set:
        $combinedUrls = [];
        foreach ($urls as $baseUrl) {
            foreach ($valueSet as $value) {
                $queryPart = '';
                if (strcmp($value, '')) {
                    $queryPart = '&' . rawurlencode($varName) . '=' . rawurlencode($value);
                }
                $combinedUrls[] = $baseUrl . $queryPart;

                $urlLimit = MathUtility::forceIntegerInRange($this->extensionSettings['maxCompileUrls'], 1, 1000000000, 10000);
                if (count($combinedUrls) > $urlLimit) {
                    break;
                }
            }
        }

        // Continue with the remaining parameters:
        return $this->compileUrls($paramArray, $combinedUrls);
    }
1090
1091
    /************************************
1092
     *
1093
     * Crawler log
1094
     *
1095
     ************************************/
1096
1097
    /**
     * Returns the records of the crawler queue for a page ID or, when flushing is requested, deletes them instead.
     *
     * @param integer $id Page ID for which to look up log entries.
     * @param string $filter Filter: "pending" => only entries that are not yet run (exec_time = 0), "finished" => only complete ones (exec_time > 0), any other value => all entries
     * @param boolean $doFlush If TRUE, the matching entries are DELETED(!) instead of selected and an empty array is returned
     * @param boolean $doFullFlush If TRUE together with $doFlush, the queue is flushed without any page or filter condition
     * @param integer $itemsPerPage Limit the amount of entries per page, default is 10; a value <= 0 disables the limit
     * @return array
     */
    public function getLogEntriesForPageId($id, $filter = '', $doFlush = false, $doFullFlush = false, $itemsPerPage = 10)
    {
        $queryBuilder = GeneralUtility::makeInstance(ConnectionPool::class)->getQueryBuilderForTable($this->tableName);
        $queryBuilder
            ->select('*')
            ->from($this->tableName)
            ->where(
                $queryBuilder->expr()->eq('page_id', $queryBuilder->createNamedParameter($id, \PDO::PARAM_INT))
            )
            ->orderBy('scheduled', 'DESC');

        $expressionBuilder = GeneralUtility::makeInstance(ConnectionPool::class)
            ->getConnectionForTable($this->tableName)
            ->getExpressionBuilder();

        // AND-composite that collects the conditions for flushQueue(); add() extends it in place.
        $flushCondition = $expressionBuilder->andX();

        switch ($filter) {
            case 'pending':
                $queryBuilder->andWhere($queryBuilder->expr()->eq('exec_time', 0));
                $flushCondition->add($expressionBuilder->eq('exec_time', 0));
                break;
            case 'finished':
                $queryBuilder->andWhere($queryBuilder->expr()->gt('exec_time', 0));
                $flushCondition->add($expressionBuilder->gt('exec_time', 0));
                break;
        }

        // FIXME: Write unit test that ensures that the right records are deleted.
        if ($doFlush) {
            $flushCondition->add($expressionBuilder->eq('page_id', intval($id)));
            $this->flushQueue($doFullFlush ? '1=1' : $flushCondition);
            return [];
        }

        if ($itemsPerPage > 0) {
            $queryBuilder->setMaxResults((int)$itemsPerPage);
        }

        return $queryBuilder->execute()->fetchAll();
    }
1152
1153
    /**
     * Returns the records of the crawler queue for a set ID or, when flushing is requested, deletes them instead.
     *
     * @param integer $set_id Set ID for which to look up log entries.
     * @param string $filter Filter: "pending" => only entries that are not yet run (exec_time = 0), "finished" => only complete ones (exec_time > 0), any other value => all entries
     * @param boolean $doFlush If TRUE, the matching entries are DELETED(!) instead of selected and an empty array is returned
     * @param boolean $doFullFlush If TRUE together with $doFlush, the queue is flushed without any set or filter condition
     * @param integer $itemsPerPage Limit the amount of entries per page, default is 10; a value <= 0 disables the limit
     * @return array
     */
    public function getLogEntriesForSetId($set_id, $filter = '', $doFlush = false, $doFullFlush = false, $itemsPerPage = 10)
    {
        $queryBuilder = GeneralUtility::makeInstance(ConnectionPool::class)->getQueryBuilderForTable($this->tableName);
        $queryBuilder
            ->select('*')
            ->from($this->tableName)
            ->where(
                $queryBuilder->expr()->eq('set_id', $queryBuilder->createNamedParameter($set_id, \PDO::PARAM_INT))
            )
            ->orderBy('scheduled', 'DESC');

        $expressionBuilder = GeneralUtility::makeInstance(ConnectionPool::class)
            ->getConnectionForTable($this->tableName)
            ->getExpressionBuilder();

        // AND-composite that collects the conditions for flushQueue(); add() extends it in place.
        $flushCondition = $expressionBuilder->andX();

        // FIXME: Write Unit tests for Filters
        switch ($filter) {
            case 'pending':
                $queryBuilder->andWhere($queryBuilder->expr()->eq('exec_time', 0));
                $flushCondition->add($expressionBuilder->eq('exec_time', 0));
                break;
            case 'finished':
                $queryBuilder->andWhere($queryBuilder->expr()->gt('exec_time', 0));
                $flushCondition->add($expressionBuilder->gt('exec_time', 0));
                break;
        }

        // FIXME: Write unit test that ensures that the right records are deleted.
        if ($doFlush) {
            $flushCondition->add($expressionBuilder->eq('set_id', intval($set_id)));
            // An empty condition makes flushQueue() remove every entry.
            $this->flushQueue($doFullFlush ? '' : $flushCondition);
            return [];
        }

        if ($itemsPerPage > 0) {
            $queryBuilder->setMaxResults((int)$itemsPerPage);
        }

        return $queryBuilder->execute()->fetchAll();
    }
1206
1207
    /**
     * Removes queue entries.
     *
     * If an observer is registered for the "queueEntryFlush" event, the affected entries are first
     * reported per set ID through that event; afterwards the entries are deleted.
     *
     * @param string $where SQL condition for the entries which should be removed (callers in this class
     *                      also pass expression objects that convert to such a string); an empty
     *                      condition removes all entries
     * @return void
     */
    protected function flushQueue($where = '')
    {
        $realWhere = strlen((string)$where) > 0 ? $where : '1=1';

        if (EventDispatcher::getInstance()->hasObserver('queueEntryFlush')) {
            // Every statement gets its own query builder: reusing one instance would accumulate
            // the FROM parts of the previous statement.
            $groupQueryBuilder = $this->getQueryBuilder($this->tableName);
            $groups = $groupQueryBuilder
                // selectLiteral() because select() would quote "DISTINCT set_id" as one identifier.
                ->selectLiteral('DISTINCT ' . $groupQueryBuilder->quoteIdentifier('set_id'))
                ->from($this->tableName)
                ->where($realWhere)
                ->execute()
                ->fetchAll();

            if (is_array($groups)) {
                foreach ($groups as $group) {
                    $subSetQueryBuilder = $this->getQueryBuilder($this->tableName);
                    // NOTE(review): addUrl() reads 'qid' as the insert ID column of tx_crawler_queue;
                    // confirm that a 'uid' column exists on this table.
                    $subSet = $subSetQueryBuilder
                        ->select('uid', 'set_id')
                        ->from($this->tableName)
                        ->where(
                            $realWhere,
                            $subSetQueryBuilder->expr()->eq(
                                'set_id',
                                $subSetQueryBuilder->createNamedParameter($group['set_id'], \PDO::PARAM_INT)
                            )
                        )
                        ->execute()
                        ->fetchAll();
                    EventDispatcher::getInstance()->post('queueEntryFlush', $group['set_id'], $subSet);
                }
            }
        }

        $this->getQueryBuilder($this->tableName)
            ->delete($this->tableName)
            ->where($realWhere)
            ->execute();
    }
1248
1249
    /**
     * Adds a call back entry to the queue table (called from hooks typically, see indexed search class "class.crawler.php")
     *
     * @param integer $setId Set ID
     * @param array $params Parameters to pass to call back function; a non-array value is replaced by an empty array
     * @param string $callBack Call back object reference, eg. 'EXT:indexed_search/class.crawler.php:&tx_indexedsearch_crawler'
     * @param integer $page_id Page ID to attach it to
     * @param integer $schedule Time at which to activate; 0 means the current time
     * @return void
     */
    public function addQueueEntry_callBack($setId, $params, $callBack, $page_id = 0, $schedule = 0)
    {
        $callBackParameters = is_array($params) ? $params : [];
        $callBackParameters['_CALLBACKOBJ'] = $callBack;

        // Fall back to the current time when no schedule time was given.
        $scheduledTime = intval($schedule);
        if (!$scheduledTime) {
            $scheduledTime = $this->getCurrentTime();
        }

        $queueEntry = [
            'page_id' => intval($page_id),
            'parameters' => serialize($callBackParameters),
            'scheduled' => $scheduledTime,
            'exec_time' => 0,
            'set_id' => intval($setId),
            'result_data' => '',
        ];

        $connection = GeneralUtility::makeInstance(ConnectionPool::class)->getConnectionForTable('tx_crawler_queue');
        $connection->insert('tx_crawler_queue', $queueEntry);
    }
1279
1280
    /************************************
1281
     *
1282
     * URL setting
1283
     *
1284
     ************************************/
1285
1286
    /**
     * Puts a single URL into the crawler queue.
     *
     * The request parameters (URL, simulated frontend user groups, processing
     * instructions and root template pid) are serialized into the queue record.
     * Depending on $this->registerQueueEntriesInternallyOnly the record is either
     * only collected in $this->queueEntries or inserted into tx_crawler_queue,
     * unless getDuplicateRowsIfExist() reports an equal entry.
     *
     * @param integer $id Page ID
     * @param string $url Complete URL
     * @param array $subCfg Sub configuration array (from TS config)
     * @param integer $tstamp Scheduled-time
     * @param string $configurationHash (optional) configuration hash
     * @param bool $skipInnerDuplicationCheck (optional) skip inner duplication check
     * @return bool TRUE only if a new record was inserted into tx_crawler_queue
     */
    public function addUrl(
        $id,
        $url,
        array $subCfg,
        $tstamp,
        $configurationHash = '',
        $skipInnerDuplicationCheck = false
    ) {
        // Creating parameters (key order matters, the array gets serialized and hashed):
        $parameters = ['url' => $url];

        // fe user group simulation:
        $feUserGroupList = implode(
            ',',
            array_unique(GeneralUtility::intExplode(',', $subCfg['userGroups'], true))
        );
        if ($feUserGroupList) {
            $parameters['feUserGroupList'] = $feUserGroupList;
        }

        // Setting processing instructions
        $parameters['procInstructions'] = GeneralUtility::trimExplode(',', $subCfg['procInstrFilter']);
        if (is_array($subCfg['procInstrParams.'])) {
            $parameters['procInstrParams'] = $subCfg['procInstrParams.'];
        }

        // Possible TypoScript Template Parents
        $parameters['rootTemplatePid'] = $subCfg['rootTemplatePid'];

        // Compile value array:
        $serializedParameters = serialize($parameters);
        $fieldArray = [
            'page_id' => intval($id),
            'parameters' => $serializedParameters,
            'parameters_hash' => GeneralUtility::shortMD5($serializedParameters),
            'configuration_hash' => $configurationHash,
            'scheduled' => $tstamp,
            'exec_time' => 0,
            'set_id' => intval($this->setID),
            'result_data' => '',
            'configuration' => $subCfg['key'],
        ];

        if ($this->registerQueueEntriesInternallyOnly) {
            // the entries will only be registered and not stored to the database
            $this->queueEntries[] = $fieldArray;

            return false;
        }

        // check if there is already an equal entry (unless the caller opted out)
        $duplicateRows = $skipInnerDuplicationCheck
            ? []
            : $this->getDuplicateRowsIfExist($tstamp, $fieldArray);

        if (count($duplicateRows) != 0) {
            EventDispatcher::getInstance()->post(
                'duplicateUrlInQueue',
                $this->setID,
                ['rows' => $duplicateRows, 'fieldArray' => $fieldArray]
            );

            return false;
        }

        $queueConnection = GeneralUtility::makeInstance(ConnectionPool::class)
            ->getConnectionForTable('tx_crawler_queue');
        $queueConnection->insert('tx_crawler_queue', $fieldArray);
        $uid = $queueConnection->lastInsertId('tx_crawler_queue', 'qid');

        EventDispatcher::getInstance()->post(
            'urlAddedToQueue',
            $this->setID,
            ['uid' => $uid, 'fieldArray' => $fieldArray]
        );

        return true;
    }
1368
1369
    /**
     * This method determines duplicates for a queue entry with the same parameters and this timestamp.
     * If the timestamp is in the past, it will check if there is any unprocessed queue entry in the past.
     * If the timestamp is in the future it will check, if the queued entry has exactly the same timestamp
     *
     * Only entries that are still unprocessed (exec_time = 0) and not yet assigned
     * to a process (process_id = 0) are considered; page_id and parameters_hash
     * must match the given field array.
     *
     * @param int $tstamp Scheduled time of the entry that is about to be queued
     * @param array $fieldArray Queue record; 'page_id' and 'parameters_hash' are read
     *
     * @return array List of qid values of the duplicate entries (empty if none)
     *
     * TODO: Write Functional Tests
     */
    protected function getDuplicateRowsIfExist($tstamp, $fieldArray)
    {
        $rows = [];

        $currentTime = $this->getCurrentTime();

        $queryBuilder = GeneralUtility::makeInstance(ConnectionPool::class)->getQueryBuilderForTable($this->tableName);
        $queryBuilder
            ->select('qid')
            ->from('tx_crawler_queue');

        if ($tstamp <= $currentTime) {
            // this entry is scheduled with "now"
            if ($this->extensionSettings['enableTimeslot']) {
                $timeBegin = $currentTime - 100;
                $timeEnd = $currentTime + 100;
                $queryBuilder
                    ->where(
                        'scheduled BETWEEN ' . $timeBegin . ' AND ' . $timeEnd . ''
                    )
                    ->orWhere(
                        $queryBuilder->expr()->lte('scheduled', $currentTime)
                    );
            } else {
                $queryBuilder
                    ->where(
                        $queryBuilder->expr()->lte('scheduled', $currentTime)
                    );
            }
        } else {
            // entry with a timestamp in the future need to have the same schedule time;
            // the value comes from the caller, so bind it instead of concatenating it
            $queryBuilder
                ->where(
                    $queryBuilder->expr()->eq('scheduled', $queryBuilder->createNamedParameter($tstamp, \PDO::PARAM_INT))
                );
        }

        // Duplicates are entries that have NOT been executed and are NOT assigned to a process yet.
        // The previous "!= 0" comparisons selected exactly the opposite (already processed rows).
        // NOTE(review): process_id is compared numerically like before - confirm the column type.
        $statement = $queryBuilder
            ->andWhere('exec_time = 0')
            ->andWhere('process_id = 0')
            ->andWhere($queryBuilder->expr()->eq('page_id', $queryBuilder->createNamedParameter($fieldArray['page_id'], \PDO::PARAM_INT)))
            ->andWhere($queryBuilder->expr()->eq('parameters_hash', $queryBuilder->createNamedParameter($fieldArray['parameters_hash'], \PDO::PARAM_STR)))
            ->execute();

        while ($row = $statement->fetch()) {
            $rows[] = $row['qid'];
        }

        return $rows;
    }
1430
1431
    /**
     * Returns the current system time as Unix timestamp (seconds).
     *
     * @return int
     */
    public function getCurrentTime()
    {
        $now = time();

        return $now;
    }
1440
1441
    /************************************
1442
     *
1443
     * URL reading
1444
     *
1445
     ************************************/
1446
1447
    /**
     * Read URL for single queue entry
     *
     * Loads the queue record, writes exec_time (and the handling process id) to lock it,
     * executes the request through readUrl_exec() and finally stores the serialized
     * result in result_data.
     *
     * @param integer $queueId
     * @param boolean $force If set, will process even if exec_time has been set!
     * @return integer 0, or self::CLI_STATUS_POLLABLE_PROCESSED if a registered pollable instruction reported success
     */
    public function readUrl($queueId, $force = false)
    {
        $queryBuilder = GeneralUtility::makeInstance(ConnectionPool::class)->getQueryBuilderForTable($this->tableName);
        $ret = 0;
        if ($this->debugMode) {
            $this->getLogger()->log(
                LogLevel::DEBUG,
                'crawler-readurl start ' . microtime(true)
            );
        }
        // Get entry:
        $queryBuilder
            ->select('*')
            ->from('tx_crawler_queue')
            ->where(
                $queryBuilder->expr()->eq('qid', $queryBuilder->createNamedParameter($queueId, \PDO::PARAM_INT))
            );
        if (!$force) {
            $queryBuilder
                ->andWhere('exec_time = 0')
                ->andWhere('process_scheduled > 0');
        }
        $queueRec = $queryBuilder->execute()->fetch();

        if (!is_array($queueRec)) {
            // No matching (or no longer pending) entry: report "nothing processed"
            // as integer, as documented, instead of returning NULL.
            return $ret;
        }

        $parameters = unserialize($queueRec['parameters']);
        if ($parameters['rootTemplatePid']) {
            $this->initTSFE((int)$parameters['rootTemplatePid']);
        } else {
            // NOTE(review): GeneralUtility::sysLog() is deprecated since TYPO3 v9 and removed in v10.
            GeneralUtility::sysLog(
                'Page with (' . $queueRec['page_id'] . ') could not be crawled, please check your crawler configuration. Perhaps no Root Template Pid is set',
                'crawler',
                GeneralUtility::SYSLOG_SEVERITY_WARNING
            );
        }

        SignalSlotUtility::emitSignal(
            __CLASS__,
            SignalSlotUtility::SIGNNAL_QUEUEITEM_PREPROCESS,
            [$queueId, &$queueRec]
        );

        // Set exec_time to lock record:
        $field_array = ['exec_time' => $this->getCurrentTime()];

        if (isset($this->processID)) {
            //if mulitprocessing is used we need to store the id of the process which has handled this entry
            $field_array['process_id_completed'] = $this->processID;
        }

        // The lock has to be written through the connection: QueryBuilder::update() only
        // switches the builder to UPDATE mode and never executed anything here, and the
        // former where-clause referenced the undefined variable $queueI.
        $connectionForCrawlerQueue = GeneralUtility::makeInstance(ConnectionPool::class)->getConnectionForTable('tx_crawler_queue');
        $connectionForCrawlerQueue->update(
            'tx_crawler_queue',
            $field_array,
            ['qid' => (int)$queueId]
        );

        $result = $this->readUrl_exec($queueRec);
        // readUrl_exec() may return the string 'ERROR' or FALSE; only arrays carry a 'content' key
        $resultData = (is_array($result) && isset($result['content'])) ? unserialize($result['content']) : false;

        //atm there's no need to point to specific pollable extensions
        if (is_array($resultData) && is_array($GLOBALS['TYPO3_CONF_VARS']['EXTCONF']['crawler']['pollSuccess'])) {
            foreach ($GLOBALS['TYPO3_CONF_VARS']['EXTCONF']['crawler']['pollSuccess'] as $pollable) {
                // only check the success value if the instruction is running
                // it is important to name the pollSuccess key same as the procInstructions key
                if (is_array($resultData['parameters']['procInstructions']) && in_array(
                    $pollable,
                    $resultData['parameters']['procInstructions']
                )
                ) {
                    if (!empty($resultData['success'][$pollable]) && $resultData['success'][$pollable]) {
                        $ret |= self::CLI_STATUS_POLLABLE_PROCESSED;
                    }
                }
            }
        }

        // Set result in log which also denotes the end of the processing of this entry.
        $field_array = ['result_data' => serialize($result)];

        SignalSlotUtility::emitSignal(
            __CLASS__,
            SignalSlotUtility::SIGNNAL_QUEUEITEM_POSTPROCESS,
            [$queueId, &$field_array]
        );

        $connectionForCrawlerQueue->update(
            'tx_crawler_queue',
            $field_array,
            ['qid' => (int)$queueId]
        );

        if ($this->debugMode) {
            $this->getLogger()->log(
                LogLevel::DEBUG,
                'crawler-readurl stop ' . microtime(true)
            );
        }

        return $ret;
    }
1558
1559
    /**
     * Read URL for not-yet-inserted log-entry
     *
     * Inserts the given record into tx_crawler_queue (already locked through exec_time),
     * executes it via readUrl_exec() and writes the serialized result back to the new row.
     *
     * @param array $field_array Queue field array,
     *
     * @return array|string|bool Whatever readUrl_exec() returned for the record
     */
    public function readUrlFromArray($field_array)
    {
        $queueTable = 'tx_crawler_queue';

        // Set exec_time to lock record:
        $field_array['exec_time'] = $this->getCurrentTime();

        $queueConnection = GeneralUtility::makeInstance(ConnectionPool::class)->getConnectionForTable($queueTable);
        $queueConnection->insert($queueTable, $field_array);

        $queueId = $queueConnection->lastInsertId($queueTable, 'qid');
        $field_array['qid'] = $queueId;

        $result = $this->readUrl_exec($field_array);

        // Set result in log which also denotes the end of the processing of this entry.
        $resultFields = ['result_data' => serialize($result)];

        // Slots receive the fields by reference and may alter them before they are stored
        SignalSlotUtility::emitSignal(
            __CLASS__,
            SignalSlotUtility::SIGNNAL_QUEUEITEM_POSTPROCESS,
            [$queueId, &$resultFields]
        );

        $queueConnection->update($queueTable, $resultFields, ['qid' => $queueId]);

        return $result;
    }
1597
1598
    /**
     * Read URL for a queue record
     *
     * Either delegates to the callback object named in the record's parameters
     * ("_CALLBACKOBJ") or performs a regular frontend request via requestUrl().
     *
     * @param array $queueRec Queue record
     * @return array|string|bool 'ERROR' if the parameters cannot be decoded, otherwise the result of the callback/request
     */
    public function readUrl_exec($queueRec)
    {
        // Decode parameters:
        $parameters = unserialize($queueRec['parameters']);
        if (!is_array($parameters)) {
            return 'ERROR';
        }

        // Calling object:
        if ($parameters['_CALLBACKOBJ']) {
            $objRef = $parameters['_CALLBACKOBJ'];
            $callBackObj = GeneralUtility::makeInstance($objRef);
            if (!is_object($callBackObj)) {
                return ['content' => 'No object: ' . $objRef];
            }

            unset($parameters['_CALLBACKOBJ']);

            return ['content' => serialize($callBackObj->crawler_execute($parameters, $this))];
        }

        // Regular FE request:
        // the crawler id is "<qid>:<hash>", the hash is built from qid, set_id and the encryption key
        $hashBase = $queueRec['qid'] . '|' . $queueRec['set_id'] . '|' . $GLOBALS['TYPO3_CONF_VARS']['SYS']['encryptionKey'];
        $crawlerId = $queueRec['qid'] . ':' . md5($hashBase);

        // Get result:
        $result = $this->requestUrl($parameters['url'], $crawlerId);

        EventDispatcher::getInstance()->post(
            'urlCrawled',
            $queueRec['set_id'],
            ['url' => $parameters['url'], 'result' => $result]
        );

        return $result;
    }
1633
1634
    /**
     * Gets the content of a URL.
     *
     * Depending on the extension settings the request is either sent directly
     * (sendDirectRequest()) or through a socket connection, optionally via the
     * configured curl proxy server. 301/302 redirects are followed if "follow30x"
     * is enabled, limited by $recursion.
     *
     * @param string $originalUrl URL to read
     * @param string $crawlerId Crawler ID string (qid + hash to verify)
     * @param integer $timeout Timeout time
     * @param integer $recursion Recursion limiter for 302 redirects
     * @return array|boolean Array with 'request', 'headers' and 'content' (plus 'parentRequest' for redirects), FALSE on failure
     */
    public function requestUrl($originalUrl, $crawlerId, $timeout = 2, $recursion = 10)
    {
        if (!$recursion) {
            return false;
        }

        // Parse URL, checking for scheme:
        $url = parse_url($originalUrl);

        if ($url === false) {
            // $url is FALSE here, so the original string has to be logged
            $this->getLogger()->log(
                LogLevel::DEBUG,
                sprintf('Could not parse_url() for string "%s"', $originalUrl),
                ['crawlerId' => $crawlerId]
            );
            return false;
        }

        // parse_url() omits components that are not part of the URL
        $scheme = isset($url['scheme']) ? $url['scheme'] : '';

        if (!in_array($scheme, ['', 'http', 'https'], true)) {
            $this->getLogger()->log(
                LogLevel::DEBUG,
                sprintf('Scheme does not match for url "%s"', $originalUrl),
                ['crawlerId' => $crawlerId]
            );
            return false;
        }

        // direct request
        if ($this->extensionSettings['makeDirectRequests']) {
            $result = $this->sendDirectRequest($originalUrl, $crawlerId);
            return $result;
        }

        $reqHeaders = $this->buildRequestHeaderArray($url, $crawlerId);

        // thanks to Pierrick Caillon for adding proxy support
        $rurl = $url;

        if ($GLOBALS['TYPO3_CONF_VARS']['SYS']['curlUse'] && $GLOBALS['TYPO3_CONF_VARS']['SYS']['curlProxyServer']) {
            // connect to the proxy and request the absolute URL from it
            $rurl = parse_url($GLOBALS['TYPO3_CONF_VARS']['SYS']['curlProxyServer']);
            $url['path'] = $scheme . '://' . $url['host'] . (!empty($url['port']) ? ':' . $url['port'] : '') . $url['path'];
            $reqHeaders = $this->buildRequestHeaderArray($url, $crawlerId);
        }

        $host = $rurl['host'];

        if ($scheme === 'https') {
            $host = 'ssl://' . $host;
            $port = !empty($rurl['port']) ? $rurl['port'] : 443;
        } else {
            $port = !empty($rurl['port']) ? $rurl['port'] : 80;
        }

        $startTime = microtime(true);
        $fp = fsockopen($host, $port, $errno, $errstr, $timeout);

        if (!$fp) {
            $this->getLogger()->log(
                LogLevel::DEBUG,
                sprintf('Error while opening "%s"', $originalUrl),
                ['crawlerId' => $crawlerId]
            );
            return false;
        }

        // Request message:
        $msg = implode("\r\n", $reqHeaders) . "\r\n\r\n";
        fputs($fp, $msg);

        // Read response:
        $d = $this->getHttpResponseFromStream($fp);
        fclose($fp);

        $time = microtime(true) - $startTime;
        $this->log($originalUrl . ' ' . $time);

        // Implode content and headers:
        $result = [
            'request' => $msg,
            'headers' => implode('', $d['headers']),
            'content' => implode('', (array)$d['content'])
        ];

        $user = isset($url['user']) ? $url['user'] : '';
        $pass = isset($url['pass']) ? $url['pass'] : '';

        if (($this->extensionSettings['follow30x']) && ($newUrl = $this->getRequestUrlFrom302Header($d['headers'], $user, $pass))) {
            // Follow the redirect exactly once per hop. A leftover second call used to pass
            // the recursion counter as timeout and merged its (possibly FALSE) result unchecked.
            $newRequestUrl = $this->requestUrl($newUrl, $crawlerId, $timeout, --$recursion);

            if (is_array($newRequestUrl)) {
                $result = array_merge(['parentRequest' => $result], $newRequestUrl);
            } else {
                $this->getLogger()->log(
                    LogLevel::DEBUG,
                    sprintf('Error while opening "%s"', $newUrl),
                    ['crawlerId' => $crawlerId]
                );
                return false;
            }
        }

        return $result;
    }
1744
1745
    /**
     * Gets the base path of the website frontend.
     * (e.g. if you call http://mydomain.com/cms/index.php in
     * the browser the base path is "/cms/")
     *
     * Sources, in order of precedence: extension setting "frontendBasePath",
     * $GLOBALS['TSFE']->absRefPrefix, and (outside CLI mode) TYPO3_SITE_PATH.
     *
     * @return string Base path of the website frontend
     */
    protected function getFrontendBasePath()
    {
        $basePath = '/';
        $isCliRequest = defined('TYPO3_REQUESTTYPE_CLI') && TYPO3_REQUESTTYPE_CLI;

        if (!empty($this->extensionSettings['frontendBasePath'])) {
            // Get the path from the extension settings:
            $basePath = $this->extensionSettings['frontendBasePath'];
        } elseif (!empty($GLOBALS['TSFE']->absRefPrefix)) {
            // If empty, try to use config.absRefPrefix:
            $basePath = $GLOBALS['TSFE']->absRefPrefix;
        } elseif (!$isCliRequest) {
            // If not in CLI mode the base path can be determined from $_SERVER environment:
            $basePath = GeneralUtility::getIndpEnv('TYPO3_SITE_PATH');
        }

        if ($basePath === '/') {
            return $basePath;
        }

        // Base path must be '/<pathSegements>/':
        return rtrim('/' . ltrim($basePath, '/'), '/') . '/';
    }
1775
1776
    /**
     * Executes a shell command and returns the outputted result.
     *
     * The command is handed to shell_exec() unchanged; no escaping happens here.
     *
     * @param string $command Shell command to be executed
     * @return string Outputted result of the command execution
     */
    protected function executeShellCommand($command)
    {
        $output = shell_exec($command);

        return $output;
    }
1786
1787
    /**
     * Reads HTTP response from the given stream.
     *
     * Header lines are read (and trimmed) up to the first empty line; everything
     * after that is collected unmodified as content.
     *
     * @param  resource $streamPointer  Pointer to connection stream.
     * @return array                    Associative array with the following items:
     *                                  headers <array> Response headers sent by server.
     *                                  content <array> Content, with each line as an array item.
     */
    protected function getHttpResponseFromStream($streamPointer)
    {
        $response = ['headers' => [], 'content' => []];

        if (!is_resource($streamPointer)) {
            return $response;
        }

        // read headers
        // fgets() returns FALSE at the end of the stream; compare explicitly, because a
        // chunk consisting only of "0" is falsy and would end the loop too early.
        while (($line = fgets($streamPointer, 2048)) !== false) {
            $line = trim($line);
            if ($line === '') {
                // empty line separates headers from content
                break;
            }
            $response['headers'][] = $line;
        }

        // read content
        while (($line = fgets($streamPointer, 2048)) !== false) {
            $response['content'][] = $line;
        }

        return $response;
    }
1818
1819
    /**
     * Appends a timestamped line to the log file configured in the extension
     * settings ("logFileName"). Without a configured file nothing happens; if the
     * file cannot be written an INFO entry is passed to the logger instead.
     *
     * @param string $message Text to append to the log file
     * @return void
     */
    protected function log($message)
    {
        if (empty($this->extensionSettings['logFileName'])) {
            return;
        }

        $logLine = date('Ymd His') . ' ' . $message . PHP_EOL;
        $fileResult = @file_put_contents($this->extensionSettings['logFileName'], $logLine, FILE_APPEND);
        if ($fileResult) {
            return;
        }

        $this->getLogger()->log(
            LogLevel::INFO,
            sprintf('File "%s" could not be written, please check file permissions.', $this->extensionSettings['logFileName'])
        );
    }
1835
1836
    /**
     * Builds HTTP request headers.
     *
     * Produces the request line (HTTP/1.0 GET) followed by the Host, Connection,
     * X-T3crawler and User-Agent headers. A backend cookie is added for workspace
     * preview URLs and a Basic Authorization header if the URL carries a user.
     *
     * @param array $url URL components as returned by parse_url()
     * @param string $crawlerId
     *
     * @return array List of header lines (without line endings)
     */
    protected function buildRequestHeaderArray(array $url, $crawlerId)
    {
        $queryPart = $url['query'] ? '?' . $url['query'] : '';

        $reqHeaders = [
            'GET ' . $url['path'] . $queryPart . ' HTTP/1.0',
            'Host: ' . $url['host'],
        ];

        // workspace preview requests get a backend user cookie
        if (stristr($url['query'], 'ADMCMD_previewWS')) {
            $reqHeaders[] = 'Cookie: $Version="1"; be_typo_user="1"; $Path=/';
        }

        $reqHeaders[] = 'Connection: close';

        if ($url['user'] != '') {
            $credentials = base64_encode($url['user'] . ':' . $url['pass']);
            $reqHeaders[] = 'Authorization: Basic ' . $credentials;
        }

        $reqHeaders[] = 'X-T3crawler: ' . $crawlerId;
        $reqHeaders[] = 'User-Agent: TYPO3 crawler';

        return $reqHeaders;
    }
1860
1861
    /**
     * Check if the submitted HTTP-Header contains a redirect location and built new crawler-url
     *
     * Only responses whose status line contains "301 Moved", "302 Found" or "302 Moved"
     * are considered. If a user is given, the credentials are inserted into the new URL.
     *
     * @param array $headers HTTP Header
     * @param string $user HTTP Auth. User
     * @param string $pass HTTP Auth. Password
     * @return bool|string The URL to request next, FALSE if there is no (usable) redirect
     */
    protected function getRequestUrlFrom302Header($headers, $user = '', $pass = '')
    {
        if (!is_array($headers) || !isset($headers[0])) {
            return false;
        }
        if (!(stristr($headers[0], '301 Moved') || stristr($headers[0], '302 Found') || stristr($headers[0], '302 Moved'))) {
            return false;
        }

        $location = null;
        foreach ($headers as $hl) {
            // Split at the first separator only: the value itself may contain ": ".
            // Lines without separator (e.g. the status line) are skipped.
            $tmp = explode(': ', $hl, 2);
            if (count($tmp) !== 2) {
                continue;
            }
            // header field names are case-insensitive
            if (strcasecmp(trim($tmp[0]), 'Location') === 0) {
                $location = trim($tmp[1]);
                break;
            }
        }
        if ($location === null) {
            return false;
        }

        if ($user != '') {
            if (!($tmp = parse_url($location))) {
                return false;
            }
            // NOTE(review): a relative Location (no scheme/host) still yields an unusable URL here.
            $newUrl = $tmp['scheme'] . '://' . $user . ':' . $pass . '@' . $tmp['host']
                // keep a non-default port of the redirect target
                . (!empty($tmp['port']) ? ':' . $tmp['port'] : '')
                . (isset($tmp['path']) ? $tmp['path'] : '');
            if (isset($tmp['query']) && $tmp['query'] != '') {
                $newUrl .= '?' . $tmp['query'];
            }
        } else {
            $newUrl = $location;
        }
        return $newUrl;
    }
1903
1904
    /**************************
1905
     *
1906
     * tslib_fe hooks:
1907
     *
1908
     **************************/
1909
1910
    /**
     * Initialization hook (called after database connection)
     * Takes the "HTTP_X_T3CRAWLER" header and looks up queue record and verifies if the session comes from the system (by comparing hashes)
     *
     * The header has the form "<qid>:<hash>"; the hash is an md5 over qid, set_id and
     * the encryption key. Requests with a non-matching header are terminated.
     *
     * @param array $params Parameters from frontend
     * @param object $ref TSFE object (reference under PHP5), not used
     * @return void
     *
     * FIXME: Look like this is not used, in commit 9910d3f40cce15f4e9b7bcd0488bf21f31d53ebc it's added as public,
     * FIXME: I think this can be removed. (TNM)
     */
    public function fe_init(&$params, $ref)
    {
        // Authenticate crawler request:
        if (!isset($_SERVER['HTTP_X_T3CRAWLER'])) {
            return;
        }

        $queryBuilder = GeneralUtility::makeInstance(ConnectionPool::class)->getQueryBuilderForTable($this->tableName);

        // The header is client controlled and may not contain the ":" separator at all
        $headerParts = explode(':', (string)$_SERVER['HTTP_X_T3CRAWLER']);
        $queueId = $headerParts[0];
        $hash = isset($headerParts[1]) ? $headerParts[1] : '';

        $queueRec = $queryBuilder
            ->select('*')
            ->from('tx_crawler_queue')
            ->where(
                $queryBuilder->expr()->eq('qid', $queryBuilder->createNamedParameter($queueId, \PDO::PARAM_INT))
            )
            ->execute()
            ->fetch();

        $hashMatches = false;
        if (is_array($queueRec)) {
            $expectedHash = md5($queueRec['qid'] . '|' . $queueRec['set_id'] . '|' . $GLOBALS['TYPO3_CONF_VARS']['SYS']['encryptionKey']);
            // constant-time comparison, the hash is derived from the encryption key
            $hashMatches = hash_equals($expectedHash, $hash);
        }

        // If a crawler record was found and hash was matching, set it up:
        if ($hashMatches) {
            $params['pObj']->applicationData['tx_crawler']['running'] = true;
            $params['pObj']->applicationData['tx_crawler']['parameters'] = unserialize($queueRec['parameters']);
            $params['pObj']->applicationData['tx_crawler']['log'] = [];
        } else {
            die('No crawler entry found!');
        }
    }
1947
1948
    /*****************************
1949
     *
1950
     * Compiling URLs to crawl - tools
1951
     *
1952
     *****************************/
1953
1954
    /**
1955
     * @param integer $id Root page id to start from.
1956
     * @param integer $depth Depth of tree, 0=only id-page, 1= on sublevel, 99 = infinite
1957
     * @param integer $scheduledTime Unix Time when the URL is timed to be visited when put in queue
1958
     * @param integer $reqMinute Number of requests per minute (creates the interleave between requests)
1959
     * @param boolean $submitCrawlUrls If set, submits the URLs to queue in database (real crawling)
1960
     * @param boolean $downloadCrawlUrls If set (and submitcrawlUrls is false) will fill $downloadUrls with entries)
1961
     * @param array $incomingProcInstructions Array of processing instructions
1962
     * @param array $configurationSelection Array of configuration keys
1963
     * @return string
1964
     */
1965
    public function getPageTreeAndUrls(
1966
        $id,
1967
        $depth,
1968
        $scheduledTime,
1969
        $reqMinute,
1970
        $submitCrawlUrls,
1971
        $downloadCrawlUrls,
1972
        array $incomingProcInstructions,
1973
        array $configurationSelection
1974
    ) {
1975
        global $BACK_PATH;
1976
        global $LANG;
1977
        if (!is_object($LANG)) {
1978
            $LANG = GeneralUtility::makeInstance(LanguageService::class);
1979
            $LANG->init(0);
1980
        }
1981
        $this->scheduledTime = $scheduledTime;
1982
        $this->reqMinute = $reqMinute;
1983
        $this->submitCrawlUrls = $submitCrawlUrls;
1984
        $this->downloadCrawlUrls = $downloadCrawlUrls;
1985
        $this->incomingProcInstructions = $incomingProcInstructions;
1986
        $this->incomingConfigurationSelection = $configurationSelection;
1987
1988
        $this->duplicateTrack = [];
1989
        $this->downloadUrls = [];
1990
1991
        // Drawing tree:
1992
        /* @var PageTreeView $tree */
1993
        $tree = GeneralUtility::makeInstance(PageTreeView::class);
1994
        $perms_clause = $GLOBALS['BE_USER']->getPagePermsClause(1);
1995
        $tree->init('AND ' . $perms_clause);
1996
1997
        $pageInfo = BackendUtility::readPageAccess($id, $perms_clause);
1998
        if (is_array($pageInfo)) {
1999
            // Set root row:
2000
            $tree->tree[] = [
2001
                'row' => $pageInfo,
2002
                'HTML' => IconUtility::getIconForRecord('pages', $pageInfo)
2003
            ];
2004
        }
2005
2006
        // Get branch beneath:
2007
        if ($depth) {
2008
            $tree->getTree($id, $depth, '');
2009
        }
2010
2011
        // Traverse page tree:
2012
        $code = '';
2013
2014
        foreach ($tree->tree as $data) {
2015
            $this->MP = false;
2016
2017
            // recognize mount points
2018
            if ($data['row']['doktype'] == 7) {
2019
                $queryBuilder = GeneralUtility::makeInstance(ConnectionPool::class)->getQueryBuilderForTable($this->tableName);
2020
                $queryBuilder->getRestrictions()->removeAll()->add(GeneralUtility::makeInstance(DeletedRestriction::class));
2021
                $mountpage = $queryBuilder
2022
                    ->select('*')
2023
                    ->from('pages')
2024
                    ->where(
2025
                        $queryBuilder->expr()->eq('uid', $queryBuilder->createNamedParameter($data['row']['uid'], \PDO::PARAM_INT))
2026
                    )
2027
                    ->execute()
2028
                    ->fetchAll();
2029
                $queryBuilder->getRestrictions()->reset();
2030
2031
                // fetch mounted pages
2032
                $this->MP = $mountpage[0]['mount_pid'] . '-' . $data['row']['uid'];
0 ignored issues
show
Documentation Bug introduced by
The property $MP was declared of type boolean, but $mountpage[0]['mount_pid...' . $data['row']['uid'] is of type string. Maybe add a type cast?

This check looks for assignments to scalar types that may be of the wrong type.

To ensure the code behaves as expected, it may be a good idea to add an explicit type cast.

$answer = 42;

$correct = false;

$correct = (bool) $answer;
Loading history...
2033
2034
                $mountTree = GeneralUtility::makeInstance(PageTreeView::class);
2035
                $mountTree->init('AND ' . $perms_clause);
2036
                $mountTree->getTree($mountpage[0]['mount_pid'], $depth, '');
2037
2038
                foreach ($mountTree->tree as $mountData) {
2039
                    $code .= $this->drawURLs_addRowsForPage(
2040
                        $mountData['row'],
2041
                        $mountData['HTML'] . BackendUtility::getRecordTitle('pages', $mountData['row'], true)
2042
                    );
2043
                }
2044
2045
                // replace page when mount_pid_ol is enabled
2046
                if ($mountpage[0]['mount_pid_ol']) {
2047
                    $data['row']['uid'] = $mountpage[0]['mount_pid'];
2048
                } else {
2049
                    // if the mount_pid_ol is not set the MP must not be used for the mountpoint page
2050
                    $this->MP = false;
2051
                }
2052
            }
2053
2054
            $code .= $this->drawURLs_addRowsForPage(
2055
                $data['row'],
2056
                $data['HTML'] . BackendUtility::getRecordTitle('pages', $data['row'], true)
2057
            );
2058
        }
2059
2060
        return $code;
2061
    }
2062
2063
    /**
     * Expands an exclude string into the list of page uids it covers.
     *
     * The string is a comma separated list of "<pid>" or "<pid>+<depth>" parts:
     * - "<pid>" covers only that page,
     * - "<pid>+<depth>" additionally covers the subtree below it, $depth levels deep,
     * - "<pid>+" (or an empty/zero depth after the "+") uses a depth of 99.
     *
     * Results are memoised in static caches, once per exclude string and once
     * per (pid, depth) subtree.
     *
     * @param string $excludeString Exclude string
     * @return array Page uids: the pids as written in the string plus the uids read from the page tree
     */
    public function expandExcludeString($excludeString)
    {
        // internal static caches;
        static $expandedExcludeStringCache = [];
        static $treeCache = [];

        // Array keys cannot be null, so normalise once and use the same key for lookup and store.
        $cacheKey = (string)$excludeString;

        // isset() instead of empty(): an empty result is a valid cache entry as well.
        if (!isset($expandedExcludeStringCache[$cacheKey])) {
            $pidList = [];

            if (!empty($excludeString)) {
                /** @var PageTreeView $tree */
                $tree = GeneralUtility::makeInstance(PageTreeView::class);
                $tree->init('AND ' . $this->backendUser->getPagePermsClause(1));

                foreach (GeneralUtility::trimExplode(',', $excludeString) as $excludePart) {
                    // Pad to two elements so a part without "+" does not raise an undefined offset.
                    list($pid, $depth) = array_pad(GeneralUtility::trimExplode('+', $excludePart), 2, null);

                    // default is "page only" = "depth=0"
                    if (empty($depth)) {
                        $depth = strpos($excludePart, '+') !== false ? 99 : 0;
                    }

                    $pidList[] = $pid;

                    if ($depth > 0) {
                        // A subtree without pages is cached too, hence isset() rather than empty().
                        if (!isset($treeCache[$pid][$depth])) {
                            $tree->reset();
                            $tree->getTree($pid, $depth);
                            $treeCache[$pid][$depth] = $tree->tree;
                        }

                        foreach ($treeCache[$pid][$depth] as $data) {
                            $pidList[] = $data['row']['uid'];
                        }
                    }
                }
            }

            $expandedExcludeStringCache[$cacheKey] = array_unique($pidList);
        }

        return $expandedExcludeStringCache[$cacheKey];
    }
2114
2115
    /**
     * Create the rows for display of the page tree
     * For each page a number of rows are shown displaying GET variable configuration
     *
     * Configurations that are not part of $this->incomingConfigurationSelection
     * (when that selection is non-empty) are dropped before rendering. Pages listed
     * in a configuration's "exclude" setting get a "No entries" row for that
     * configuration instead of a URL list.
     *
     * @param array $pageRow Page row
     * @param string $pageTitleAndIcon Page icon and title for row; emitted as-is, so it must already be HTML
     * @return string HTML <tr> content (one or more)
     */
    public function drawURLs_addRowsForPage(array $pageRow, $pageTitleAndIcon)
    {
        $skipMessage = '';

        // Get list of configurations
        $configurations = $this->getUrlsForPageRow($pageRow, $skipMessage);

        // Cast so that an unset selection counts as "no selection" instead of raising a count() warning.
        $selection = (array)$this->incomingConfigurationSelection;
        if (count($selection) > 0) {
            // remove configuration that does not match the current selection
            foreach ($configurations as $confKey => $confArray) {
                if (!in_array($confKey, $selection)) {
                    unset($configurations[$confKey]);
                }
            }
        }

        // Traverse parameter combinations:
        $c = 0;
        $content = '';
        if (count($configurations)) {
            foreach ($configurations as $confKey => $confArray) {
                // All sub configuration keys are optional; read them through this guarded copy.
                $subCfg = $confArray['subCfg'] ?? [];

                // Title column: rendered once, on the first row, spanning all configuration rows
                if (!$c) {
                    $titleClm = '<td rowspan="' . count($configurations) . '">' . $pageTitleAndIcon . '</td>';
                } else {
                    $titleClm = '';
                }

                if (!in_array($pageRow['uid'], $this->expandExcludeString($subCfg['exclude'] ?? ''))) {
                    // URL list:
                    $urlList = $this->urlListFromUrlArray(
                        $confArray,
                        $pageRow,
                        $this->scheduledTime,
                        $this->reqMinute,
                        $this->submitCrawlUrls,
                        $this->downloadCrawlUrls,
                        $this->duplicateTrack,
                        $this->downloadUrls,
                        $this->incomingProcInstructions // if empty the urls won't be filtered by processing instructions
                    );

                    // Expanded parameters:
                    $paramExpanded = '';
                    $calcAccu = [];
                    $calcRes = 1;
                    foreach (($confArray['paramExpanded'] ?? []) as $gVar => $gVal) {
                        $valueCount = count($gVal);
                        $paramExpanded .= '
                            <tr>
                                <td class="bgColor4-20">' . htmlspecialchars('&' . $gVar . '=') . '<br/>' .
                                                '(' . $valueCount . ')' .
                                                '</td>
                                <td class="bgColor4" nowrap="nowrap">' . nl2br(htmlspecialchars(implode(chr(10), $gVal))) . '</td>
                            </tr>
                        ';
                        $calcRes *= $valueCount;
                        $calcAccu[] = $valueCount;
                    }
                    $paramExpanded = '<table class="lrPadding c-list param-expanded">' . $paramExpanded . '</table>';
                    $paramExpanded .= 'Comb: ' . implode('*', $calcAccu) . '=' . $calcRes;

                    // Options: configuration values are escaped like every other cell of the row
                    $optionValues = '';
                    if (!empty($subCfg['userGroups'])) {
                        $optionValues .= 'User Groups: ' . htmlspecialchars((string)$subCfg['userGroups']) . '<br/>';
                    }
                    if (!empty($subCfg['baseUrl'])) {
                        $optionValues .= 'Base Url: ' . htmlspecialchars((string)$subCfg['baseUrl']) . '<br/>';
                    }
                    if (!empty($subCfg['procInstrFilter'])) {
                        $optionValues .= 'ProcInstr: ' . htmlspecialchars((string)$subCfg['procInstrFilter']) . '<br/>';
                    }

                    // Compile row:
                    $content .= '
                        <tr class="bgColor' . ($c % 2 ? '-20' : '-10') . '">
                            ' . $titleClm . '
                            <td>' . htmlspecialchars($confKey) . '</td>
                            <td>' . nl2br(htmlspecialchars(rawurldecode(trim(str_replace('&', chr(10) . '&', GeneralUtility::implodeArrayForUrl('', $confArray['paramParsed'])))))) . '</td>
                            <td>' . $paramExpanded . '</td>
                            <td nowrap="nowrap">' . $urlList . '</td>
                            <td nowrap="nowrap">' . $optionValues . '</td>
                            <td nowrap="nowrap">' . DebugUtility::viewArray($subCfg['procInstrParams.'] ?? null) . '</td>
                        </tr>';
                } else {
                    $content .= '<tr class="bgColor' . ($c % 2 ? '-20' : '-10') . '">
                            ' . $titleClm . '
                            <td>' . htmlspecialchars($confKey) . '</td>
                            <td colspan="5"><em>No entries</em> (Page is excluded in this configuration)</td>
                        </tr>';
                }

                $c++;
            }
        } else {
            $message = !empty($skipMessage) ? ' (' . $skipMessage . ')' : '';

            // Compile row:
            $content .= '
                <tr class="bgColor-20" style="border-bottom: 1px solid black;">
                    <td>' . $pageTitleAndIcon . '</td>
                    <td colspan="6"><em>No entries</em>' . $message . '</td>
                </tr>';
        }

        return $content;
    }
2232
2233
    /*****************************
2234
     *
2235
     * CLI functions
2236
     *
2237
     *****************************/
2238
2239
    /**
     * Main function for running from Command Line PHP script (cron job)
     * See ext/crawler/cli/crawler_cli.phpsh for details
     *
     * Parses the given arguments, and - unless the crawler is disabled or no
     * process could be acquired - runs CLI_run(), removes processes without
     * assigned items and releases the current process.
     *
     * @param array $args Command line tokens, parsed into options by setCliArgs()
     * @return int self::CLI_STATUS_* values combined with bitwise OR
     */
    public function CLI_main($args)
    {
        $this->setCliArgs($args);
        $this->setAccessMode('cli');
        $result = self::CLI_STATUS_NOTHING_PROCCESSED;
        // NOTE(review): the created object is never used in this method. The instantiation is
        // kept only in case the constructor has side effects - confirm and remove.
        GeneralUtility::makeInstance(CrawlerCommandLineController::class);

        if (!$this->getDisabled() && $this->CLI_checkAndAcquireNewProcess($this->CLI_buildProcessId())) {
            // The value index is passed explicitly: cli_argValue() declares it as a required argument.
            $countInARun = $this->cli_argValue('--countInARun', 0)
                ? (int)$this->cli_argValue('--countInARun', 0)
                : $this->extensionSettings['countInARun'];
            // Seconds
            $sleepAfterFinish = $this->cli_argValue('--sleepAfterFinish', 0)
                ? (int)$this->cli_argValue('--sleepAfterFinish', 0)
                : $this->extensionSettings['sleepAfterFinish'];
            // Milliseconds
            $sleepTime = $this->cli_argValue('--sleepTime', 0)
                ? (int)$this->cli_argValue('--sleepTime', 0)
                : $this->extensionSettings['sleepTime'];

            try {
                // Run process:
                $result = $this->CLI_run($countInARun, $sleepTime, $sleepAfterFinish);
            } catch (\Exception $e) {
                $this->CLI_debug(get_class($e) . ': ' . $e->getMessage());
                $result = self::CLI_STATUS_ABORTED;
            }

            // Cleanup
            $this->processRepository->deleteProcessesWithoutItemsAssigned();

            //TODO can't we do that in a clean way?
            $this->CLI_releaseProcesses($this->CLI_buildProcessId());

            // Count once so the logged number and the returned status are based on the same value.
            $unprocessedItems = $this->queueRepository->countUnprocessedItems();
            $this->CLI_debug("Unprocessed Items remaining:" . $unprocessedItems . " (" . $this->CLI_buildProcessId() . ")");
            $result |= ($unprocessedItems > 0 ? self::CLI_STATUS_REMAIN : self::CLI_STATUS_NOTHING_PROCCESSED);
        } else {
            $result |= self::CLI_STATUS_ABORTED;
        }

        return $result;
    }
2281
2282
    /**
2283
     * Helper function
2284
     *
2285
     * @param string $option Option string, eg. "-s
2286
     * @param int $idx Value index, default is 0 (zero) = the first one...
2287
     * @return string
2288
     */
2289
    private function cli_argValue($option, $idx) {
2290
        return is_array($this->cli_args[$option]) ? $this->cli_args[$option][$idx] : '';
0 ignored issues
show
Bug introduced by
The property cli_args does not exist. Did you maybe forget to declare it?

In PHP it is possible to write to properties without declaring them. For example, the following is perfectly valid PHP code:

class MyClass { }

$x = new MyClass();
$x->foo = true;

Generally, it is a good practice to explictly declare properties to avoid accidental typos and provide IDE auto-completion:

class MyClass {
    public $foo;
}

$x = new MyClass();
$x->foo = true;
Loading history...
2291
    }
2292
2293
    /**
2294
     * Helper function
2295
     *
2296
     * @param string $string The string to output
2297
     */
2298
    private function cli_echo($string) {
2299
        $this->outputLine($string);
0 ignored issues
show
Bug introduced by
The method outputLine() does not seem to exist on object<AOE\Crawler\Controller\CrawlerController>.

This check looks for calls to methods that do not seem to exist on a given type. It looks for the method on the type itself as well as in inherited classes or implemented interfaces.

This is most likely a typographical error or the method has been renamed.

Loading history...
2300
    }
2301
2302
    /**
2303
     * Set cli args
2304
     *
2305
     * This is a copy from the CommandLineController from TYPO3 < v9
2306
     *
2307
     * TODO: Rework
2308
     *
2309
     * @param array $argv
2310
     */
2311
    private function setCliArgs(array $argv) {
2312
        $cli_options = [];
2313
        $index = '_DEFAULT';
2314
        foreach ($argv as $token) {
2315
            // Options starting with a number is invalid - they could be negative values!
2316
            if ($token[0] === '-' && !MathUtility::canBeInterpretedAsInteger($token[1])) {
2317
                list($index, $opt) = explode('=', $token, 2);
2318
                if (isset($cli_options[$index])) {
2319
                    echo 'ERROR: Option ' . $index . ' was used twice!' . LF;
2320
                    die;
2321
                }
2322
                $cli_options[$index] = [];
2323
                if (isset($opt)) {
2324
                    $cli_options[$index][] = $opt;
2325
                }
2326
            } else {
2327
                $cli_options[$index][] = $token;
2328
            }
2329
        }
2330
2331
        $this->cliArgs = $cli_options;
2332
    }
2333
2334
2335
2336
    /**
     * Function executed by crawler_im.php cli script.
     *
     * Collects the URLs for a page (and its subtree) and, depending on the "-o"
     * option, prints them ("url"), executes them right away ("exec"), puts them
     * into the queue ("queue") or only lists what was found (any other value).
     *
     * Options read: "-o" (action), "-d" (depth, 0-99), "-n" (forced into the
     * range 1-1000, 30 when the option is absent), "-proc" (comma separated
     * processing instructions) and the positional page id.
     *
     * @param array $args Command line tokens; when non-empty they are parsed with setCliArgs()
     * @return void
     */
    public function CLI_main_im($args = [])
    {
        $this->setAccessMode('cli_im');

        if (!empty($args)) {
            $this->setCliArgs($args);
        }

        // Force user to admin state and set workspace to "Live":
        $this->backendUser->user['admin'] = 1;
        $this->backendUser->setWorkspace(0);

        // The value index is passed explicitly: cli_argValue() declares it as a required argument.
        $outputMode = $this->cli_argValue('-o', 0);
        $submitToQueue = $outputMode === 'queue' || $outputMode === 'exec';

        if ($outputMode === 'exec') {
            $this->registerQueueEntriesInternallyOnly = true;
        }

        // Positional arguments are collected under "_DEFAULT" by setCliArgs().
        $defaultArgs = $this->cliArgs['_DEFAULT'] ?? [];

        if (isset($defaultArgs[2])) {
            // Crawler is called over TYPO3 BE
            $pageId = MathUtility::forceIntegerInRange($defaultArgs[2], 0);
        } else {
            // Crawler is called over cli
            $pageId = MathUtility::forceIntegerInRange($defaultArgs[1] ?? 0, 0);
        }

        $configurationKeys = $this->getConfigurationKeys();

        if (!is_array($configurationKeys)) {
            $configurations = $this->getUrlsForPageId($pageId);
            if (is_array($configurations)) {
                $configurationKeys = array_keys($configurations);
            } else {
                $configurationKeys = [];
            }
        }

        if ($submitToQueue) {
            $reason = new Reason();
            $reason->setReason(Reason::REASON_GUI_SUBMIT);
            $reason->setDetailText('The cli script of the crawler added to the queue');
            // NOTE(review): the event is posted with the setID from before the new one
            // is generated below - confirm that this order is intended.
            EventDispatcher::getInstance()->post(
                'invokeQueueChange',
                $this->setID,
                ['reason' => $reason]
            );
        }

        if ($this->extensionSettings['cleanUpOldQueueEntries']) {
            $this->cleanUpOldQueueEntries();
        }

        $this->setID = (int) GeneralUtility::md5int(microtime());
        $this->getPageTreeAndUrls(
            $pageId,
            MathUtility::forceIntegerInRange($this->cli_argValue('-d', 0), 0, 99),
            $this->getCurrentTime(),
            MathUtility::forceIntegerInRange(isset($this->cliArgs['-n']) ? $this->cli_argValue('-n', 0) : 30, 1, 1000),
            $submitToQueue,
            $outputMode === 'url',
            GeneralUtility::trimExplode(',', $this->cli_argValue('-proc', 0), true),
            $configurationKeys
        );

        if ($outputMode === 'url') {
            $this->cli_echo(implode(chr(10), $this->downloadUrls) . chr(10), true);
        } elseif ($outputMode === 'exec') {
            $this->cli_echo("Executing " . count($this->urlList) . " requests right away:\n\n");
            $this->cli_echo(implode(chr(10), $this->urlList) . chr(10));
            $this->cli_echo("\nProcessing:\n");

            foreach ($this->queueEntries as $queueRec) {
                $p = unserialize($queueRec['parameters']);
                $this->cli_echo($p['url'] . ' (' . implode(',', $p['procInstructions']) . ') => ');

                $result = $this->readUrlFromArray($queueRec);

                $requestResult = unserialize($result['content']);
                if (is_array($requestResult)) {
                    $resLog = is_array($requestResult['log']) ? chr(10) . chr(9) . chr(9) . implode(chr(10) . chr(9) . chr(9), $requestResult['log']) : '';
                    $this->cli_echo('OK: ' . $resLog . chr(10));
                } else {
                    $this->cli_echo('Error checking Crawler Result: ' . substr(preg_replace('/\s+/', ' ', strip_tags($result['content'])), 0, 30000) . '...' . chr(10));
                }
            }
        } elseif ($outputMode === 'queue') {
            $this->cli_echo("Putting " . count($this->urlList) . " entries in queue:\n\n");
            $this->cli_echo(implode(chr(10), $this->urlList) . chr(10));
        } else {
            $this->cli_echo(count($this->urlList) . " entries found for processing. (Use -o to decide action):\n\n", true);
            $this->cli_echo(implode(chr(10), $this->urlList) . chr(10), true);
        }
    }
2432
2433
    /**
     * CLI entry point for the "flush" access mode.
     *
     * Depending on the "-o" option ("all", "finished" or "pending") this calls
     * getLogEntriesForPageId() for the page id given as first positional
     * argument; a page id of 0 requests a full flush. Any other mode does nothing.
     *
     * NOTE(review): the page id is read from the FlushCommandLineController
     * instance while "-o" is read through cli_argValue() - confirm that both
     * sources are populated with the same arguments.
     *
     * @return bool False when the mode is unknown or getLogEntriesForPageId() returned false, true otherwise
     */
    public function CLI_main_flush()
    {
        $this->setAccessMode('cli_flush');
        $cliObj = GeneralUtility::makeInstance(FlushCommandLineController::class);

        // Force user to admin state and set workspace to "Live":
        $this->backendUser->user['admin'] = 1;
        $this->backendUser->setWorkspace(0);

        $pageId = MathUtility::forceIntegerInRange($cliObj->cli_args['_DEFAULT'][1], 0);
        $fullFlush = ($pageId === 0);

        // The value index is passed explicitly: cli_argValue() declares it as a required argument.
        $mode = $this->cli_argValue('-o', 0);

        switch ($mode) {
            case 'all':
                $result = $this->getLogEntriesForPageId($pageId, '', true, $fullFlush);
                break;
            case 'finished':
            case 'pending':
                $result = $this->getLogEntriesForPageId($pageId, $mode, true, $fullFlush);
                break;
            default:
                // Unknown or missing mode: nothing was done, so do not report success.
                $result = false;
        }

        return $result !== false;
    }
2465
2466
    /**
     * Obtains configuration keys from the "-conf" CLI argument.
     *
     * The argument value is trimmed and split on commas via GeneralUtility::trimExplode().
     *
     * @return array Array of keys, or an empty array if the argument is empty
     */
    protected function getConfigurationKeys()
    {
        $rawKeys = trim($this->cli_argValue('-conf'));
        if ($rawKeys == '') {
            return [];
        }

        return GeneralUtility::trimExplode(',', $rawKeys);
    }
2476
2477
    /**
     * Running the functionality of the CLI (crawling URLs from queue)
     *
     * Steps: run the CLI hooks, optionally purge executed queue entries older than
     * the configured "purgeQueueDays", select up to $countInARun due entries,
     * reserve them for this process and read them one by one.
     *
     * @param int $countInARun maximum number of queue entries selected for this run
     * @param int $sleepTime value passed to usleep() after each processed entry
     * @param int $sleepAfterFinish value passed to sleep() once the batch is done
     * @return int bitmask combining the readUrl() results with self::CLI_STATUS_* flags
     */
    public function CLI_run($countInARun, $sleepTime, $sleepAfterFinish)
    {
        // Every statement below gets its own query builder so that query parts and
        // bound parameters of one statement cannot leak into the next one.
        $connectionPool = GeneralUtility::makeInstance(ConnectionPool::class);
        $result = 0;
        $counter = 0;

        // First, run hooks:
        $this->CLI_runHooks();

        // Clean up the queue
        $purgeQueueDays = intval($this->extensionSettings['purgeQueueDays']);
        if ($purgeQueueDays > 0) {
            $purgeDate = $this->getCurrentTime() - 24 * 60 * 60 * $purgeQueueDays;

            // The statement has to be executed; building it alone deletes nothing.
            $del = $connectionPool->getQueryBuilderForTable($this->tableName)
                ->delete($this->tableName)
                ->where(
                    'exec_time != 0 AND exec_time < ' . (int)$purgeDate
                )
                ->execute();

            // Strict check: an affected-row count of 0 is not a failure.
            if ($del === false) {
                $this->getLogger()->log(
                    LogLevel::INFO,
                    'Records could not be deleted.'
                );
            }
        }

        // Select entries:
        //TODO Shouldn't this reside within the transaction?
        $selectQueryBuilder = $connectionPool->getQueryBuilderForTable($this->tableName);
        $rows = $selectQueryBuilder
            ->select('qid', 'scheduled')
            ->from('tx_crawler_queue')
            ->where(
                $selectQueryBuilder->expr()->eq('exec_time', 0),
                $selectQueryBuilder->expr()->eq('process_scheduled', 0),
                $selectQueryBuilder->expr()->lte('scheduled', $this->getCurrentTime())
            )
            ->orderBy('scheduled')
            ->addOrderBy('qid')
            ->setMaxResults($countInARun)
            ->execute()
            ->fetchAll();

        if (count($rows) > 0) {
            $quidList = [];

            foreach ($rows as $r) {
                $quidList[] = $r['qid'];
            }

            $processId = $this->CLI_buildProcessId();

            //reserve queue entries for process

            //$this->queryBuilder->getConnection()->executeQuery('BEGIN');
            //TODO make sure we're not taking assigned queue-entires

            //save the number of assigned queue entrys to determine who many have been processed later
            $reserveQueryBuilder = $connectionPool->getQueryBuilderForTable($this->tableName);
            $numberOfAffectedRows = $reserveQueryBuilder
                ->update('tx_crawler_queue')
                ->where(
                    $reserveQueryBuilder->expr()->in('qid', $quidList)
                )
                // set() binds the value as a named parameter itself, so the plain
                // value is passed here instead of a pre-built placeholder.
                ->set('process_scheduled', $this->getCurrentTime())
                ->set('process_id', $processId)
                ->execute();

            // The process id is a string hash (see CLI_buildProcessId()); it must not
            // be cast to int or the wrong process row is addressed.
            $connectionPool->getConnectionForTable('tx_crawler_process')
                ->update(
                    'tx_crawler_process',
                    ['assigned_items_count' => (int)$numberOfAffectedRows],
                    ['process_id' => $processId]
                );

            if ($numberOfAffectedRows == count($quidList)) {
                //$this->queryBuilder->getConnection()->executeQuery('COMMIT');
            } else {
                //$this->queryBuilder->getConnection()->executeQuery('ROLLBACK');
                $this->CLI_debug("Nothing processed due to multi-process collision (" . $this->CLI_buildProcessId() . ")");
                return ($result | self::CLI_STATUS_ABORTED);
            }

            foreach ($rows as $r) {
                $result |= $this->readUrl($r['qid']);

                $counter++;
                usleep(intval($sleepTime)); // Just to relax the system

                // if during the start and the current read url the cli has been disable we need to return from the function
                // mark the process NOT as ended.
                if ($this->getDisabled()) {
                    return ($result | self::CLI_STATUS_ABORTED);
                }

                if (!$this->CLI_checkIfProcessIsActive($this->CLI_buildProcessId())) {
                    $this->CLI_debug("conflict / timeout (" . $this->CLI_buildProcessId() . ")");

                    //TODO might need an additional returncode
                    $result |= self::CLI_STATUS_ABORTED;
                    break; //possible timeout
                }
            }

            sleep(intval($sleepAfterFinish));

            $msg = 'Rows: ' . $counter;
            $this->CLI_debug($msg . " (" . $this->CLI_buildProcessId() . ")");
        } else {
            $this->CLI_debug("Nothing within queue which needs to be processed (" . $this->CLI_buildProcessId() . ")");
        }

        if ($counter > 0) {
            $result |= self::CLI_STATUS_PROCESSED;
        }

        return $result;
    }
2605
2606
    /**
     * Activate hooks
     *
     * Instantiates every class reference registered in
     * $TYPO3_CONF_VARS['EXTCONF']['crawler']['cli_hooks'] and calls its
     * crawler_init() method with this controller as argument.
     *
     * @return void
     */
    public function CLI_runHooks()
    {
        global $TYPO3_CONF_VARS;

        $hookReferences = $TYPO3_CONF_VARS['EXTCONF']['crawler']['cli_hooks'];
        if (!is_array($hookReferences)) {
            // No hooks registered.
            return;
        }

        foreach ($hookReferences as $hookReference) {
            $hook = GeneralUtility::makeInstance($hookReference);
            if (!is_object($hook)) {
                continue;
            }
            $hook->crawler_init($this);
        }
    }
2623
2624
    /**
     * Try to acquire a new process with the given id
     * also performs some auto-cleanup for orphan processes
     * @todo preemption might not be the most elegant way to clean up
     *
     * Active, non-deleted processes whose ttl lies in the past are treated as orphans
     * and released; the remaining ones count against the "processLimit" setting.
     *
     * @param string $id identification string for the process
     * @return boolean true when a process row was inserted, false when the system
     *                 process id is unavailable or the process limit is reached
     */
    public function CLI_checkAndAcquireNewProcess($id)
    {
        $queryBuilder = GeneralUtility::makeInstance(ConnectionPool::class)->getQueryBuilderForTable($this->tableName);

        $systemProcessId = getmypid();
        if ($systemProcessId < 1) {
            return false;
        }

        //$this->queryBuilder->getConnection()->executeQuery('BEGIN');

        $activeProcesses = $queryBuilder
            ->select('process_id', 'ttl')
            ->from('tx_crawler_process')
            ->where('active = 1 AND deleted = 0')
            ->execute();

        $now = $this->getCurrentTime();

        // Split the active processes into expired ones (orphans) and running ones.
        $runningCount = 0;
        $expiredProcessIds = [];
        while ($process = $activeProcesses->fetch()) {
            if ($process['ttl'] < $now) {
                $expiredProcessIds[] = $process['process_id'];
                continue;
            }
            $runningCount++;
        }

        // if there are less than allowed active processes then add a new one
        $processLimit = intval($this->extensionSettings['processLimit']);
        $acquired = $runningCount < $processLimit;

        if ($acquired) {
            $this->CLI_debug("add process " . $this->CLI_buildProcessId() . " (" . ($runningCount + 1) . "/" . $processLimit . ")");

            $processRecord = [
                'process_id' => $id,
                'active' => 1,
                'ttl' => $now + (int)$this->extensionSettings['processMaxRunTime'],
                'system_process_id' => $systemProcessId
            ];
            GeneralUtility::makeInstance(ConnectionPool::class)
                ->getConnectionForTable('tx_crawler_process')
                ->insert('tx_crawler_process', $processRecord);
        } else {
            $this->CLI_debug("Processlimit reached (" . ($runningCount) . "/" . $processLimit . ")");
        }

        $this->CLI_releaseProcesses($expiredProcessIds, true); // maybe this should be somehow included into the current lock
        $this->CLI_deleteProcessesMarkedDeleted();

        //$this->queryBuilder->getConnection()->executeQuery('COMMIT');

        return $acquired;
    }
2690
2691
    /**
     * Release a process and the required resources
     *
     * Runs four UPDATE statements: two general clean-up statements for inactive
     * processes and two statements that deactivate the requested processes and
     * detach their pending queue entries.
     *
     * @param  mixed    $releaseIds   string with a single process-id or array with multiple process-ids
     * @param  boolean  $withinLock   show whether the DB-actions are included within an existing lock
     * @return boolean  false when there is nothing to release, true otherwise
     */
    public function CLI_releaseProcesses($releaseIds, $withinLock = false)
    {
        // Each statement uses its own query builder: re-using one builder would carry
        // "set" parts and bound parameters of an earlier statement into a later one.
        $connectionPool = GeneralUtility::makeInstance(ConnectionPool::class);

        if (!is_array($releaseIds)) {
            $releaseIds = [$releaseIds];
        }

        if (count($releaseIds) === 0) {
            return false;   //nothing to release
        }

        if (!$withinLock) {
            //$this->queryBuilder->getConnection()->executeQuery('BEGIN');
        }

        // some kind of 2nd chance algo - this way you need at least 2 processes to have a real cleanup
        // this ensures that a single process can't mess up the entire process table

        // detach queue entries from processes which are not active (and not deleted)
        $connectionPool->getQueryBuilderForTable($this->tableName)
            ->update('tx_crawler_queue', 'q')
            ->where(
                'q.process_id IN(SELECT p.process_id FROM tx_crawler_process as p WHERE p.active = 0 and p.deleted = 0)'
            )
            ->set('q.process_scheduled', 0)
            ->set('q.process_id', '')
            ->execute();

        // FIXME: Not entirely sure that this is equivalent to the previous version
        $inactiveProcessQueryBuilder = $connectionPool->getQueryBuilderForTable($this->tableName);
        $inactiveProcessQueryBuilder
            ->update('tx_crawler_process', 'p')
            ->where(
                $inactiveProcessQueryBuilder->expr()->eq('p.active', 0),
                'p.process_id IN(SELECT q.process_id FROM tx_crawler_queue as q WHERE q.exec_time = 0)'
            )
            ->set('p.system_process_id', 0)
            ->execute();
        // previous version for reference
        /*
        $GLOBALS['TYPO3_DB']->exec_UPDATEquery(
            'tx_crawler_process',
            'active=0 AND deleted=0
            AND NOT EXISTS (
                SELECT * FROM tx_crawler_queue
                WHERE tx_crawler_queue.process_id = tx_crawler_process.process_id
                AND tx_crawler_queue.exec_time = 0
            )',
            [
                'deleted' => '1',
                'system_process_id' => 0
            ]
        );*/

        // mark all requested processes as non-active
        $deactivateQueryBuilder = $connectionPool->getQueryBuilderForTable($this->tableName);
        $deactivateQueryBuilder
            ->update('tx_crawler_process')
            ->where(
                'NOT EXISTS (
                SELECT * FROM tx_crawler_queue
                    WHERE tx_crawler_queue.process_id = tx_crawler_process.process_id
                    AND tx_crawler_queue.exec_time = 0
                )',
                $deactivateQueryBuilder->expr()->in('process_id', $deactivateQueryBuilder->createNamedParameter($releaseIds, Connection::PARAM_STR_ARRAY)),
                $deactivateQueryBuilder->expr()->eq('deleted', 0)
            )
            ->set('active', 0)
            ->execute();

        // detach the not yet executed queue entries of the requested processes
        // NOTE(review): this filters tx_crawler_queue on a "deleted" column - confirm the
        // queue table actually has that column.
        $detachQueryBuilder = $connectionPool->getQueryBuilderForTable($this->tableName);
        $detachQueryBuilder
            ->update('tx_crawler_queue')
            ->where(
                $detachQueryBuilder->expr()->eq('exec_time', 0),
                $detachQueryBuilder->expr()->in('process_id', $detachQueryBuilder->createNamedParameter($releaseIds, Connection::PARAM_STR_ARRAY)),
                $detachQueryBuilder->expr()->eq('deleted', 0)
            )
            ->set('process_scheduled', 0)
            ->set('process_id', '')
            ->execute();

        if (!$withinLock) {
            //$this->queryBuilder->getConnection()->executeQuery('COMMIT');
        }

        return true;
    }
2786
2787
    /**
     * Delete processes marked as deleted
     *
     * Removes every row of tx_crawler_process whose "deleted" flag is 1.
     *
     * @return void
     *
     * @deprecated since crawler v7.0.0, will be removed in crawler v8.0.0.
     * Please Consider using $this->processRepository->deleteProcessesMarkedAsDeleted()
     */
    public function CLI_deleteProcessesMarkedDeleted()
    {
        $connectionPool = GeneralUtility::makeInstance(ConnectionPool::class);
        $deleteQuery = $connectionPool->getQueryBuilderForTable($this->tableName);

        $deleteQuery->delete('tx_crawler_process');
        $deleteQuery->where('deleted = 1');
        $deleteQuery->execute();
    }
2803
2804
    /**
     * Check if there are still resources left for the process with the given id
     * Used to determine timeouts and to ensure a proper cleanup if there's a timeout
     *
     * Looks up the process row (ordered by ttl) and inspects the "active" flag of the
     * first match.
     *
     * @param  string $pid identification string for the process
     * @return boolean determines if the process is still active / has resources;
     *                 false when no matching row exists
     *
     * TODO: Please consider moving this to Domain Model for Process or in ProcessRepository
     */
    public function CLI_checkIfProcessIsActive($pid)
    {
        $queryBuilder = GeneralUtility::makeInstance(ConnectionPool::class)->getQueryBuilderForTable($this->tableName);

        // Process ids are string hashes (see CLI_buildProcessId()), so the id is bound
        // as a string parameter; casting it to int would compare against a wrong value.
        $row = $queryBuilder
            ->select('active')
            ->from('tx_crawler_process')
            ->where(
                $queryBuilder->expr()->eq('process_id', $queryBuilder->createNamedParameter((string)$pid))
            )
            ->orderBy('ttl')
            ->execute()
            ->fetch();

        return is_array($row) && intval($row['active']) == 1;
    }
2833
2834
    /**
     * Create a unique Id for the current process
     *
     * The id is derived once from the current microtime via GeneralUtility::shortMD5()
     * and cached in $this->processID for all later calls.
     *
     * @return string  the ID
     */
    public function CLI_buildProcessId()
    {
        if ($this->processID) {
            return $this->processID;
        }

        $seed = $this->microtime(true);
        $this->processID = GeneralUtility::shortMD5($seed);

        return $this->processID;
    }
2846
2847
    /**
     * Thin wrapper around PHP's native microtime().
     *
     * @param bool $get_as_float passed through to the native function
     *
     * @return mixed whatever the native microtime() returns for the given flag
     */
    protected function microtime($get_as_float = false)
    {
        $timestamp = \microtime($get_as_float);

        return $timestamp;
    }
2856
2857
    /**
     * Prints a message to the stdout (only if debug-mode is enabled)
     *
     * Debug mode is enabled when the "processDebug" extension setting is a non-zero integer.
     *
     * @param  string $msg  the message
     */
    public function CLI_debug($msg)
    {
        $debugEnabled = intval($this->extensionSettings['processDebug']);
        if (!$debugEnabled) {
            return;
        }

        echo $msg . "\n";
        flush();
    }
2869
2870
    /**
     * Get URL content by making direct request to TYPO3.
     *
     * Builds a shell command that runs the crawler's cli/bootstrap.php with the
     * configured PHP binary, passing the frontend base path, the URL and the
     * base64-encoded serialized request headers, then executes and logs it.
     *
     * @param  string $url          Page URL
     * @param  int    $crawlerId    Crawler-ID
     * @return array  empty array when the URL cannot be parsed, otherwise an array
     *                with the keys "request", "headers" and "content"
     */
    protected function sendDirectRequest($url, $crawlerId)
    {
        $urlParts = parse_url($url);
        if (!is_array($urlParts)) {
            return [];
        }

        $requestHeaders = $this->buildRequestHeaderArray($urlParts, $crawlerId);

        // Only the binary path goes through escapeshellcmd(); every argument is
        // escaped individually.
        $commandParts = [
            escapeshellcmd($this->extensionSettings['phpPath']),
            escapeshellarg(ExtensionManagementUtility::extPath('crawler') . 'cli/bootstrap.php'),
            escapeshellarg($this->getFrontendBasePath()),
            escapeshellarg($url),
            escapeshellarg(base64_encode(serialize($requestHeaders))),
        ];
        $command = implode(' ', $commandParts);

        $startedAt = microtime(true);
        $content = $this->executeShellCommand($command);
        $elapsed = microtime(true) - $startedAt;
        $this->log($url . ' ' . $elapsed);

        return [
            'request' => implode("\r\n", $requestHeaders) . "\r\n\r\n",
            'headers' => '',
            'content' => $content
        ];
    }
2908
2909
    /**
     * Cleans up entries that stayed for too long in the queue. These are:
     * - processed entries older than the "cleanUpProcessedAge" setting (in days)
     * - scheduled entries older than the "cleanUpScheduledAge" setting (in days)
     *
     * @return void
     */
    public function cleanUpOldQueueEntries()
    {
        $secondsPerDay = 86400; // 24*60*60 Seconds in 24 hours

        $processedMaxAge = $this->extensionSettings['cleanUpProcessedAge'] * $secondsPerDay;
        $scheduledMaxAge = $this->extensionSettings['cleanUpScheduledAge'] * $secondsPerDay;

        $now = time();
        $processedBefore = $now - $processedMaxAge;
        $scheduledBefore = $now - $scheduledMaxAge;

        $this->flushQueue(
            '(exec_time<>0 AND exec_time<' . $processedBefore . ') OR scheduled<=' . $scheduledBefore
        );
    }
2925
2926
    /**
     * Initializes a TypoScript Frontend necessary for using TypoScript and TypoLink functions
     *
     * Ensures a time tracker exists in $GLOBALS['TT'], then creates the frontend
     * controller in $GLOBALS['TSFE'] and runs its initialization calls in order.
     *
     * @param int $id
     * @param int $typeNum
     *
     * @return void
     */
    protected function initTSFE($id = 1, $typeNum = 0)
    {
        EidUtility::initTCA();

        if (!is_object($GLOBALS['TT'])) {
            $timeTracker = new TimeTracker(false);
            $GLOBALS['TT'] = $timeTracker;
            $timeTracker->start();
        }

        $tsfe = GeneralUtility::makeInstance(TypoScriptFrontendController::class, $GLOBALS['TYPO3_CONF_VARS'], $id, $typeNum);
        // Publish the controller globally before initializing it.
        $GLOBALS['TSFE'] = $tsfe;

        $tsfe->sys_page = GeneralUtility::makeInstance(PageRepository::class);
        $tsfe->sys_page->init(true);
        $tsfe->initFEuser();
        $tsfe->determineId();
        $tsfe->initTemplate();
        $tsfe->rootLine = $tsfe->sys_page->getRootLine($id, '');
        $tsfe->getConfigArray();
    }
2951
2952
    /**
     * Returns a md5 hash generated from a serialized configuration array.
     *
     * The keys "paramExpanded" and "URLs" are removed before hashing, so they do not
     * influence the result.
     *
     * @param array $configuration
     *
     * @return string
     */
    protected function getConfigurationHash(array $configuration)
    {
        unset($configuration['paramExpanded'], $configuration['URLs']);

        $serialized = serialize($configuration);

        return md5($serialized);
    }
2964
2965
    /**
     * Check whether the Crawling Protocol should be http or https
     *
     * A crawler configuration of -1 yields false, 1 yields true and 0 defers to the
     * page configuration, which is returned as given. Any other value yields false.
     *
     * @param $crawlerConfiguration
     * @param $pageConfiguration
     *
     * @return bool
     */
    protected function isCrawlingProtocolHttps($crawlerConfiguration, $pageConfiguration)
    {
        // Loose comparisons in this exact order, matching a switch statement.
        if ($crawlerConfiguration == -1) {
            return false;
        }

        if ($crawlerConfiguration == 0) {
            return $pageConfiguration;
        }

        return $crawlerConfiguration == 1;
    }
2985
}
2986