Completed
Push — Testing/fix ( f80687...a7e6fd )
by Tomas Norre
16:42 queued 15:05
created

CrawlerController::getHttpResponseFromStream()   A

Complexity

Conditions 5
Paths 3

Size

Total Lines 23

Duplication

Lines 0
Ratio 0 %

Code Coverage

Tests 11
CRAP Score 5

Importance

Changes 0
Metric Value
cc 5
nc 3
nop 1
dl 0
loc 23
rs 9.2408
c 0
b 0
f 0
ccs 11
cts 11
cp 1
crap 5
1
<?php
2
namespace AOE\Crawler\Controller;
3
4
/***************************************************************
5
 *  Copyright notice
6
 *
7
 *  (c) 2017 AOE GmbH <[email protected]>
8
 *
9
 *  All rights reserved
10
 *
11
 *  This script is part of the TYPO3 project. The TYPO3 project is
12
 *  free software; you can redistribute it and/or modify
13
 *  it under the terms of the GNU General Public License as published by
14
 *  the Free Software Foundation; either version 3 of the License, or
15
 *  (at your option) any later version.
16
 *
17
 *  The GNU General Public License can be found at
18
 *  http://www.gnu.org/copyleft/gpl.html.
19
 *
20
 *  This script is distributed in the hope that it will be useful,
21
 *  but WITHOUT ANY WARRANTY; without even the implied warranty of
22
 *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
23
 *  GNU General Public License for more details.
24
 *
25
 *  This copyright notice MUST APPEAR in all copies of the script!
26
 ***************************************************************/
27
28
use AOE\Crawler\Command\CrawlerCommandLineController;
29
use AOE\Crawler\Command\FlushCommandLineController;
30
use AOE\Crawler\Command\QueueCommandLineController;
31
use AOE\Crawler\Domain\Model\Reason;
32
use AOE\Crawler\Domain\Repository\ProcessRepository;
33
use AOE\Crawler\Domain\Repository\QueueRepository;
34
use AOE\Crawler\Event\EventDispatcher;
35
use AOE\Crawler\Utility\IconUtility;
36
use AOE\Crawler\Utility\SignalSlotUtility;
37
use TYPO3\CMS\Backend\Utility\BackendUtility;
38
use TYPO3\CMS\Backend\Tree\View\PageTreeView;
39
use TYPO3\CMS\Core\Authentication\BackendUserAuthentication;
40
use TYPO3\CMS\Core\Database\Connection;
41
use TYPO3\CMS\Core\Database\ConnectionPool;
42
use TYPO3\CMS\Core\Database\DatabaseConnection;
43
use TYPO3\CMS\Core\Database\Query\QueryBuilder;
44
use TYPO3\CMS\Core\Log\Logger;
45
use TYPO3\CMS\Core\Log\LogLevel;
46
use TYPO3\CMS\Core\TimeTracker\NullTimeTracker;
47
use TYPO3\CMS\Core\TimeTracker\TimeTracker;
48
use TYPO3\CMS\Core\Utility\DebugUtility;
49
use TYPO3\CMS\Core\Utility\ExtensionManagementUtility;
50
use TYPO3\CMS\Core\Utility\GeneralUtility;
51
use TYPO3\CMS\Core\Utility\MathUtility;
52
use TYPO3\CMS\Extbase\Object\ObjectManager;
53
use TYPO3\CMS\Frontend\Controller\TypoScriptFrontendController;
54
use TYPO3\CMS\Frontend\Page\PageGenerator;
55
use TYPO3\CMS\Frontend\Page\PageRepository;
56
use TYPO3\CMS\Frontend\Utility\EidUtility;
57
use TYPO3\CMS\Lang\LanguageService;
58
use TYPO3\CMS\Core\Database\Query\Restriction\DeletedRestriction;
59
use TYPO3\CMS\Core\Database\Query\Restriction\StartTimeRestriction;
60
use TYPO3\CMS\Core\Database\Query\Restriction\EndTimeRestriction;
61
use TYPO3\CMS\Core\TypoScript\Parser\TypoScriptParser;
62
63
/**
64
 * Class CrawlerController
65
 *
66
 * @package AOE\Crawler\Controller
67
 */
68
class CrawlerController
69
{
70
    const CLI_STATUS_NOTHING_PROCCESSED = 0;
71
    const CLI_STATUS_REMAIN = 1; //queue not empty
72
    const CLI_STATUS_PROCESSED = 2; //(some) queue items where processed
73
    const CLI_STATUS_ABORTED = 4; //instance didn't finish
74
    const CLI_STATUS_POLLABLE_PROCESSED = 8;
75
76
    /**
77
     * @var integer
78
     */
79
    public $setID = 0;
80
81
    /**
82
     * @var string
83
     */
84
    public $processID = '';
85
86
    /**
87
     * One hour is max stalled time for the CLI
88
     * If the process had the status "start" for 3600 seconds, it will be regarded stalled and a new process is started
89
     *
90
     * @var integer
91
     */
92
    public $max_CLI_exec_time = 3600;
93
94
    /**
95
     * @var array
96
     */
97
    public $duplicateTrack = [];
98
99
    /**
100
     * @var array
101
     */
102
    public $downloadUrls = [];
103
104
    /**
105
     * @var array
106
     */
107
    public $incomingProcInstructions = [];
108
109
    /**
110
     * @var array
111
     */
112
    public $incomingConfigurationSelection = [];
113
114
    /**
115
     * @var bool
116
     */
117
    public $registerQueueEntriesInternallyOnly = false;
118
119
    /**
120
     * @var array
121
     */
122
    public $queueEntries = [];
123
124
    /**
125
     * @var array
126
     */
127
    public $urlList = [];
128
129
    /**
130
     * @var boolean
131
     */
132
    public $debugMode = false;
133
134
    /**
135
     * @var array
136
     */
137
    public $extensionSettings = [];
138
139
    /**
140
     * Mount Point
141
     *
142
     * @var boolean
143
     */
144
    public $MP = false;
145
146
    /**
147
     * @var string
148
     */
149
    protected $processFilename;
150
151
    /**
152
     * Holds the internal access mode; can be 'gui', 'cli' or 'cli_im'
153
     *
154
     * @var string
155
     */
156
    protected $accessMode;
157
158
    /**
159
     * @var BackendUserAuthentication
160
     */
161
    private $backendUser;
162
163
    /**
164
     * @var integer
165
     */
166
    private $scheduledTime = 0;
167
168
    /**
169
     * @var integer
170
     */
171
    private $reqMinute = 0;
172
173
    /**
174
     * @var bool
175
     */
176
    private $submitCrawlUrls = false;
177
178
    /**
179
     * @var bool
180
     */
181
    private $downloadCrawlUrls = false;
182
183
    /**
184
     * @var QueueRepository
185
     */
186
    protected  $queueRepository;
187
188
    /**
189
     * @var ProcessRepository
190
     */
191
    protected $processRepository;
192
193
    /**
194
     * @var string
195
     */
196
    protected $tableName = 'tx_crawler_queue';
197
198
    /**
199
     * @var QueryBuilder
200
     */
201
    protected $queryBuilder = QueryBuilder::class;
202
203
    /**
204
     * @var array
205
     */
206
    private $cliArgs;
207
208
209
    /**
210
     * @var Logger
211
     */
212
    private $logger;
213
214
    /**
215
     * Returns the access mode, which can be 'gui', 'cli' or 'cli_im'
216
     *
217
     * @return string
218
     */
219 1
    public function getAccessMode()
    {
        // Hand back whatever mode is currently stored on the instance.
        $currentAccessMode = $this->accessMode;

        return $currentAccessMode;
    }
223
224
    /**
225
     * @param string $accessMode
226
     */
227 1
    public function setAccessMode($accessMode)
    {
        // Store the mode as given; no validation is applied here.
        $newAccessMode = $accessMode;
        $this->accessMode = $newAccessMode;
    }
231
232
    /**
233
     * Set disabled status to prevent processes from being processed
234
     *
235
     * @param  bool $disabled (optional, defaults to true)
236
     * @return void
237
     */
238 3
    public function setDisabled($disabled = true)
    {
        $markerFile = $this->processFilename;

        // Enabling: remove the marker file if one is present, then we are done.
        if (!$disabled) {
            if (is_file($markerFile)) {
                unlink($markerFile);
            }

            return;
        }

        // Disabling: write an empty marker file; getDisabled() checks for its existence.
        GeneralUtility::writeFile($markerFile, '');
    }
248
249
    /**
250
     * Get disable status
251
     *
252
     * @return bool true if disabled
253
     */
254 3
    public function getDisabled()
    {
        // Disabled state is represented purely by the existence of the marker file.
        $markerFileExists = is_file($this->processFilename);

        return $markerFileExists;
    }
258
259
    /**
260
     * @param string $filenameWithPath
261
     *
262
     * @return void
263
     */
264 4
    public function setProcessFilename($filenameWithPath)
    {
        // Path of the marker file used by setDisabled() / getDisabled().
        $markerFilePath = $filenameWithPath;
        $this->processFilename = $markerFilePath;
    }
268
269
    /**
270
     * @return string
271
     */
272 1
    public function getProcessFilename()
    {
        // Path of the marker file currently configured on this instance.
        $markerFilePath = $this->processFilename;

        return $markerFilePath;
    }
276
277
    /**
278
     * @return Logger
279
     */
280
    private function getLogger(): Logger
    {
        // Reuse the logger once it has been created.
        if ($this->logger !== null) {
            return $this->logger;
        }

        // First use: obtain a logger named after this class and cache it on the instance.
        $logManager = GeneralUtility::makeInstance(\TYPO3\CMS\Core\Log\LogManager::class);
        $this->logger = $logManager->getLogger(__CLASS__);

        return $this->logger;
    }
286
287
    /************************************
288
     *
289
     * Getting URLs based on Page TSconfig
290
     *
291
     ************************************/
292
293 31
    /**
     * Wires up the repositories, the process marker file, the extension settings
     * (with defaults for "countInARun" and "processLimit") and the query builder
     * for the queue table.
     */
    public function __construct()
    {
        $objectManager = GeneralUtility::makeInstance(ObjectManager::class);
        $this->queueRepository = $objectManager->get(QueueRepository::class);
        $this->processRepository = $objectManager->get(ProcessRepository::class);

        // Null-coalesce so a missing global does not raise an undefined-index notice.
        $this->backendUser = $GLOBALS['BE_USER'] ?? null;
        $this->processFilename = PATH_site . 'typo3temp/tx_crawler.proc';

        // The extension configuration is stored serialized; tolerate it being absent or malformed.
        // Objects are never expected in there, so class instantiation is disabled while decoding.
        $serializedSettings = $GLOBALS['TYPO3_CONF_VARS']['EXT']['extConf']['crawler'] ?? '';
        $settings = (is_string($serializedSettings) && $serializedSettings !== '')
            ? unserialize($serializedSettings, ['allowed_classes' => false])
            : false;
        $settings = is_array($settings) ? $settings : [];

        // read ext_em_conf_template settings and set
        $this->setExtensionSettings($settings);

        // set defaults: fall back to 100 when "countInARun" is missing or not a positive integer
        $countInARun = $this->extensionSettings['countInARun'] ?? 0;
        if (MathUtility::convertToPositiveInteger($countInARun) === 0) {
            $this->extensionSettings['countInARun'] = 100;
        }

        // "processLimit" is clamped to 1..99 and defaults to 1 when missing
        $this->extensionSettings['processLimit'] = MathUtility::forceIntegerInRange(
            $this->extensionSettings['processLimit'] ?? 1,
            1,
            99,
            1
        );
        $this->queryBuilder = GeneralUtility::makeInstance(ConnectionPool::class)
            ->getQueryBuilderForTable($this->tableName);
    }
316
317
    /**
318
     * Sets the extensions settings (unserialized pendant of $TYPO3_CONF_VARS['EXT']['extConf']['crawler']).
319
     *
320
     * @param array $extensionSettings
321
     * @return void
322
     */
323 40
    public function setExtensionSettings(array $extensionSettings)
    {
        // Replaces the complete settings array; nothing is merged with earlier values.
        $replacementSettings = $extensionSettings;
        $this->extensionSettings = $replacementSettings;
    }
327
328
    /**
329
     * Check if the given page should be crawled
330
     *
331
     * @param array $pageRow
332
     * @return false|string false if the page should be crawled (not excluded), otherwise the skip message explaining why it is skipped
333
     */
334 8
    public function checkIfPageShouldBeSkipped(array $pageRow)
    {
        // Hidden pages are skipped unless the "crawlHiddenPages" setting is enabled.
        // empty() keeps the original truthiness checks but tolerates missing keys.
        if (empty($this->extensionSettings['crawlHiddenPages']) && !empty($pageRow['hidden'])) {
            return 'Because page is hidden';
        }

        $doktype = $pageRow['doktype'] ?? null;

        // doktypes 3 and 4 and everything from 199 upwards are never crawled
        if (GeneralUtility::inList('3,4', $doktype) || $doktype >= 199) {
            return 'Because doktype is not allowed';
        }

        // Additional doktype lists to exclude, keyed by an identifier that ends up in the message
        $excludedDoktypeLists = $GLOBALS['TYPO3_CONF_VARS']['EXTCONF']['crawler']['excludeDoktype'] ?? null;
        if (is_array($excludedDoktypeLists)) {
            foreach ($excludedDoktypeLists as $key => $doktypeList) {
                if (GeneralUtility::inList($doktypeList, $doktype)) {
                    return 'Doktype was excluded by "' . $key . '"';
                }
            }
        }

        // veto hook
        $vetoHooks = $GLOBALS['TYPO3_CONF_VARS']['EXTCONF']['crawler']['pageVeto'] ?? null;
        if (is_array($vetoHooks)) {
            foreach ($vetoHooks as $key => $func) {
                $params = [
                    'pageRow' => $pageRow
                ];
                // expects "false" if page is ok and "true" or a skipMessage if this page should _not_ be crawled
                $veto = GeneralUtility::callUserFunction($func, $params, $this);
                if ($veto !== false) {
                    // no need to execute other hooks if a previous one returned a veto
                    return is_string($veto)
                        ? $veto
                        : 'Veto from hook "' . htmlspecialchars($key) . '"';
                }
            }
        }

        // Nothing objected: the page should be crawled.
        return false;
    }
391
392
    /**
393
     * Wrapper method for getUrlsForPageId()
394
     * It returns an array of configurations and no urls!
395
     *
396
     * @param array $pageRow Page record with at least dok-type and uid columns.
397
     * @param string $skipMessage
398
     * @return array
399
     * @see getUrlsForPageId()
400
     */
401 4
    public function getUrlsForPageRow(array $pageRow, &$skipMessage = '')
    {
        $message = $this->checkIfPageShouldBeSkipped($pageRow);

        // Page is excluded: report why through the by-reference argument and return no configurations.
        if ($message !== false) {
            $skipMessage = $message;

            return [];
        }

        // url_scheme 2 forces SSL. The value is normalised to int first because a row
        // holding the string '2' would never satisfy a strict comparison against integer 2.
        $forceSsl = (int)($pageRow['url_scheme'] ?? 0) === 2;
        $res = $this->getUrlsForPageId($pageRow['uid'], $forceSsl);
        $skipMessage = '';

        return $res;
    }
416
417
    /**
418
     * This method is used to count if there are ANY unprocessed queue entries
419
     * of a given page_id and the configuration which matches a given hash.
420
     * If there are none, we can skip the inner detail check
421
     *
422
     * @param  int $uid
423
     * @param  string $configurationHash
424
     * @return boolean
425
     */
426 5
    protected function noUnprocessedQueueEntriesForPageWithConfigurationHashExist($uid, $configurationHash)
    {
        // Count queue rows for this page and configuration hash that have not been executed yet (exec_time = 0).
        $unprocessedCount = $this->queryBuilder
            ->count('*')
            ->from($this->tableName)
            ->where(
                $this->queryBuilder->expr()->eq('page_id', intval($uid)),
                $this->queryBuilder->expr()->eq('configuration_hash', $this->queryBuilder->createNamedParameter($configurationHash)),
                $this->queryBuilder->expr()->eq('exec_time', 0)
            )
            ->execute()
            ->fetchColumn();

        // Return the documented boolean: true only when NO unprocessed entry exists.
        // Returning the raw count would be truthy exactly when entries DO exist, which inverts the meaning.
        return (int)$unprocessedCount === 0;
    }
439
440
    /**
441
     * Creates a list of URLs from input array (and submits them to queue if asked for)
442
     * See Web > Info module script + "indexed_search"'s crawler hook-client using this!
443
     *
444
     * @param    array        Information about URLs from pageRow to crawl.
445
     * @param    array        Page row
446
     * @param    integer        Unix time to schedule indexing to, typically time()
447
     * @param    integer        Number of requests per minute (creates the interleave between requests)
448
     * @param    boolean        If set, submits the URLs to queue
449
     * @param    boolean        If set (and $submitCrawlUrls is false), fills $downloadUrls with entries
450
     * @param    array        Array which is passed by reference and contains an id per URL to ensure we do not crawl duplicates
451
     * @param    array        Array which will be filled with URLS for download if flag is set.
452
     * @param    array        Array of processing instructions
453
     * @return    string        List of URLs (meant for display in backend module)
454
     *
455
     */
456 2
    public function urlListFromUrlArray(
        array $vv,
        array $pageRow,
        $scheduledTime,
        $reqMinute,
        $submitCrawlUrls,
        $downloadCrawlUrls,
        array &$duplicateTrack,
        array &$downloadUrls,
        array $incomingProcInstructions
    ) {
        $urlList = '';

        // Read the sub configuration values once; missing keys fall back to empty values
        // instead of raising undefined-index notices.
        $baseUrl = $vv['subCfg']['baseUrl'] ?? '';
        $userGroups = $vv['subCfg']['userGroups'] ?? '';
        $procInstrFilter = $vv['subCfg']['procInstrFilter'] ?? '';

        // realurl support (thanks to Ingo Renner)
        // The decision is taken once so that $urlObj is guaranteed to be set wherever it is used below.
        $useRealUrl = ExtensionManagementUtility::isLoaded('realurl') && !empty($vv['subCfg']['realurl']);
        $urlObj = null;
        if ($useRealUrl) {
            /** @var tx_realurl $urlObj */
            $urlObj = GeneralUtility::makeInstance('tx_realurl');

            if (!empty($baseUrl)) {
                $urlParts = parse_url($baseUrl);
                $host = strtolower($urlParts['host']);
                $urlObj->host = $host;

                // First pass, finding configuration OR pointer string:
                $urlObj->extConf = isset($GLOBALS['TYPO3_CONF_VARS']['EXTCONF']['realurl'][$urlObj->host]) ? $GLOBALS['TYPO3_CONF_VARS']['EXTCONF']['realurl'][$urlObj->host] : $GLOBALS['TYPO3_CONF_VARS']['EXTCONF']['realurl']['_DEFAULT'];

                // If it turned out to be a string pointer, then look up the real config:
                if (is_string($urlObj->extConf)) {
                    $urlObj->extConf = is_array($GLOBALS['TYPO3_CONF_VARS']['EXTCONF']['realurl'][$urlObj->extConf]) ? $GLOBALS['TYPO3_CONF_VARS']['EXTCONF']['realurl'][$urlObj->extConf] : $GLOBALS['TYPO3_CONF_VARS']['EXTCONF']['realurl']['_DEFAULT'];
                }
            }

            if (!$GLOBALS['TSFE']->sys_page) {
                $GLOBALS['TSFE']->sys_page = GeneralUtility::makeInstance('TYPO3\CMS\Frontend\Page\PageRepository');
            }

            if (!$GLOBALS['TSFE']->tmpl->rootLine[0]['uid']) {
                $GLOBALS['TSFE']->tmpl->rootLine[0]['uid'] = $urlObj->extConf['pagePath']['rootpage_id'];
            }
        }

        if (!is_array($vv['URLs'] ?? null)) {
            return 'ERROR - no URL generated';
        }

        $configurationHash = $this->getConfigurationHash($vv);
        $skipInnerCheck = $this->noUnprocessedQueueEntriesForPageWithConfigurationHashExist($pageRow['uid'], $configurationHash);

        // Seconds between two scheduled requests. A non-positive rate disables the
        // interleave instead of triggering a division by zero.
        $secondsPerRequest = $reqMinute > 0 ? 60 / $reqMinute : 0;

        foreach ($vv['URLs'] as $urlQuery) {
            if (!$this->drawURLs_PIfilter($procInstrFilter, $incomingProcInstructions)) {
                continue;
            }

            // Calculate cHash:
            if (!empty($vv['subCfg']['cHash'])) {
                /* @var $cacheHash \TYPO3\CMS\Frontend\Page\CacheHashCalculator */
                $cacheHash = GeneralUtility::makeInstance('TYPO3\CMS\Frontend\Page\CacheHashCalculator');
                $urlQuery .= '&cHash=' . $cacheHash->generateForParameters($urlQuery);
            }

            // Create key by which to determine unique-ness:
            $uKey = $urlQuery . '|' . $userGroups . '|' . $baseUrl . '|' . $procInstrFilter;

            // realurl support (thanks to Ingo Renner)
            $urlQuery = 'index.php' . $urlQuery;
            if ($useRealUrl) {
                $params = [
                    'LD' => [
                        'totalURL' => $urlQuery
                    ],
                    'TCEmainHook' => true
                ];
                $urlObj->encodeSpURL($params);
                $urlQuery = $params['LD']['totalURL'];
            }

            // Scheduled time: spread the entries according to the request rate, rounded down to the full minute
            $schTime = $scheduledTime + round(count($duplicateTrack) * $secondsPerRequest);
            $schTime = (int)(floor($schTime / 60) * 60);

            if (isset($duplicateTrack[$uKey])) {
                // if the url key is registered just display it and do not resubmit it
                // (appended, not assigned, so earlier URLs of this run stay in the list)
                $urlList .= '<em><span class="typo3-dimmed">' . htmlspecialchars($urlQuery) . '</span></em><br/>';
            } else {
                $formattedTime = date('d.m.y H:i', $schTime);
                $urlList .= '[' . $formattedTime . '] ' . htmlspecialchars($urlQuery);
                $this->urlList[] = '[' . $formattedTime . '] ' . $urlQuery;

                $theUrl = ($baseUrl ? $baseUrl : GeneralUtility::getIndpEnv('TYPO3_SITE_URL')) . $urlQuery;

                // Submit for crawling!
                if ($submitCrawlUrls) {
                    // NOTE(review): the un-interleaved $scheduledTime is passed on here, not $schTime - confirm this is intended.
                    $added = $this->addUrl(
                        $pageRow['uid'],
                        $theUrl,
                        $vv['subCfg'],
                        $scheduledTime,
                        $configurationHash,
                        $skipInnerCheck
                    );
                    if ($added === false) {
                        $urlList .= ' (Url already existed)';
                    }
                } elseif ($downloadCrawlUrls) {
                    $downloadUrls[$theUrl] = $theUrl;
                }

                $urlList .= '<br />';
            }
            $duplicateTrack[$uKey] = true;
        }

        return $urlList;
    }
569
570
    /**
571
     * Returns true if input processing instruction is among registered ones.
572
     *
573
     * @param string $piString PI to test
574
     * @param array $incomingProcInstructions Processing instructions
575
     * @return boolean
576
     */
577 5
    public function drawURLs_PIfilter($piString, array $incomingProcInstructions)
    {
        // Without any incoming processing instructions nothing is filtered out.
        if (empty($incomingProcInstructions)) {
            return true;
        }

        // Accept as soon as one incoming instruction is contained in the comma separated $piString.
        foreach ($incomingProcInstructions as $pi) {
            if (GeneralUtility::inList($piString, $pi)) {
                return true;
            }
        }

        // Return an explicit boolean as documented, rather than falling off the end with null.
        return false;
    }
589
590 2
    /**
     * Fetches the page TSconfig for a page and lets registered hooks alter it.
     *
     * When a mount point ($this->MP) is set, the TSconfig is read for the second
     * segment of the dash-separated MP value instead of for $id.
     *
     * @param mixed $id Page id; handed to the hooks as "pageId"
     * @return mixed Whatever BackendUtility::getPagesTSconfig() returned, possibly changed by hooks
     */
    public function getPageTSconfigForId($id)
    {
        $lookupPageId = $id;
        if ($this->MP) {
            list(, $lookupPageId) = explode('-', $this->MP);
        }
        $pageTSconfig = BackendUtility::getPagesTSconfig($lookupPageId);

        // Call a hook to alter configuration
        $alterHooks = $GLOBALS['TYPO3_CONF_VARS']['EXTCONF']['crawler']['getPageTSconfigForId'];
        if (!is_array($alterHooks)) {
            return $pageTSconfig;
        }

        // The TSconfig is passed by reference so that a hook can change it in place.
        $params = [
            'pageId' => $id,
            'pageTSConfig' => &$pageTSconfig
        ];
        foreach ($alterHooks as $userFunc) {
            GeneralUtility::callUserFunction($userFunc, $params, $this);
        }

        return $pageTSconfig;
    }
612
613
    /**
614
     * This method returns an array of configurations.
615
     * And no urls!
616
     *
617
     * @param integer $id Page ID
618
     * @param bool $forceSsl Use https
619
     * @return array
620
     */
621 2
    public function getUrlsForPageId($id, $forceSsl = false)
622
    {
623
624
        /**
625
         * Get configuration from tsConfig
626
         */
627
628
        // Get page TSconfig for page ID:
629 2
        $pageTSconfig = $this->getPageTSconfigForId($id);
630
631 2
        $res = [];
632
633 2
        if (is_array($pageTSconfig) && is_array($pageTSconfig['tx_crawler.']['crawlerCfg.'])) {
634 1
            $crawlerCfg = $pageTSconfig['tx_crawler.']['crawlerCfg.'];
635
636 1
            if (is_array($crawlerCfg['paramSets.'])) {
637 1
                foreach ($crawlerCfg['paramSets.'] as $key => $values) {
638 1
                    if (is_array($values)) {
639 1
                        $key = str_replace('.', '', $key);
640
                        // Sub configuration for a single configuration string:
641 1
                        $subCfg = (array)$crawlerCfg['paramSets.'][$key . '.'];
642 1
                        $subCfg['key'] = $key;
643
644 1
                        if (strcmp($subCfg['procInstrFilter'], '')) {
645 1
                            $subCfg['procInstrFilter'] = implode(',', GeneralUtility::trimExplode(',', $subCfg['procInstrFilter']));
646
                        }
647 1
                        $pidOnlyList = implode(',', GeneralUtility::trimExplode(',', $subCfg['pidsOnly'], true));
648
649
                        // process configuration if it is not page-specific or if the specific page is the current page:
650 1
                        if (!strcmp($subCfg['pidsOnly'], '') || GeneralUtility::inList($pidOnlyList, $id)) {
651
652
                                // add trailing slash if not present
653 1
                            if (!empty($subCfg['baseUrl']) && substr($subCfg['baseUrl'], -1) != '/') {
654
                                $subCfg['baseUrl'] .= '/';
655
                            }
656
657
                            // Explode, process etc.:
658 1
                            $res[$key] = [];
659 1
                            $res[$key]['subCfg'] = $subCfg;
660 1
                            $res[$key]['paramParsed'] = $this->parseParams($crawlerCfg['paramSets.'][$key]);
661 1
                            $res[$key]['paramExpanded'] = $this->expandParameters($res[$key]['paramParsed'], $id);
662 1
                            $res[$key]['origin'] = 'pagets';
663
664
                            // recognize MP value
665 1
                            if (!$this->MP) {
666 1
                                $res[$key]['URLs'] = $this->compileUrls($res[$key]['paramExpanded'], ['?id=' . $id]);
667
                            } else {
668
                                $res[$key]['URLs'] = $this->compileUrls($res[$key]['paramExpanded'], ['?id=' . $id . '&MP=' . $this->MP]);
669
                            }
670
                        }
671
                    }
672
                }
673
            }
674
        }
675
676
        /**
677
         * Get configuration from tx_crawler_configuration records
678
         */
679
680
        // get records along the rootline
681 2
        $rootLine = BackendUtility::BEgetRootLine($id);
682
683
684 2
        $queryBuilder = GeneralUtility::makeInstance(ConnectionPool::class)->getQueryBuilderForTable('tx_crawler_configuration');
685 2
        $queryBuilder->getRestrictions()->removeAll()->add(GeneralUtility::makeInstance(DeletedRestriction::class));
686
687 2
        foreach ($rootLine as $page) {
688
            $configurationRecordsForCurrentPage = $queryBuilder
689 2
                ->select('*')
690 2
                ->from('tx_crawler_configuration')
691 2
                ->where(
692 2
                    $queryBuilder->expr()->eq('pid', $page['uid']),
693 2
                    substr(BackendUtility::BEenableFields('tx_crawler_configuration'), 4) . BackendUtility::deleteClause('tx_crawler_configuration')
0 ignored issues
show
Deprecated Code introduced by
The method TYPO3\CMS\Backend\Utilit...Utility::deleteClause() has been deprecated with message: since TYPO3 v9, will be removed in TYPO3 v10.0, the DeletedRestriction functionality should be used instead.

This method has been deprecated. The supplier of the class has supplied an explanatory message.

The explanatory message should give you some clue as to whether and when the method will be removed from the class and what other method or class to use instead.

Loading history...
694
                )
695 2
                ->execute()
696 2
                ->fetchAll();
697
698 2
            if (is_array($configurationRecordsForCurrentPage)) {
699 2
                foreach ($configurationRecordsForCurrentPage as $configurationRecord) {
700
701
                        // check access to the configuration record
702 1
                    if (empty($configurationRecord['begroups']) || $GLOBALS['BE_USER']->isAdmin() || $this->hasGroupAccess($GLOBALS['BE_USER']->user['usergroup_cached_list'], $configurationRecord['begroups'])) {
703 1
                        $pidOnlyList = implode(',', GeneralUtility::trimExplode(',', $configurationRecord['pidsonly'], true));
704
705
                        // process configuration if it is not page-specific or if the specific page is the current page:
706 1
                        if (!strcmp($configurationRecord['pidsonly'], '') || GeneralUtility::inList($pidOnlyList, $id)) {
707 1
                            $key = $configurationRecord['name'];
708
709
                            // don't overwrite previously defined paramSets
710 1
                            if (!isset($res[$key])) {
711
712
                                    /* @var $TSparserObject \TYPO3\CMS\Core\TypoScript\Parser\TypoScriptParser */
713 1
                                $TSparserObject = GeneralUtility::makeInstance(TypoScriptParser::class);
714 1
                                $TSparserObject->parse($configurationRecord['processing_instruction_parameters_ts']);
715
716 1
                                $isCrawlingProtocolHttps = $this->isCrawlingProtocolHttps($configurationRecord['force_ssl'], $forceSsl);
717
718
                                $subCfg = [
719 1
                                    'procInstrFilter' => $configurationRecord['processing_instruction_filter'],
720 1
                                    'procInstrParams.' => $TSparserObject->setup,
721 1
                                    'baseUrl' => $this->getBaseUrlForConfigurationRecord(
722 1
                                        $configurationRecord['base_url'],
723 1
                                        $configurationRecord['sys_domain_base_url'],
724 1
                                        $isCrawlingProtocolHttps
725
                                    ),
726 1
                                    'realurl' => $configurationRecord['realurl'],
727 1
                                    'cHash' => $configurationRecord['chash'],
728 1
                                    'userGroups' => $configurationRecord['fegroups'],
729 1
                                    'exclude' => $configurationRecord['exclude'],
730 1
                                    'rootTemplatePid' => (int) $configurationRecord['root_template_pid'],
731 1
                                    'key' => $key
732
                                ];
733
734
                                // add trailing slash if not present
735 1
                                if (!empty($subCfg['baseUrl']) && substr($subCfg['baseUrl'], -1) != '/') {
736
                                    $subCfg['baseUrl'] .= '/';
737
                                }
738 1
                                if (!in_array($id, $this->expandExcludeString($subCfg['exclude']))) {
739 1
                                    $res[$key] = [];
740 1
                                    $res[$key]['subCfg'] = $subCfg;
741 1
                                    $res[$key]['paramParsed'] = $this->parseParams($configurationRecord['configuration']);
742 1
                                    $res[$key]['paramExpanded'] = $this->expandParameters($res[$key]['paramParsed'], $id);
743 1
                                    $res[$key]['URLs'] = $this->compileUrls($res[$key]['paramExpanded'], ['?id=' . $id]);
744 1
                                    $res[$key]['origin'] = 'tx_crawler_configuration_' . $configurationRecord['uid'];
745
                                }
746
                            }
747
                        }
748
                    }
749
                }
750
            }
751
        }
752
753 2
        if (is_array($GLOBALS['TYPO3_CONF_VARS']['EXTCONF']['crawler']['processUrls'])) {
754
            foreach ($GLOBALS['TYPO3_CONF_VARS']['EXTCONF']['crawler']['processUrls'] as $func) {
755
                $params = [
756
                    'res' => &$res,
757
                ];
758
                GeneralUtility::callUserFunction($func, $params, $this);
759
            }
760
        }
761
762 2
        return $res;
763
    }
764
765
    /**
     * Resolves the base URL to use for a crawler configuration record.
     *
     * If a sys_domain record is referenced and has a non-empty domain name, the URL is built
     * from that domain name and the requested scheme. In every other case the given $baseUrl
     * is returned unchanged.
     *
     * @param string $baseUrl Fallback base URL
     * @param integer $sysDomainUid UID of the sys_domain record; values <= 0 mean "none"
     * @param bool $ssl Boolean false selects "http", anything else selects "https"
     * @return string
     */
    protected function getBaseUrlForConfigurationRecord($baseUrl, $sysDomainUid, $ssl = false)
    {
        $sysDomainUid = intval($sysDomainUid);
        if ($sysDomainUid <= 0) {
            return $baseUrl;
        }

        // Use a dedicated builder for sys_domain: the shared $this->queryBuilder keeps FROM parts
        // and limits of earlier queue queries, and would carry "sys_domain" into later ones.
        $queryBuilder = $this->getQueryBuilder('sys_domain');
        $row = $queryBuilder
            ->select('*')
            ->from('sys_domain')
            ->where(
                $queryBuilder->expr()->eq('uid', $queryBuilder->createNamedParameter($sysDomainUid, \PDO::PARAM_INT))
            )
            ->execute()
            ->fetch();

        // fetch() does not return an array when no matching record exists.
        if (is_array($row) && isset($row['domainName']) && $row['domainName'] != '') {
            $urlScheme = ($ssl === false) ? 'http' : 'https';
            return $urlScheme . '://' . $row['domainName'];
        }

        return $baseUrl;
    }
794
795
    /**
     * Collects the names of the crawler configurations found for a page branch.
     *
     * Two sources are combined:
     * - the keys of "tx_crawler.crawlerCfg.paramSets." in the Page TSconfig of $rootid
     *   (only entries whose value is an array, trailing dot removed)
     * - the "name" of all tx_crawler_configuration records stored on a page of the root line
     *   of $rootid or of the page tree below it (tree restricted by page permission 1)
     *
     * @param int $rootid UID of the page the branch starts at
     * @param int $depth Depth handed to PageTreeView::getTree()
     * @return array List of configuration names; duplicates are not removed
     *
     * TODO: Write Functional Tests
     */
    public function getConfigurationsForBranch($rootid, $depth)
    {
        $configurationsForBranch = [];

        // "??" keeps a missing tx_crawler./crawlerCfg. section from raising undefined-index errors.
        $pageTSconfig = $this->getPageTSconfigForId($rootid);
        $sets = is_array($pageTSconfig)
            ? ($pageTSconfig['tx_crawler.']['crawlerCfg.']['paramSets.'] ?? null)
            : null;
        if (is_array($sets)) {
            foreach ($sets as $key => $value) {
                if (!is_array($value)) {
                    continue;
                }
                $configurationsForBranch[] = substr($key, -1) == '.' ? substr($key, 0, -1) : $key;
            }
        }

        // Pages to look at: the root line of the start page ...
        $pids = [];
        foreach (BackendUtility::BEgetRootLine($rootid) as $node) {
            $pids[] = $node['uid'];
        }

        // ... plus the sub tree below it.
        /* @var PageTreeView $tree */
        $tree = GeneralUtility::makeInstance(PageTreeView::class);
        $perms_clause = $GLOBALS['BE_USER']->getPagePermsClause(1);
        $tree->init('AND ' . $perms_clause);
        $tree->getTree($rootid, $depth, '');
        foreach ($tree->tree as $node) {
            $pids[] = $node['row']['uid'];
        }

        // An empty list would be rendered as "pid IN ()", which is invalid SQL.
        if (empty($pids)) {
            return $configurationsForBranch;
        }

        $queryBuilder = $this->getQueryBuilder('tx_crawler_configuration');
        $queryBuilder->getRestrictions()
            ->removeAll()
            ->add(GeneralUtility::makeInstance(DeletedRestriction::class))
            ->add(GeneralUtility::makeInstance(StartTimeRestriction::class))
            ->add(GeneralUtility::makeInstance(EndTimeRestriction::class));

        $statement = $queryBuilder
            ->select('name')
            ->from('tx_crawler_configuration')
            ->where(
                $queryBuilder->expr()->in('pid', $queryBuilder->createNamedParameter($pids, Connection::PARAM_INT_ARRAY))
            )
            ->execute();

        while ($row = $statement->fetch()) {
            $configurationsForBranch[] = $row['name'];
        }

        return $configurationsForBranch;
    }
854
855
    /**
     * Creates a new query builder on the connection that is responsible for the given table.
     *
     * @param string $table Name of the table the query is meant for
     * @return \TYPO3\CMS\Core\Database\Query\QueryBuilder
     */
    private function getQueryBuilder(string $table)
    {
        $connectionPool = GeneralUtility::makeInstance(ConnectionPool::class);
        $connection = $connectionPool->getConnectionForTable($table);

        return $connection->createQueryBuilder();
    }
867
868
    /**
     * Tells whether a user with the given groups may access an item.
     * (The group list of the current frontend user is e.g. available in $GLOBALS['TSFE']->gr_list.)
     *
     * @see \TYPO3\CMS\Frontend\Page\PageRepository::getMultipleGroupsWhereClause()
     * @param  string $groupList    Comma-separated list of group UIDs of the user
     * @param  string $accessList   Comma-separated list of group UIDs allowed to access the item
     * @return bool                 TRUE if the access list is empty or contains at least one of the user's group UIDs
     */
    public function hasGroupAccess($groupList, $accessList)
    {
        // No access list: nothing to restrict.
        if (empty($accessList)) {
            return true;
        }

        $accessGranted = false;
        $userGroupUids = GeneralUtility::intExplode(',', $groupList);
        foreach ($userGroupUids as $userGroupUid) {
            if (GeneralUtility::inList($accessList, $userGroupUid)) {
                $accessGranted = true;
                break;
            }
        }

        return $accessGranted;
    }
889
890
    /**
     * Parses a query string ("a=1&b=2") into an array of parameter name => value pairs.
     *
     * Names and values are raw-URL-decoded. Parts with an empty name are skipped, and a part
     * without "=" gets an empty string as its value.
     *
     * @param string $inputQuery Input query string
     * @return array
     */
    public function parseParams($inputQuery)
    {
        $paramKeyValues = [];

        foreach (explode('&', (string)$inputQuery) as $paramAndValue) {
            // Pad to two elements so a part without "=" still yields a defined (empty) value.
            list($name, $value) = array_pad(explode('=', $paramAndValue, 2), 2, '');
            if ($name !== '') {
                $paramKeyValues[rawurldecode($name)] = rawurldecode($value);
            }
        }

        return $paramKeyValues;
    }
912
913
    /**
     * Expands the parameter configuration into the individual values of each parameter.
     *
     * Syntax of a value:
     * - A value wrapped in [...] is expanded as described below; any other value is taken literally.
     * - The content of the brackets is split by "|"; each part is processed on its own and the
     *   results are added together (duplicates are removed afterwards).
     * - For each part:
     *         - "[int]-[int]" = Integer range, expanded to all values from low to high, both included
     *           (at most 1000 values). Example "1-34" or "-40--30"
     *         - "_TABLE:[TCA table name];[_PID:...];[_PIDFIELD:...];[_FIELD:...];[_WHERE:...];[_ADDTABLE:...];[_ENABLELANG:1]"
     *           = Values of one field (_FIELD, default "uid") of the non-deleted records of the table
     *           whose _PIDFIELD (default "pid") equals _PID (default: current page).
     *           _ENABLELANG:1 restricts the result to records whose transOrigPointerField is <= 0.
     *           Example "_TABLE:tt_content; _PID:123"
     *         - Default: Literal value
     * - After each part the hooks registered in
     *   $GLOBALS['TYPO3_CONF_VARS']['SC_OPTIONS']['crawler/class.tx_crawler_lib.php']['expandParameters'] are called.
     *
     * @param array $paramArray Array with key (GET var name) and values (value of GET var which is configuration for expansion)
     * @param integer $pid Current page ID
     * @return array Array with key (GET var name) and the list of expanded values
     *
     * TODO: Write Functional Tests
     */
    public function expandParameters($paramArray, $pid)
    {
        foreach ($paramArray as $p => $v) {
            $v = trim($v);

            // Not wrapped in square brackets: the literal value is the only value.
            if (substr($v, 0, 1) !== '[' || substr($v, -1) !== ']') {
                $paramArray[$p] = [$v];
                continue;
            }

            $paramArray[$p] = [];
            foreach (explode('|', substr($v, 1, -1)) as $pV) {
                // Look for integer range: (fx. 1-34 or -40--30 // reads minus 40 to minus 30)
                if (preg_match('/^(-?[0-9]+)\s*-\s*(-?[0-9]+)$/', trim($pV), $reg)) {
                    // Swap if first is larger than last:
                    if ($reg[1] > $reg[2]) {
                        $temp = $reg[2];
                        $reg[2] = $reg[1];
                        $reg[1] = $temp;
                    }

                    // Traverse range, add values; the brake limits the size of the range.
                    $runAwayBrake = 1000;
                    for ($a = $reg[1]; $a <= $reg[2]; $a++) {
                        $paramArray[$p][] = $a;
                        $runAwayBrake--;
                        if ($runAwayBrake <= 0) {
                            break;
                        }
                    }
                } elseif (substr(trim($pV), 0, 7) == '_TABLE:') {
                    $paramArray[$p] = array_merge($paramArray[$p], $this->expandTableLookup($pV, $pid));
                } else {
                    // Just add value:
                    $paramArray[$p][] = $pV;
                }

                // Hook for processing own expandParameters place holder
                $hooks = $GLOBALS['TYPO3_CONF_VARS']['SC_OPTIONS']['crawler/class.tx_crawler_lib.php']['expandParameters'] ?? null;
                if (is_array($hooks)) {
                    $_params = [
                        'pObj' => &$this,
                        'paramArray' => &$paramArray,
                        'currentKey' => $p,
                        'currentValue' => $pV,
                        'pid' => $pid
                    ];
                    foreach ($hooks as $_funcRef) {
                        GeneralUtility::callUserFunction($_funcRef, $_params, $this);
                    }
                }
            }

            // Make unique set of values and sort array by key:
            $paramArray[$p] = array_unique($paramArray[$p]);
            ksort($paramArray);
        }

        return $paramArray;
    }

    /**
     * Resolves one "_TABLE:..." configuration part to the values of the configured field.
     *
     * @param string $configurationPart Configuration part, e.g. "_TABLE:tt_content; _PID:123"
     * @param integer $pid Current page ID, used when the part has no _PID
     * @return array Distinct field values; empty when the table or the field is not known to TCA
     */
    private function expandTableLookup($configurationPart, $pid)
    {
        // Parse parameters:
        $subpartParams = [];
        foreach (GeneralUtility::trimExplode(';', $configurationPart) as $subpart) {
            list($pKey, $pVal) = array_pad(GeneralUtility::trimExplode(':', $subpart), 2, null);
            $subpartParams[$pKey] = $pVal;
        }

        $table = $subpartParams['_TABLE'] ?? '';
        $tca = $GLOBALS['TCA'] ?? [];
        if (!isset($tca[$table])) {
            return [];
        }

        $fieldName = !empty($subpartParams['_FIELD']) ? $subpartParams['_FIELD'] : 'uid';
        if ($fieldName !== 'uid' && empty($tca[$table]['columns'][$fieldName])) {
            return [];
        }

        $lookUpPid = isset($subpartParams['_PID']) ? intval($subpartParams['_PID']) : $pid;
        $pidField = isset($subpartParams['_PIDFIELD']) ? trim($subpartParams['_PIDFIELD']) : 'pid';
        $where = isset($subpartParams['_WHERE']) ? $subpartParams['_WHERE'] : '';
        $addTable = isset($subpartParams['_ADDTABLE']) ? $subpartParams['_ADDTABLE'] : '';

        $queryBuilder = $this->getQueryBuilder($table);
        $queryBuilder->getRestrictions()
            ->removeAll()
            ->add(GeneralUtility::makeInstance(DeletedRestriction::class));

        // expr()->eq()/lte() quote the field name themselves; quoting it beforehand as well
        // would produce a doubly quoted, unknown column name.
        $queryBuilder
            ->select($fieldName)
            ->from($table)
            ->where(
                $queryBuilder->expr()->eq($pidField, $queryBuilder->createNamedParameter($lookUpPid, \PDO::PARAM_INT))
            );

        if (!empty($addTable)) {
            // TODO: Check if this works as intended! Only applied when _ADDTABLE is configured,
            // because add('from', ...) overwrites the FROM part set above.
            $queryBuilder->add('from', $addTable);
        }
        if (!empty($where)) {
            $queryBuilder->andWhere($where);
        }

        $transOrigPointerField = $tca[$table]['ctrl']['transOrigPointerField'] ?? '';
        if (!empty($subpartParams['_ENABLELANG']) && $transOrigPointerField) {
            $queryBuilder->andWhere(
                $queryBuilder->expr()->lte($transOrigPointerField, 0)
            );
        }

        // Index by the field value so that the keys are the (distinct) values to return.
        $values = [];
        $statement = $queryBuilder->execute();
        while ($row = $statement->fetch()) {
            $values[$row[$fieldName]] = true;
        }

        return array_keys($values);
    }
1052
1053
    /**
     * Compiles URLs from a parameter array (output of expandParameters()).
     *
     * Every URL is combined with every value of the first parameter; the method then calls
     * itself for the remaining parameters. The number of URLs is therefore the product of the
     * number of values per parameter, unless the "maxCompileUrls" extension setting
     * (default 10000) stops the expansion of a parameter earlier. Empty values add nothing to the URL.
     *
     * @param array $paramArray Output of expandParameters(): Array with keys (GET var names) and for each an array of values
     * @param array $urls URLs accumulated in this array (for recursion)
     * @return array
     */
    public function compileUrls($paramArray, $urls = [])
    {
        if (!count($paramArray) || !is_array($urls)) {
            return $urls;
        }

        // The limit does not change while looping, so resolve it once.
        $maxUrls = MathUtility::forceIntegerInRange($this->extensionSettings['maxCompileUrls'] ?? 0, 1, 1000000000, 10000);

        // shift first off stack:
        reset($paramArray);
        $varName = key($paramArray);
        $valueSet = array_shift($paramArray);

        // Traverse value set:
        $newUrls = [];
        foreach ($urls as $url) {
            foreach ($valueSet as $val) {
                $newUrls[] = $url . (strcmp($val, '') ? '&' . rawurlencode($varName) . '=' . rawurlencode($val) : '');

                // Leave both loops: a plain "break" would let every further base URL add another entry.
                if (count($newUrls) > $maxUrls) {
                    break 2;
                }
            }
        }

        return $this->compileUrls($paramArray, $newUrls);
    }
1086
1087
    /************************************
1088
     *
1089
     * Crawler log
1090
     *
1091
     ************************************/
1092
1093
    /**
     * Returns the records of the crawler queue for a page ID, or removes them.
     *
     * @param integer $id Page ID for which to look up log entries.
     * @param string $filter Filter: "pending" => entries with exec_time = 0, "finished" => entries with exec_time > 0, anything else => all entries
     * @param boolean $doFlush If TRUE, the entries are DELETED(!) instead of selected and an empty array is returned
     * @param boolean $doFullFlush If TRUE (together with $doFlush), the flush is not limited to the given page
     * @param integer $itemsPerPage Maximum number of entries to return, default is 10; 0 or less means no limit
     * @return array
     */
    public function getLogEntriesForPageId($id, $filter = '', $doFlush = false, $doFullFlush = false, $itemsPerPage = 10)
    {
        // A fresh builder per call: limits and conditions of an earlier call must not stick.
        $queryBuilder = $this->getQueryBuilder($this->tableName);
        $queryBuilder
            ->select('*')
            ->from($this->tableName)
            ->where(
                $queryBuilder->expr()->eq('page_id', $queryBuilder->createNamedParameter($id, \PDO::PARAM_INT))
            )
            ->orderBy('scheduled', 'DESC');

        $filterExpression = null;
        switch ($filter) {
            case 'pending':
                $filterExpression = $queryBuilder->expr()->eq('exec_time', 0);
                break;
            case 'finished':
                $filterExpression = $queryBuilder->expr()->gt('exec_time', 0);
                break;
        }

        $addWhere = '';
        if ($filterExpression !== null) {
            $queryBuilder->andWhere($filterExpression);
            // The flush condition is a plain SQL string, so the filter has to be joined with AND.
            $addWhere = ' AND ' . $filterExpression;
        }

        // FIXME: Write unit test that ensures that the right records are deleted.
        if ($doFlush) {
            $this->flushQueue(($doFullFlush ? '1=1' : ('page_id=' . intval($id))) . $addWhere);
            return [];
        }

        if ($itemsPerPage > 0) {
            $queryBuilder->setMaxResults((int)$itemsPerPage);
        }

        return $queryBuilder->execute()->fetchAll();
    }
1144
1145
    /**
     * Returns the records of the crawler queue for a set ID, or removes them.
     *
     * @param integer $set_id Set ID for which to look up log entries.
     * @param string $filter Filter: "pending" => entries with exec_time = 0, "finished" => entries with exec_time > 0, anything else => all entries
     * @param boolean $doFlush If TRUE, the entries are DELETED(!) instead of selected and an empty array is returned
     * @param boolean $doFullFlush If TRUE (together with $doFlush), the whole queue is flushed regardless of set and filter
     * @param integer $itemsPerPage Maximum number of entries to return, default is 10; 0 or less means no limit
     * @return array
     */
    public function getLogEntriesForSetId($set_id, $filter = '', $doFlush = false, $doFullFlush = false, $itemsPerPage = 10)
    {
        // A fresh builder per call: limits and conditions of an earlier call must not stick.
        $queryBuilder = $this->getQueryBuilder($this->tableName);
        $queryBuilder
            ->select('*')
            ->from($this->tableName)
            ->where(
                $queryBuilder->expr()->eq('set_id', $queryBuilder->createNamedParameter($set_id, \PDO::PARAM_INT))
            )
            ->orderBy('scheduled', 'DESC');

        // FIXME: Write Unit tests for Filters
        $filterExpression = null;
        switch ($filter) {
            case 'pending':
                $filterExpression = $queryBuilder->expr()->eq('exec_time', 0);
                break;
            case 'finished':
                $filterExpression = $queryBuilder->expr()->gt('exec_time', 0);
                break;
        }

        $addWhere = '';
        if ($filterExpression !== null) {
            $queryBuilder->andWhere($filterExpression);
            // The flush condition is a plain SQL string, so the filter has to be joined with AND.
            $addWhere = ' AND ' . $filterExpression;
        }

        // FIXME: Write unit test that ensures that the right records are deleted.
        if ($doFlush) {
            $this->flushQueue($doFullFlush ? '' : ('set_id=' . intval($set_id) . $addWhere));
            return [];
        }

        if ($itemsPerPage > 0) {
            $queryBuilder->setMaxResults((int)$itemsPerPage);
        }

        return $queryBuilder->execute()->fetchAll();
    }
1194
1195
    /**
     * Removes queue entries.
     *
     * If an observer is registered for the "queueEntryFlush" event, the event is posted once
     * per affected set_id (with the entries of that set) before the entries are deleted.
     *
     * @param string $where SQL condition selecting the entries to remove; an empty string removes all entries
     * @return void
     */
    protected function flushQueue($where = '')
    {
        $realWhere = strlen((string)$where) > 0 ? $where : '1=1';

        if (EventDispatcher::getInstance()->hasObserver('queueEntryFlush')) {
            // select() quotes its arguments as identifiers, so "DISTINCT set_id" cannot be passed
            // to it; grouping by set_id returns the same distinct list.
            $groups = $this->getQueryBuilder($this->tableName)
                ->select('set_id')
                ->from($this->tableName)
                ->where($realWhere)
                ->groupBy('set_id')
                ->execute()
                ->fetchAll();

            if (is_array($groups)) {
                foreach ($groups as $group) {
                    // One builder per statement, so no parts of a previous query are carried over.
                    $subSetQuery = $this->getQueryBuilder($this->tableName);
                    // NOTE(review): other methods address queue rows by "qid"; confirm that the
                    // queue table really has the "uid" column selected here.
                    $subSet = $subSetQuery
                        ->select('uid', 'set_id')
                        ->from($this->tableName)
                        ->where(
                            $realWhere,
                            $subSetQuery->expr()->eq('set_id', intval($group['set_id']))
                        )
                        ->execute()
                        ->fetchAll();
                    EventDispatcher::getInstance()->post('queueEntryFlush', $group['set_id'], $subSet);
                }
            }
        }

        $this->getQueryBuilder($this->tableName)
            ->delete($this->tableName)
            ->where($realWhere)
            ->execute();
    }
1236
1237
    /**
     * Adds a call back entry to the queue (typically called from hooks, see indexed search class "class.crawler.php").
     *
     * @param integer $setId Set ID
     * @param array $params Parameters to pass to the call back function; anything but an array is treated as empty
     * @param string $callBack Call back object reference, eg. 'EXT:indexed_search/class.crawler.php:&tx_indexedsearch_crawler'
     * @param integer $page_id Page ID to attach it to
     * @param integer $schedule Time at which to activate; 0 means the current time
     * @return void
     */
    public function addQueueEntry_callBack($setId, $params, $callBack, $page_id = 0, $schedule = 0)
    {
        $callBackParameters = is_array($params) ? $params : [];
        $callBackParameters['_CALLBACKOBJ'] = $callBack;

        $queueRecord = [
            'page_id' => intval($page_id),
            'parameters' => serialize($callBackParameters),
            'scheduled' => intval($schedule) ?: $this->getCurrentTime(),
            'exec_time' => 0,
            'set_id' => intval($setId),
            'result_data' => '',
        ];

        $connection = GeneralUtility::makeInstance(ConnectionPool::class)->getConnectionForTable('tx_crawler_queue');
        $connection->insert('tx_crawler_queue', $queueRecord);
    }
1267
1268
    /************************************
1269
     *
1270
     * URL setting
1271
     *
1272
     ************************************/
1273
1274
    /**
     * Puts a URL into the crawler queue.
     *
     * In "register internally only" mode the queue record is only collected in $this->queueEntries.
     * Otherwise it is inserted unless an equal entry already exists, and an "urlAddedToQueue" or
     * "duplicateUrlInQueue" event is posted.
     *
     * @param integer $id Page ID
     * @param string $url Complete URL
     * @param array $subCfg Sub configuration array (from TS config)
     * @param integer $tstamp Scheduled-time
     * @param string $configurationHash (optional) configuration hash
     * @param bool $skipInnerDuplicationCheck (optional) skip inner duplication check
     * @return bool TRUE if a new record was inserted into the queue table
     */
    public function addUrl(
        $id,
        $url,
        array $subCfg,
        $tstamp,
        $configurationHash = '',
        $skipInnerDuplicationCheck = false
    ) {
        // Creating parameters (the order of the keys matters for the hash below):
        $parameters = ['url' => $url];

        // fe user group simulation:
        $feUserGroups = array_unique(GeneralUtility::intExplode(',', $subCfg['userGroups'], true));
        $feUserGroupList = implode(',', $feUserGroups);
        if ($feUserGroupList) {
            $parameters['feUserGroupList'] = $feUserGroupList;
        }

        // Setting processing instructions
        $parameters['procInstructions'] = GeneralUtility::trimExplode(',', $subCfg['procInstrFilter']);
        if (is_array($subCfg['procInstrParams.'])) {
            $parameters['procInstrParams'] = $subCfg['procInstrParams.'];
        }

        // Possible TypoScript Template Parents
        $parameters['rootTemplatePid'] = $subCfg['rootTemplatePid'];

        // Compile value array:
        $serializedParameters = serialize($parameters);
        $fieldArray = [
            'page_id' => intval($id),
            'parameters' => $serializedParameters,
            'parameters_hash' => GeneralUtility::shortMD5($serializedParameters),
            'configuration_hash' => $configurationHash,
            'scheduled' => $tstamp,
            'exec_time' => 0,
            'set_id' => intval($this->setID),
            'result_data' => '',
            'configuration' => $subCfg['key'],
        ];

        if ($this->registerQueueEntriesInternallyOnly) {
            // the entries will only be registered and not stored to the database
            $this->queueEntries[] = $fieldArray;
            return false;
        }

        // check if there is already an equal entry
        $duplicates = [];
        if (!$skipInnerDuplicationCheck) {
            $duplicates = $this->getDuplicateRowsIfExist($tstamp, $fieldArray);
        }

        if (count($duplicates) != 0) {
            EventDispatcher::getInstance()->post(
                'duplicateUrlInQueue',
                $this->setID,
                ['rows' => $duplicates, 'fieldArray' => $fieldArray]
            );
            return false;
        }

        $queueConnection = GeneralUtility::makeInstance(ConnectionPool::class)->getConnectionForTable('tx_crawler_queue');
        $queueConnection->insert('tx_crawler_queue', $fieldArray);
        $uid = $queueConnection->lastInsertId('tx_crawler_queue', 'qid');
        EventDispatcher::getInstance()->post(
            'urlAddedToQueue',
            $this->setID,
            ['uid' => $uid, 'fieldArray' => $fieldArray]
        );

        return true;
    }
1356
1357
    /**
     * Finds unprocessed queue entries that duplicate the given entry (same page and same parameters hash).
     *
     * If the timestamp is not in the future, every unprocessed entry scheduled up to now counts
     * (with the "enableTimeslot" setting also entries scheduled within 100 seconds around now).
     * If the timestamp is in the future, only entries with exactly the same schedule time count.
     *
     * @param int $tstamp Schedule time of the entry to add
     * @param array $fieldArray Queue record to add; "page_id" and "parameters_hash" are used
     *
     * @return array List of the qid values of the duplicates
     *
     * TODO: Write Functional Tests
     */
    protected function getDuplicateRowsIfExist($tstamp, $fieldArray)
    {
        $tstamp = (int)$tstamp;
        $currentTime = $this->getCurrentTime();

        // Own builder, so no parts of queries built on the shared instance end up in this one.
        $queryBuilder = $this->getQueryBuilder('tx_crawler_queue');
        $queryBuilder
            ->select('qid')
            ->from('tx_crawler_queue');

        if ($tstamp > $currentTime) {
            // entry with a timestamp in the future need to have the same schedule time
            $queryBuilder->where(
                $queryBuilder->expr()->eq('scheduled', $tstamp)
            );
        } elseif (!empty($this->extensionSettings['enableTimeslot'])) {
            // entry is scheduled with "now": accept a slot of +/- 100 seconds as well
            $timeBegin = $currentTime - 100;
            $timeEnd = $currentTime + 100;
            $queryBuilder
                ->where(
                    'scheduled BETWEEN ' . $timeBegin . ' AND ' . $timeEnd . ''
                )
                ->orWhere(
                    $queryBuilder->expr()->lte('scheduled', $currentTime)
                );
        } else {
            $queryBuilder->where(
                $queryBuilder->expr()->lte('scheduled', $currentTime)
            );
        }

        // Only entries that are still unprocessed are duplicates: not executed yet and not
        // assigned to a process. (The previous "!= 0" checks selected the processed entries.)
        // NOTE(review): process_id is compared with 0 as before; confirm the column type.
        $statement = $queryBuilder
            ->andWhere('exec_time = 0')
            ->andWhere('process_id = 0')
            ->andWhere($queryBuilder->expr()->eq('page_id', $queryBuilder->createNamedParameter($fieldArray['page_id'], \PDO::PARAM_INT)))
            ->andWhere($queryBuilder->expr()->eq('parameters_hash', $queryBuilder->createNamedParameter($fieldArray['parameters_hash'], \PDO::PARAM_STR)))
            ->execute();

        $rows = [];
        while ($row = $statement->fetch()) {
            $rows[] = $row['qid'];
        }

        return $rows;
    }
1416
1417
    /**
     * Provides "now" as a Unix timestamp.
     *
     * Wraps time() in a method of its own so that callers inside this class
     * obtain the current time from a single place.
     *
     * @return int Current Unix timestamp
     */
    public function getCurrentTime()
    {
        $now = time();

        return $now;
    }
1426
1427
    /************************************
1428
     *
1429
     * URL reading
1430
     *
1431
     ************************************/
1432
1433
    /**
     * Read URL for single queue entry
     *
     * Loads the queue record, stamps exec_time on it to lock it, executes the
     * request (or callback) through readUrl_exec() and finally stores the
     * serialized result in result_data.
     *
     * @param integer $queueId
     * @param boolean $force If set, will process even if exec_time has been set!
     * @return integer|null 0 or self::CLI_STATUS_POLLABLE_PROCESSED; null when no matching queue record was found
     */
    public function readUrl($queueId, $force = false)
    {
        $ret = 0;
        if ($this->debugMode) {
            $this->getLogger()->log(
                LogLevel::DEBUG,
                'crawler-readurl start ' . microtime(true)
            );
        }

        // Get entry:
        $this->queryBuilder
            ->select('*')
            ->from('tx_crawler_queue')
            ->where(
                $this->queryBuilder->expr()->eq('qid', $this->queryBuilder->createNamedParameter($queueId, \PDO::PARAM_INT))
            );
        if (!$force) {
            // Only pick up entries that are not executed yet but already assigned to a process.
            $this->queryBuilder
                ->andWhere('exec_time = 0')
                ->andWhere('process_scheduled > 0');
        }
        $queueRec = $this->queryBuilder->execute()->fetch();

        if (!is_array($queueRec)) {
            return;
        }

        // NOTE(review): unserialize() on a database column; make sure nothing untrusted can write there.
        $parameters = unserialize($queueRec['parameters']);
        if (is_array($parameters) && !empty($parameters['rootTemplatePid'])) {
            $this->initTSFE((int)$parameters['rootTemplatePid']);
        } else {
            // GeneralUtility::sysLog() is deprecated since TYPO3 v9, so the logger is used instead.
            $this->getLogger()->log(
                LogLevel::WARNING,
                'Page with (' . $queueRec['page_id'] . ') could not be crawled, please check your crawler configuration. Perhaps no Root Template Pid is set'
            );
        }

        SignalSlotUtility::emitSignal(
            __CLASS__,
            SignalSlotUtility::SIGNNAL_QUEUEITEM_PREPROCESS,
            [$queueId, &$queueRec]
        );

        // Set exec_time to lock record:
        $field_array = ['exec_time' => $this->getCurrentTime()];

        if (isset($this->processID)) {
            // If multiprocessing is used we need to store the id of the process which has handled this entry
            $field_array['process_id_completed'] = $this->processID;
        }

        $queueConnection = GeneralUtility::makeInstance(ConnectionPool::class)->getConnectionForTable('tx_crawler_queue');
        // The lock has to target the record that was just loaded (previously an undefined
        // variable was used here, so the update never matched the record).
        $queueConnection->update(
            'tx_crawler_queue',
            $field_array,
            ['qid' => (int)$queueId]
        );

        $result = $this->readUrl_exec($queueRec);

        // readUrl_exec() may return a plain string ('ERROR') or false instead of an array,
        // so only read "content" when there is an array to read it from.
        $resultData = (is_array($result) && isset($result['content']))
            ? unserialize($result['content'])
            : false;

        // atm there's no need to point to specific pollable extensions
        $hasProcInstructions = is_array($resultData)
            && isset($resultData['parameters']['procInstructions'])
            && is_array($resultData['parameters']['procInstructions']);

        if ($hasProcInstructions
            && isset($GLOBALS['TYPO3_CONF_VARS']['EXTCONF']['crawler']['pollSuccess'])
            && is_array($GLOBALS['TYPO3_CONF_VARS']['EXTCONF']['crawler']['pollSuccess'])
        ) {
            foreach ($GLOBALS['TYPO3_CONF_VARS']['EXTCONF']['crawler']['pollSuccess'] as $pollable) {
                // only check the success value if the instruction is running
                // it is important to name the pollSuccess key same as the procInstructions key
                if (in_array($pollable, $resultData['parameters']['procInstructions'])
                    && !empty($resultData['success'][$pollable])
                ) {
                    $ret |= self::CLI_STATUS_POLLABLE_PROCESSED;
                }
            }
        }

        // Set result in log which also denotes the end of the processing of this entry.
        $field_array = ['result_data' => serialize($result)];

        SignalSlotUtility::emitSignal(
            __CLASS__,
            SignalSlotUtility::SIGNNAL_QUEUEITEM_POSTPROCESS,
            [$queueId, &$field_array]
        );

        $queueConnection->update(
            'tx_crawler_queue',
            $field_array,
            ['qid' => (int)$queueId]
        );

        if ($this->debugMode) {
            $this->getLogger()->log(
                LogLevel::DEBUG,
                'crawler-readurl stop ' . microtime(true)
            );
        }

        return $ret;
    }
1543
1544
    /**
     * Read URL for not-yet-inserted log-entry
     *
     * Inserts the given fields as a new queue record (already stamped with
     * exec_time), executes it via readUrl_exec() and writes the serialized
     * result back to the new record.
     *
     * @param array $field_array Queue field array,
     *
     * @return mixed Whatever readUrl_exec() returned for the new entry
     */
    public function readUrlFromArray($field_array)
    {
        // Set exec_time to lock record:
        $field_array['exec_time'] = $this->getCurrentTime();

        $queueConnection = GeneralUtility::makeInstance(ConnectionPool::class)
            ->getConnectionForTable('tx_crawler_queue');
        $queueConnection->insert('tx_crawler_queue', $field_array);

        // Remember the id of the freshly inserted record and hand it on with the fields.
        $queueId = $queueConnection->lastInsertId('tx_crawler_queue', 'qid');
        $field_array['qid'] = $queueId;

        $result = $this->readUrl_exec($field_array);

        // Set result in log which also denotes the end of the processing of this entry.
        $resultFields = ['result_data' => serialize($result)];

        // Passed by reference so that the signal receivers see the same array that gets stored.
        SignalSlotUtility::emitSignal(
            __CLASS__,
            SignalSlotUtility::SIGNNAL_QUEUEITEM_POSTPROCESS,
            [$queueId, &$resultFields]
        );

        $queueConnection->update('tx_crawler_queue', $resultFields, ['qid' => $queueId]);

        return $result;
    }
1582
1583
    /**
     * Read URL for a queue record
     *
     * Depending on the stored parameters this either delegates to a callback
     * object (key "_CALLBACKOBJ") or performs a regular frontend request.
     *
     * @param array $queueRec Queue record
     * @return array|string|boolean 'ERROR' if the parameters could not be decoded, otherwise the callback/request result
     */
    public function readUrl_exec($queueRec)
    {
        // Decode parameters:
        $parameters = unserialize($queueRec['parameters']);
        if (!is_array($parameters)) {
            return 'ERROR';
        }

        // Calling object:
        if ($parameters['_CALLBACKOBJ']) {
            $objRef = $parameters['_CALLBACKOBJ'];
            $callBackObj = GeneralUtility::makeInstance($objRef);

            if (!is_object($callBackObj)) {
                return ['content' => 'No object: ' . $objRef];
            }

            unset($parameters['_CALLBACKOBJ']);

            return ['content' => serialize($callBackObj->crawler_execute($parameters, $this))];
        }

        // Regular FE request; the crawler id is the qid plus a hash over qid, set_id and the encryption key:
        $hashSource = $queueRec['qid'] . '|' . $queueRec['set_id'] . '|' . $GLOBALS['TYPO3_CONF_VARS']['SYS']['encryptionKey'];
        $crawlerId = $queueRec['qid'] . ':' . md5($hashSource);

        // Get result:
        $result = $this->requestUrl($parameters['url'], $crawlerId);

        EventDispatcher::getInstance()->post(
            'urlCrawled',
            $queueRec['set_id'],
            ['url' => $parameters['url'], 'result' => $result]
        );

        return $result;
    }
1618
1619
    /**
     * Gets the content of a URL.
     *
     * Sends a plain HTTP/1.0 GET request over a socket (optionally through the
     * configured proxy) unless direct requests are enabled, and follows
     * redirects when the "follow30x" extension setting is set.
     *
     * @param string $originalUrl URL to read
     * @param string $crawlerId Crawler ID string (qid + hash to verify)
     * @param integer $timeout Timeout time
     * @param integer $recursion Recursion limiter for 302 redirects
     * @return array|boolean Array with "request", "headers" and "content" (plus "parentRequest" after a redirect), false on failure
     */
    public function requestUrl($originalUrl, $crawlerId, $timeout = 2, $recursion = 10)
    {
        if (!$recursion) {
            return false;
        }

        // Parse URL, checking for scheme:
        $url = parse_url($originalUrl);

        if ($url === false) {
            $this->getLogger()->log(
                LogLevel::DEBUG,
                sprintf('Could not parse_url() for string "%s"', $originalUrl),
                ['crawlerId' => $crawlerId]
            );
            return false;
        }

        // A URL without scheme is accepted and treated like plain http below.
        $scheme = isset($url['scheme']) ? $url['scheme'] : '';

        if (!in_array($scheme, ['', 'http', 'https'], true)) {
            $this->getLogger()->log(
                LogLevel::DEBUG,
                sprintf('Scheme does not match for url "%s"', $originalUrl),
                ['crawlerId' => $crawlerId]
            );
            return false;
        }

        // direct request
        if (!empty($this->extensionSettings['makeDirectRequests'])) {
            return $this->sendDirectRequest($originalUrl, $crawlerId);
        }

        $reqHeaders = $this->buildRequestHeaderArray($url, $crawlerId);

        // thanks to Pierrick Caillon for adding proxy support
        $rurl = $url;

        if (!empty($GLOBALS['TYPO3_CONF_VARS']['SYS']['curlUse']) && !empty($GLOBALS['TYPO3_CONF_VARS']['SYS']['curlProxyServer'])) {
            $rurl = parse_url($GLOBALS['TYPO3_CONF_VARS']['SYS']['curlProxyServer']);
            if (!is_array($rurl)) {
                $this->getLogger()->log(
                    LogLevel::DEBUG,
                    sprintf('Could not parse_url() for string "%s"', $GLOBALS['TYPO3_CONF_VARS']['SYS']['curlProxyServer']),
                    ['crawlerId' => $crawlerId]
                );
                return false;
            }

            // When talking to a proxy the request line has to carry the absolute URL.
            $url['path'] = $scheme . '://'
                . (isset($url['host']) ? $url['host'] : '')
                . ((isset($url['port']) && $url['port'] > 0) ? ':' . $url['port'] : '')
                . (isset($url['path']) ? $url['path'] : '');
            $reqHeaders = $this->buildRequestHeaderArray($url, $crawlerId);
        }

        $host = isset($rurl['host']) ? $rurl['host'] : '';
        $explicitPort = (isset($rurl['port']) && $rurl['port'] > 0) ? $rurl['port'] : 0;

        if ($scheme === 'https') {
            $host = 'ssl://' . $host;
            $port = $explicitPort ?: 443;
        } else {
            $port = $explicitPort ?: 80;
        }

        $startTime = microtime(true);
        $fp = fsockopen($host, $port, $errno, $errstr, $timeout);

        if (!$fp) {
            $this->getLogger()->log(
                LogLevel::DEBUG,
                sprintf('Error while opening "%s"', $originalUrl),
                ['crawlerId' => $crawlerId, 'errno' => $errno, 'errstr' => $errstr]
            );
            return false;
        }

        // Request message:
        $msg = implode("\r\n", $reqHeaders) . "\r\n\r\n";
        fputs($fp, $msg);

        // Read response:
        $d = $this->getHttpResponseFromStream($fp);
        fclose($fp);

        $time = microtime(true) - $startTime;
        $this->log($originalUrl . ' ' . $time);

        // Implode content and headers:
        $result = [
            'request' => $msg,
            'headers' => implode('', $d['headers']),
            'content' => implode('', (array)$d['content'])
        ];

        if (empty($this->extensionSettings['follow30x'])) {
            return $result;
        }

        $newUrl = $this->getRequestUrlFrom302Header(
            $d['headers'],
            isset($url['user']) ? $url['user'] : '',
            isset($url['pass']) ? $url['pass'] : ''
        );

        if ($newUrl) {
            // Follow the redirect exactly once, with the recursion budget reduced by one.
            $newRequestUrl = $this->requestUrl($newUrl, $crawlerId, $timeout, $recursion - 1);

            if (!is_array($newRequestUrl)) {
                $this->getLogger()->log(
                    LogLevel::DEBUG,
                    sprintf('Error while opening "%s"', $newUrl),
                    ['crawlerId' => $crawlerId]
                );
                return false;
            }

            $result = array_merge(['parentRequest' => $result], $newRequestUrl);
        }

        return $result;
    }
1729
1730
    /**
     * Gets the base path of the website frontend.
     * (e.g. if you call http://mydomain.com/cms/index.php in
     * the browser the base path is "/cms/")
     *
     * Sources are tried in this order: the "frontendBasePath" extension
     * setting, the absRefPrefix of the current TSFE and, outside of CLI
     * mode, the TYPO3_SITE_PATH environment value. "/" is the fallback.
     *
     * @return string Base path of the website frontend
     */
    protected function getFrontendBasePath()
    {
        if (!empty($this->extensionSettings['frontendBasePath'])) {
            // Get the path from the extension settings:
            $basePath = $this->extensionSettings['frontendBasePath'];
        } elseif (!empty($GLOBALS['TSFE']->absRefPrefix)) {
            // If empty, try to use config.absRefPrefix:
            $basePath = $GLOBALS['TSFE']->absRefPrefix;
        } elseif (!defined('TYPO3_REQUESTTYPE_CLI') || !TYPO3_REQUESTTYPE_CLI) {
            // If not in CLI mode the base path can be determined from $_SERVER environment:
            $basePath = GeneralUtility::getIndpEnv('TYPO3_SITE_PATH');
        } else {
            $basePath = '/';
        }

        if ($basePath === '/') {
            return $basePath;
        }

        // Base path must be '/<pathSegments>/':
        $withLeadingSlash = '/' . ltrim($basePath, '/');

        return rtrim($withLeadingSlash, '/') . '/';
    }
1760
1761
    /**
     * Runs the given command through shell_exec() and hands back its output.
     *
     * @param string $command Shell command to be executed
     * @return string Outputted result of the command execution
     */
    protected function executeShellCommand($command)
    {
        $output = shell_exec($command);

        return $output;
    }
1771
1772
    /**
     * Reads HTTP response from the given stream.
     *
     * Header lines are read (and trimmed) up to the first empty line; everything
     * after that is collected untouched as content.
     *
     * @param  resource $streamPointer  Pointer to connection stream.
     * @return array                    Associative array with the following items:
     *                                  headers <array> Response headers sent by server.
     *                                  content <array> Content, with each line as an array item.
     */
    protected function getHttpResponseFromStream($streamPointer)
    {
        $response = ['headers' => [], 'content' => []];

        if (!is_resource($streamPointer)) {
            return $response;
        }

        // read headers
        // fgets() is compared against false explicitly: a line such as "0" is falsy
        // but still valid data and must not end the loop.
        while (($line = fgets($streamPointer, 2048)) !== false) {
            $line = trim($line);
            if ($line === '') {
                // The empty line separates the headers from the body.
                break;
            }
            $response['headers'][] = $line;
        }

        // read content
        while (($line = fgets($streamPointer, 2048)) !== false) {
            $response['content'][] = $line;
        }

        return $response;
    }
1803
1804
    /**
     * Appends a timestamped line to the file configured in the "logFileName"
     * extension setting. Does nothing when no file name is configured; if the
     * file cannot be written, that is reported through the logger instead.
     *
     * @param string $message Text to append to the log file
     */
    protected function log($message)
    {
        if (empty($this->extensionSettings['logFileName'])) {
            return;
        }

        $logLine = date('Ymd His') . ' ' . $message . PHP_EOL;

        // Warnings of file_put_contents() are suppressed, the return value is checked instead.
        if (@file_put_contents($this->extensionSettings['logFileName'], $logLine, FILE_APPEND)) {
            return;
        }

        $this->getLogger()->log(
            LogLevel::INFO,
            sprintf('File "%s" could not be written, please check file permissions.', $this->extensionSettings['logFileName'])
        );
    }
1820
1821
    /**
     * Builds HTTP request headers.
     *
     * Missing URL components are tolerated: an absent path is requested as "/",
     * and query string and credentials are only added when present.
     *
     * @param array $url URL components as returned by parse_url()
     * @param string $crawlerId Value sent in the X-T3crawler header
     *
     * @return array List of header lines, starting with the request line
     */
    protected function buildRequestHeaderArray(array $url, $crawlerId)
    {
        // An empty request target would produce a malformed request line, so fall back to "/".
        $path = (isset($url['path']) && $url['path'] !== '') ? $url['path'] : '/';
        $query = isset($url['query']) ? (string)$url['query'] : '';
        $host = isset($url['host']) ? $url['host'] : '';
        $user = isset($url['user']) ? (string)$url['user'] : '';
        $pass = isset($url['pass']) ? $url['pass'] : '';

        $reqHeaders = [];
        $reqHeaders[] = 'GET ' . $path . ($query !== '' ? '?' . $query : '') . ' HTTP/1.0';
        $reqHeaders[] = 'Host: ' . $host;
        if (stripos($query, 'ADMCMD_previewWS') !== false) {
            // Workspace preview requests get a backend user cookie attached.
            $reqHeaders[] = 'Cookie: $Version="1"; be_typo_user="1"; $Path=/';
        }
        $reqHeaders[] = 'Connection: close';
        if ($user !== '') {
            $reqHeaders[] = 'Authorization: Basic ' . base64_encode($user . ':' . $pass);
        }
        $reqHeaders[] = 'X-T3crawler: ' . $crawlerId;
        $reqHeaders[] = 'User-Agent: TYPO3 crawler';

        return $reqHeaders;
    }
1845
1846
    /**
     * Check if the submitted HTTP-Header contains a redirect location and built new crawler-url
     *
     * Only responses whose status line contains "301 Moved", "302 Found" or
     * "302 Moved" are treated as redirects. If HTTP auth credentials are given
     * they are injected into the redirect target.
     *
     * @param array $headers HTTP Header
     * @param string $user HTTP Auth. User
     * @param string $pass HTTP Auth. Password
     * @return bool|string The URL to request next, false if there is no usable redirect
     */
    protected function getRequestUrlFrom302Header($headers, $user = '', $pass = '')
    {
        if (!is_array($headers) || !isset($headers[0])) {
            return false;
        }

        $statusLine = $headers[0];
        $isRedirect = stristr($statusLine, '301 Moved')
            || stristr($statusLine, '302 Found')
            || stristr($statusLine, '302 Moved');
        if (!$isRedirect) {
            return false;
        }

        // Look for the first Location header. Header names are compared case-insensitively and
        // the line is split at the first colon only, so the value may contain colons itself.
        $location = null;
        foreach ($headers as $headerLine) {
            $parts = explode(':', $headerLine, 2);
            if (count($parts) === 2 && strcasecmp(trim($parts[0]), 'Location') === 0) {
                $location = trim($parts[1]);
                break;
            }
        }
        if ($location === null || $location === '') {
            return false;
        }

        if ($user == '') {
            return $location;
        }

        // Rebuild the target with the credentials in front of the host.
        $target = parse_url($location);
        if (!$target) {
            return false;
        }

        $newUrl = (isset($target['scheme']) ? $target['scheme'] : '') . '://'
            . $user . ':' . $pass . '@'
            . (isset($target['host']) ? $target['host'] : '')
            . (isset($target['port']) ? ':' . $target['port'] : '')
            . (isset($target['path']) ? $target['path'] : '');
        if (isset($target['query']) && $target['query'] != '') {
            $newUrl .= '?' . $target['query'];
        }

        return $newUrl;
    }
1888
1889
    /**************************
1890
     *
1891
     * tslib_fe hooks:
1892
     *
1893
     **************************/
1894
1895
    /**
     * Initialization hook (called after database connection)
     * Takes the "HTTP_X_T3CRAWLER" header and looks up queue record and verifies if the session comes from the system (by comparing hashes)
     *
     * The header is expected in the form "<qid>:<hash>". If the record does not
     * exist or the hash does not match, the request is terminated.
     *
     * @param array $params Parameters from frontend
     * @param object $ref TSFE object (reference under PHP5); not used by this method
     * @return void
     *
     * FIXME: Look like this is not used, in commit 9910d3f40cce15f4e9b7bcd0488bf21f31d53ebc it's added as public,
     * FIXME: I think this can be removed. (TNM)
     */
    public function fe_init(&$params, $ref)
    {
        // Authenticate crawler request:
        if (!isset($_SERVER['HTTP_X_T3CRAWLER'])) {
            return;
        }

        // The header comes from the client, so do not rely on it containing a ":" at all.
        list($queueId, $hash) = array_pad(explode(':', $_SERVER['HTTP_X_T3CRAWLER']), 2, '');

        $queueRec = $this->queryBuilder
            ->select('*')
            ->from('tx_crawler_queue')
            ->where(
                $this->queryBuilder->expr()->eq('qid', $this->queryBuilder->createNamedParameter($queueId, \PDO::PARAM_INT))
            )
            ->execute()
            ->fetch();

        $hashMatches = false;
        if (is_array($queueRec)) {
            $expectedHash = md5($queueRec['qid'] . '|' . $queueRec['set_id'] . '|' . $GLOBALS['TYPO3_CONF_VARS']['SYS']['encryptionKey']);
            // hash_equals() compares in constant time, which matters because $hash is attacker-controlled.
            $hashMatches = hash_equals($expectedHash, (string)$hash);
        }

        // If a crawler record was found and hash was matching, set it up:
        if ($hashMatches) {
            $params['pObj']->applicationData['tx_crawler']['running'] = true;
            // NOTE(review): unserialize() on a database column; make sure nothing untrusted can write there.
            $params['pObj']->applicationData['tx_crawler']['parameters'] = unserialize($queueRec['parameters']);
            $params['pObj']->applicationData['tx_crawler']['log'] = [];
        } else {
            die('No crawler entry found!');
        }
    }
1931
1932
    /*****************************
1933
     *
1934
     * Compiling URLs to crawl - tools
1935
     *
1936
     *****************************/
1937
1938
    /**
     * Walks the page tree below the given root page and compiles the URL rows
     * for every page in it, following mount points on the way.
     *
     * The scheduling options are stored on the controller first because
     * drawURLs_addRowsForPage() reads them from there.
     *
     * @param integer $id Root page id to start from.
     * @param integer $depth Depth of tree, 0=only id-page, 1= on sublevel, 99 = infinite
     * @param integer $scheduledTime Unix Time when the URL is timed to be visited when put in queue
     * @param integer $reqMinute Number of requests per minute (creates the interleave between requests)
     * @param boolean $submitCrawlUrls If set, submits the URLs to queue in database (real crawling)
     * @param boolean $downloadCrawlUrls If set (and submitcrawlUrls is false) will fill $downloadUrls with entries)
     * @param array $incomingProcInstructions Array of processing instructions
     * @param array $configurationSelection Array of configuration keys
     * @return string Concatenated output of drawURLs_addRowsForPage() for all visited pages
     */
    public function getPageTreeAndUrls(
        $id,
        $depth,
        $scheduledTime,
        $reqMinute,
        $submitCrawlUrls,
        $downloadCrawlUrls,
        array $incomingProcInstructions,
        array $configurationSelection
    ) {
        global $LANG;
        if (!is_object($LANG)) {
            $LANG = GeneralUtility::makeInstance(LanguageService::class);
            $LANG->init(0);
        }
        $this->scheduledTime = $scheduledTime;
        $this->reqMinute = $reqMinute;
        $this->submitCrawlUrls = $submitCrawlUrls;
        $this->downloadCrawlUrls = $downloadCrawlUrls;
        $this->incomingProcInstructions = $incomingProcInstructions;
        $this->incomingConfigurationSelection = $configurationSelection;

        $this->duplicateTrack = [];
        $this->downloadUrls = [];

        // Drawing tree:
        /* @var PageTreeView $tree */
        $tree = GeneralUtility::makeInstance(PageTreeView::class);
        $perms_clause = $GLOBALS['BE_USER']->getPagePermsClause(1);
        $tree->init('AND ' . $perms_clause);

        $pageInfo = BackendUtility::readPageAccess($id, $perms_clause);
        if (is_array($pageInfo)) {
            // Set root row:
            $tree->tree[] = [
                'row' => $pageInfo,
                'HTML' => IconUtility::getIconForRecord('pages', $pageInfo)
            ];
        }

        // Get branch beneath:
        if ($depth) {
            $tree->getTree($id, $depth, '');
        }

        // Traverse page tree:
        $code = '';

        foreach ($tree->tree as $data) {
            $this->MP = false;

            // recognize mount points (doktype 7)
            if ((int)$data['row']['doktype'] === 7) {
                // Fetch the full page record with only the "deleted" restriction applied.
                $this->queryBuilder->getRestrictions()->removeAll()->add(GeneralUtility::makeInstance(DeletedRestriction::class));
                $mountpage = $this->queryBuilder
                    ->select('*')
                    ->from('pages')
                    ->where(
                        $this->queryBuilder->expr()->eq('uid', $this->queryBuilder->createNamedParameter($data['row']['uid'], \PDO::PARAM_INT))
                    )
                    ->execute()
                    ->fetchAll();
                // The restriction container has no reset(); the query builder itself
                // offers resetRestrictions() to re-apply the default restrictions.
                $this->queryBuilder->resetRestrictions();

                // Without a page record there is nothing to mount; treat the page like a regular one.
                if (!empty($mountpage[0])) {
                    $mountRow = $mountpage[0];

                    // fetch mounted pages
                    $this->MP = $mountRow['mount_pid'] . '-' . $data['row']['uid'];

                    $mountTree = GeneralUtility::makeInstance(PageTreeView::class);
                    $mountTree->init('AND ' . $perms_clause);
                    $mountTree->getTree($mountRow['mount_pid'], $depth, '');

                    foreach ($mountTree->tree as $mountData) {
                        $code .= $this->drawURLs_addRowsForPage(
                            $mountData['row'],
                            $mountData['HTML'] . BackendUtility::getRecordTitle('pages', $mountData['row'], true)
                        );
                    }

                    // replace page when mount_pid_ol is enabled
                    if ($mountRow['mount_pid_ol']) {
                        $data['row']['uid'] = $mountRow['mount_pid'];
                    } else {
                        // if the mount_pid_ol is not set the MP must not be used for the mountpoint page
                        $this->MP = false;
                    }
                }
            }

            $code .= $this->drawURLs_addRowsForPage(
                $data['row'],
                $data['HTML'] . BackendUtility::getRecordTitle('pages', $data['row'], true)
            );
        }

        return $code;
    }
2045
2046
    /**
     * Expands exclude string
     *
     * The string is a comma separated list of "<pid>" or "<pid>+<depth>" parts.
     * A part with a "+" but no depth is expanded with depth 99; without "+"
     * only the page itself is listed. Non-empty results are remembered per
     * exclude string for subsequent calls.
     *
     * @param string $excludeString Exclude string
     * @return array List of page ids (duplicates removed)
     */
    public function expandExcludeString($excludeString)
    {
        // internal static caches;
        static $expandedExcludeStringCache;
        static $treeCache;

        if (!empty($expandedExcludeStringCache[$excludeString])) {
            return $expandedExcludeStringCache[$excludeString];
        }

        $pidList = [];

        if (!empty($excludeString)) {
            /** @var PageTreeView $tree */
            $tree = GeneralUtility::makeInstance(PageTreeView::class);
            $tree->init('AND ' . $this->backendUser->getPagePermsClause(1));

            foreach (GeneralUtility::trimExplode(',', $excludeString) as $excludePart) {
                list($pid, $depth) = GeneralUtility::trimExplode('+', $excludePart);

                // default is "page only" = "depth=0"
                if (empty($depth)) {
                    $depth = stristr($excludePart, '+') ? 99 : 0;
                }

                $pidList[] = $pid;

                if (!($depth > 0)) {
                    continue;
                }

                // Build the sub tree only once per pid/depth combination.
                if (empty($treeCache[$pid][$depth])) {
                    $tree->reset();
                    $tree->getTree($pid, $depth);
                    $treeCache[$pid][$depth] = $tree->tree;
                }

                foreach ($treeCache[$pid][$depth] as $treeEntry) {
                    $pidList[] = $treeEntry['row']['uid'];
                }
            }
        }

        $expandedExcludeStringCache[$excludeString] = array_unique($pidList);

        return $expandedExcludeStringCache[$excludeString];
    }
2097
2098
    /**
     * Create the rows for display of the page tree
     * For each page a number of rows are shown displaying GET variable configuration
     *
     * One table row is rendered per configuration that applies to the page.
     * If the page is listed in the configuration's exclude string, a short
     * "excluded" row is rendered instead. If no configuration applies at all,
     * a single "No entries" row is returned.
     *
     * @param array $pageRow Page row
     * @param string $pageTitleAndIcon Page icon and title for row (inserted into the markup unescaped)
     * @return string HTML <tr> content (one or more)
     */
    public function drawURLs_addRowsForPage(array $pageRow, $pageTitleAndIcon)
    {
        $skipMessage = '';

        // Get list of configurations
        $configurations = $this->getUrlsForPageRow($pageRow, $skipMessage);

        if (count($this->incomingConfigurationSelection) > 0) {
            // remove configuration that does not match the current selection
            foreach ($configurations as $confKey => $confArray) {
                if (!in_array($confKey, $this->incomingConfigurationSelection)) {
                    unset($configurations[$confKey]);
                }
            }
        }

        if (!count($configurations)) {
            $message = !empty($skipMessage) ? ' (' . $skipMessage . ')' : '';

            // Compile row:
            return '
                <tr class="bgColor-20" style="border-bottom: 1px solid black;">
                    <td>' . $pageTitleAndIcon . '</td>
                    <td colspan="6"><em>No entries</em>' . $message . '</td>
                </tr>';
        }

        // Sub configuration options shown in the "options" column, with their labels
        $optionLabels = [
            'userGroups' => 'User Groups: ',
            'baseUrl' => 'Base Url: ',
            'procInstrFilter' => 'ProcInstr: ',
        ];

        // Traverse parameter combinations:
        $c = 0;
        $content = '';
        $rowCount = count($configurations);
        foreach ($configurations as $confKey => $confArray) {
            $rowClass = 'bgColor' . ($c % 2 ? '-20' : '-10');
            $subCfg = $confArray['subCfg'] ?? [];

            // Title column: spans all rows of this page, so it is only emitted with the first row
            $titleClm = $c === 0
                ? '<td rowspan="' . $rowCount . '">' . $pageTitleAndIcon . '</td>'
                : '';

            if (in_array($pageRow['uid'], $this->expandExcludeString($subCfg['exclude'] ?? ''))) {
                $content .= '<tr class="' . $rowClass . '">
                            ' . $titleClm . '
                            <td>' . htmlspecialchars($confKey) . '</td>
                            <td colspan="5"><em>No entries</em> (Page is excluded in this configuration)</td>
                        </tr>';
                $c++;
                continue;
            }

            // URL list:
            $urlList = $this->urlListFromUrlArray(
                $confArray,
                $pageRow,
                $this->scheduledTime,
                $this->reqMinute,
                $this->submitCrawlUrls,
                $this->downloadCrawlUrls,
                $this->duplicateTrack,
                $this->downloadUrls,
                $this->incomingProcInstructions // if empty the urls won't be filtered by processing instructions
            );

            // Expanded parameters:
            $paramExpanded = '';
            $calcAccu = [];
            $calcRes = 1;
            foreach ($confArray['paramExpanded'] as $gVar => $gVal) {
                $valueCount = count($gVal);
                $paramExpanded .= '
                            <tr>
                                <td class="bgColor4-20">' . htmlspecialchars('&' . $gVar . '=') . '<br/>' .
                                                '(' . $valueCount . ')' .
                                                '</td>
                                <td class="bgColor4" nowrap="nowrap">' . nl2br(htmlspecialchars(implode(chr(10), $gVal))) . '</td>
                            </tr>
                        ';
                $calcRes *= $valueCount;
                $calcAccu[] = $valueCount;
            }
            $paramExpanded = '<table class="lrPadding c-list param-expanded">' . $paramExpanded . '</table>';
            $paramExpanded .= 'Comb: ' . implode('*', $calcAccu) . '=' . $calcRes;

            // Options: values come from configuration records, so escape them for the HTML context
            $optionValues = '';
            foreach ($optionLabels as $optionKey => $optionLabel) {
                if (!empty($subCfg[$optionKey])) {
                    $optionValues .= $optionLabel . htmlspecialchars((string)$subCfg[$optionKey]) . '<br/>';
                }
            }

            // Compile row:
            $content .= '
                        <tr class="' . $rowClass . '">
                            ' . $titleClm . '
                            <td>' . htmlspecialchars($confKey) . '</td>
                            <td>' . nl2br(htmlspecialchars(rawurldecode(trim(str_replace('&', chr(10) . '&', GeneralUtility::implodeArrayForUrl('', $confArray['paramParsed'])))))) . '</td>
                            <td>' . $paramExpanded . '</td>
                            <td nowrap="nowrap">' . $urlList . '</td>
                            <td nowrap="nowrap">' . $optionValues . '</td>
                            <td nowrap="nowrap">' . DebugUtility::viewArray($subCfg['procInstrParams.'] ?? null) . '</td>
                        </tr>';

            $c++;
        }

        return $content;
    }
2215
2216
    /*****************************
2217
     *
2218
     * CLI functions
2219
     *
2220
     *****************************/
2221
2222
    /**
     * Main function for running from Command Line PHP script (cron job)
     * See ext/crawler/cli/crawler_cli.phpsh for details
     *
     * The options --countInARun, --sleepAfterFinish and --sleepTime override
     * the extension settings of the same name when given.
     *
     * @param array $args Command line tokens, parsed via setCliArgs()
     * @return int Bitmask built from the CLI_STATUS_* constants
     */
    public function CLI_main($args)
    {
        $this->setCliArgs($args);
        $this->setAccessMode('cli');
        $result = self::CLI_STATUS_NOTHING_PROCCESSED;

        // NOTE(review): the instance itself is not used in this method; the call is only
        // kept in case the constructor has side effects - confirm and remove.
        GeneralUtility::makeInstance(CrawlerCommandLineController::class);

        if ($this->getDisabled() || !$this->CLI_checkAndAcquireNewProcess($this->CLI_buildProcessId())) {
            return $result | self::CLI_STATUS_ABORTED;
        }

        // Command line value (as integer) if given, extension setting of the same name otherwise
        $resolveSetting = function ($name) {
            $cliValue = $this->cli_argValue('--' . $name, 0);
            return $cliValue ? intval($cliValue) : $this->extensionSettings[$name];
        };

        $countInARun = $resolveSetting('countInARun');
        // Seconds
        $sleepAfterFinish = $resolveSetting('sleepAfterFinish');
        // Milliseconds
        $sleepTime = $resolveSetting('sleepTime');

        try {
            // Run process:
            $result = $this->CLI_run($countInARun, $sleepTime, $sleepAfterFinish);
        } catch (\Exception $e) {
            $this->CLI_debug(get_class($e) . ': ' . $e->getMessage());
            $result = self::CLI_STATUS_ABORTED;
        }

        // Cleanup
        $this->processRepository->deleteProcessesWithoutItemsAssigned();

        //TODO can't we do that in a clean way?
        $this->CLI_releaseProcesses($this->CLI_buildProcessId());

        // Query once and use the same number for the log message and the status
        $unprocessedItems = $this->queueRepository->countUnprocessedItems();
        $this->CLI_debug("Unprocessed Items remaining:" . $unprocessedItems . " (" . $this->CLI_buildProcessId() . ")");
        $result |= ($unprocessedItems > 0 ? self::CLI_STATUS_REMAIN : self::CLI_STATUS_NOTHING_PROCCESSED);

        return $result;
    }
2264
2265
    /**
     * Helper function: returns one value of a command line option as parsed by setCliArgs()
     *
     * @param string $option Option string, eg. "-s"
     * @param int $idx Value index, default is 0 (zero) = the first one...
     * @return string The value, or an empty string if the option or the index is not set
     */
    private function cli_argValue($option, $idx = 0)
    {
        // setCliArgs() stores the parsed options in $this->cliArgs
        if (isset($this->cliArgs[$option]) && is_array($this->cliArgs[$option])) {
            return $this->cliArgs[$option][$idx] ?? '';
        }

        return '';
    }
2275
2276
    /**
     * Helper function: echoes a string to the CLI
     *
     * Mirrors cli_echo() of the TYPO3 < v9 CommandLineController that this
     * class copies its CLI helpers from: nothing is printed with "-ss";
     * with "-s" or "--silent" only forced output is printed.
     * No line break is appended, callers add their own.
     *
     * @param string $string The string to output
     * @param bool $force If set, the string is printed even in silent mode ("-s" / "--silent")
     * @return bool TRUE if the string was printed
     */
    private function cli_echo($string, $force = false)
    {
        if (isset($this->cliArgs['-ss'])) {
            return false;
        }

        $silent = isset($this->cliArgs['-s']) || isset($this->cliArgs['--silent']);
        if ($silent && !$force) {
            return false;
        }

        echo $string;

        return true;
    }
2284
2285
    /**
     * Set cli args
     *
     * This is a copy from the CommandLineController from TYPO3 < v9
     *
     * Tokens starting with "-" (and not followed by a digit) open a new
     * option; "option=value" stores the value directly. All other tokens are
     * appended to the option seen last, or to "_DEFAULT" before the first
     * option. The result is stored in $this->cliArgs.
     *
     * Using the same option twice prints an error and terminates the script.
     *
     * TODO: Rework
     *
     * @param array $argv
     */
    private function setCliArgs(array $argv)
    {
        $cli_options = [];
        $index = '_DEFAULT';
        foreach ($argv as $token) {
            $tokenString = (string)$token;
            // Guarded offsets: empty or one-character tokens must not raise string offset errors
            $firstChar = $tokenString[0] ?? '';
            $secondChar = $tokenString[1] ?? '';

            // Options starting with a number is invalid - they could be negative values!
            $isOption = $firstChar === '-' && !MathUtility::canBeInterpretedAsInteger($secondChar);
            if (!$isOption) {
                $cli_options[$index][] = $token;
                continue;
            }

            // Pad so that an option without "=value" yields a null value instead of a notice
            list($index, $opt) = array_pad(explode('=', $tokenString, 2), 2, null);
            if (isset($cli_options[$index])) {
                echo 'ERROR: Option ' . $index . ' was used twice!' . LF;
                die;
            }
            $cli_options[$index] = [];
            if ($opt !== null) {
                $cli_options[$index][] = $opt;
            }
        }

        $this->cliArgs = $cli_options;
    }
2316
2317
2318
2319
    /**
     * Function executed by crawler_im.php cli script.
     *
     * Builds the page tree and URL list for the page id given on the command
     * line and, depending on the "-o" option, prints the URLs ("url"),
     * queues them ("queue"), or queues and executes them right away ("exec").
     * Without "-o" only the list of found entries is printed.
     *
     * Further options read here: "-d" (depth, 0-99), "-n" (defaults to 30,
     * range 1-1000), "-proc" (comma list of processing instructions) and
     * "-conf" (configuration keys).
     *
     * @param array $args Command line tokens; parsed via setCliArgs() when not empty
     * @return void
     */
    public function CLI_main_im($args = [])
    {
        $this->setAccessMode('cli_im');

        if (!empty($args)) {
            $this->setCliArgs($args);
        }

        // Force user to admin state and set workspace to "Live":
        $this->backendUser->user['admin'] = 1;
        $this->backendUser->setWorkspace(0);

        $mode = $this->cli_argValue('-o', 0);
        $submitToQueue = $mode === 'queue' || $mode === 'exec';

        if ($mode === 'exec') {
            $this->registerQueueEntriesInternallyOnly = true;
        }

        // Tokens given before the first option, as collected by setCliArgs()
        $defaultArgs = $this->cliArgs['_DEFAULT'] ?? [];
        if (isset($defaultArgs[2])) {
            // Crawler is called over TYPO3 BE
            $pageId = MathUtility::forceIntegerInRange($defaultArgs[2], 0);
        } else {
            // Crawler is called over cli
            $pageId = MathUtility::forceIntegerInRange($defaultArgs[1] ?? 0, 0);
        }

        $configurationKeys = $this->getConfigurationKeys();

        if (!is_array($configurationKeys)) {
            $configurations = $this->getUrlsForPageId($pageId);
            $configurationKeys = is_array($configurations) ? array_keys($configurations) : [];
        }

        if ($submitToQueue) {
            $reason = new Reason();
            $reason->setReason(Reason::REASON_GUI_SUBMIT);
            $reason->setDetailText('The cli script of the crawler added to the queue');
            // NOTE(review): $this->setID is only assigned further below, so the event
            // is posted with the previous value - confirm this is intended.
            EventDispatcher::getInstance()->post(
                'invokeQueueChange',
                $this->setID,
                ['reason' => $reason]
            );
        }

        if ($this->extensionSettings['cleanUpOldQueueEntries']) {
            $this->cleanUpOldQueueEntries();
        }

        $this->setID = (int) GeneralUtility::md5int(microtime());
        $this->getPageTreeAndUrls(
            $pageId,
            MathUtility::forceIntegerInRange($this->cli_argValue('-d', 0), 0, 99),
            $this->getCurrentTime(),
            MathUtility::forceIntegerInRange(isset($this->cliArgs['-n']) ? $this->cli_argValue('-n', 0) : 30, 1, 1000),
            $submitToQueue,
            $mode === 'url',
            GeneralUtility::trimExplode(',', $this->cli_argValue('-proc', 0), true),
            $configurationKeys
        );

        if ($mode === 'url') {
            $this->cli_echo(implode(chr(10), $this->downloadUrls) . chr(10), true);
        } elseif ($mode === 'exec') {
            $this->cli_echo("Executing " . count($this->urlList) . " requests right away:\n\n");
            $this->cli_echo(implode(chr(10), $this->urlList) . chr(10));
            $this->cli_echo("\nProcessing:\n");

            foreach ($this->queueEntries as $queueRec) {
                $p = unserialize($queueRec['parameters']);
                $this->cli_echo($p['url'] . ' (' . implode(',', $p['procInstructions']) . ') => ');

                $result = $this->readUrlFromArray($queueRec);

                // NOTE(review): unserialize() on content returned by readUrlFromArray();
                // if that content can be influenced from outside this is an object
                // injection risk - consider a safer format or restricting allowed classes.
                $requestResult = unserialize($result['content']);
                if (is_array($requestResult)) {
                    $resLog = is_array($requestResult['log']) ? chr(10) . chr(9) . chr(9) . implode(chr(10) . chr(9) . chr(9), $requestResult['log']) : '';
                    $this->cli_echo('OK: ' . $resLog . chr(10));
                } else {
                    $this->cli_echo('Error checking Crawler Result: ' . substr(preg_replace('/\s+/', ' ', strip_tags($result['content'])), 0, 30000) . '...' . chr(10));
                }
            }
        } elseif ($mode === 'queue') {
            $this->cli_echo("Putting " . count($this->urlList) . " entries in queue:\n\n");
            $this->cli_echo(implode(chr(10), $this->urlList) . chr(10));
        } else {
            $this->cli_echo(count($this->urlList) . " entries found for processing. (Use -o to decide action):\n\n", true);
            $this->cli_echo(implode(chr(10), $this->urlList) . chr(10), true);
        }
    }
2415
2416
    /**
     * Entry point of the CLI flush command.
     *
     * Calls getLogEntriesForPageId() for the page id given on the command line,
     * with the filter selected through "-o": "all" (no filter), "finished" or
     * "pending". A page id of 0 requests a full flush. TRUE is passed as
     * third argument (presumably the flush switch - confirm against
     * getLogEntriesForPageId()).
     *
     * @return bool FALSE if "-o" is missing or not one of the supported modes,
     *              or if getLogEntriesForPageId() returned FALSE; TRUE otherwise
     */
    public function CLI_main_flush()
    {
        $this->setAccessMode('cli_flush');
        $cliObj = GeneralUtility::makeInstance(FlushCommandLineController::class);

        // Force user to admin state and set workspace to "Live":
        $this->backendUser->user['admin'] = 1;
        $this->backendUser->setWorkspace(0);

        $pageId = MathUtility::forceIntegerInRange($cliObj->cli_args['_DEFAULT'][1], 0);
        $fullFlush = ($pageId == 0);

        // NOTE(review): the mode is read from the arguments parsed by setCliArgs() on this
        // object, while the page id is read from $cliObj - confirm both hold the same arguments.
        $mode = $this->cli_argValue('-o', 0);

        // Stays false when no supported mode is given, so nothing-done is not reported as success
        $result = false;

        switch ($mode) {
            case 'all':
                $result = $this->getLogEntriesForPageId($pageId, '', true, $fullFlush);
                break;
            case 'finished':
            case 'pending':
                $result = $this->getLogEntriesForPageId($pageId, $mode, true, $fullFlush);
                break;
            default:
        }

        return $result !== false;
    }
2448
2449
    /**
     * Obtains configuration keys from the CLI arguments
     *
     * Reads the comma separated value of the "-conf" option.
     *
     * @return array List of configuration keys, empty array if no keys were given
     */
    protected function getConfigurationKeys()
    {
        $parameter = trim($this->cli_argValue('-conf', 0));

        return $parameter !== '' ? GeneralUtility::trimExplode(',', $parameter) : [];
    }
2459
2460
    /**
     * Running the functionality of the CLI (crawling URLs from queue)
     *
     * Steps: run the CLI hooks, purge executed queue entries older than the
     * "purgeQueueDays" setting, select up to $countInARun due queue entries,
     * reserve them for the current process and read their URLs one by one.
     *
     * NOTE(review): all queries are built on the shared $this->queryBuilder
     * instance, so query parts set by one query may still be present when the
     * next one is built - confirm how that builder is provided.
     *
     * @param int $countInARun Maximum number of queue entries to process in this run
     * @param int $sleepTime Pause after each processed entry, passed to usleep()
     *                       (NOTE(review): usleep() takes microseconds, CLI_main() labels this value milliseconds)
     * @param int $sleepAfterFinish Pause in seconds after all entries are processed
     * @return int Bitmask built from the CLI_STATUS_* constants and the readUrl() results
     */
    public function CLI_run($countInARun, $sleepTime, $sleepAfterFinish)
    {
        $result = 0;
        $counter = 0;

        // First, run hooks:
        $this->CLI_runHooks();

        // Clean up the queue
        $purgeQueueDays = intval($this->extensionSettings['purgeQueueDays']);
        if ($purgeQueueDays > 0) {
            $purgeDate = $this->getCurrentTime() - 24 * 60 * 60 * $purgeQueueDays;

            // execute() is required, otherwise the statement is only built and nothing gets purged
            $del = $this->queryBuilder
                ->delete('tx_crawler_queue')
                ->where(
                    'exec_time != 0 AND exec_time < ' . $purgeDate
                )
                ->execute();

            // Strict comparison: zero deleted rows is not a failure
            if (false === $del) {
                $this->getLogger()->log(
                    LogLevel::INFO,
                    'Records could not be deleted.'
                );
            }
        }

        // Select entries:
        //TODO Shouldn't this reside within the transaction?
        $rows = $this->queryBuilder
            ->select('qid', 'scheduled')
            ->from('tx_crawler_queue')
            ->where(
                $this->queryBuilder->expr()->eq('exec_time', 0),
                $this->queryBuilder->expr()->eq('process_scheduled', 0),
                $this->queryBuilder->expr()->lte('scheduled', $this->getCurrentTime())
            )
            ->orderBy('scheduled')
            ->addOrderBy('qid')
            ->setMaxResults($countInARun)
            ->execute()
            ->fetchAll();

        if (count($rows) > 0) {
            $quidList = array_column($rows, 'qid');

            $processId = $this->CLI_buildProcessId();

            //reserve queue entries for process

            //$this->queryBuilder->getConnection()->executeQuery('BEGIN');
            //TODO make sure we're not taking assigned queue-entires

            //save the number of assigned queue entrys to determine who many have been processed later
            // set() binds the given values as named parameters itself, so plain values are passed
            $numberOfAffectedRows = $this->queryBuilder
                ->update('tx_crawler_queue')
                ->where(
                    $this->queryBuilder->expr()->in('qid', $quidList)
                )
                ->set('process_scheduled', $this->getCurrentTime())
                ->set('process_id', $processId)
                ->execute();

            // The process id is an identification string, so it must not be cast to int here
            GeneralUtility::makeInstance(ConnectionPool::class)->getConnectionForTable('tx_crawler_process')
                ->update(
                    'tx_crawler_process',
                    [ 'assigned_items_count' => (int)$numberOfAffectedRows ],
                    [ 'process_id' => $processId ]
                );

            if ($numberOfAffectedRows != count($quidList)) {
                //$this->queryBuilder->getConnection()->executeQuery('ROLLBACK');
                $this->CLI_debug("Nothing processed due to multi-process collision (" . $this->CLI_buildProcessId() . ")");
                return ($result | self::CLI_STATUS_ABORTED);
            }
            //$this->queryBuilder->getConnection()->executeQuery('COMMIT');

            foreach ($rows as $r) {
                $result |= $this->readUrl($r['qid']);

                $counter++;
                usleep(intval($sleepTime)); // Just to relax the system

                // if during the start and the current read url the cli has been disable we need to return from the function
                // mark the process NOT as ended.
                if ($this->getDisabled()) {
                    return ($result | self::CLI_STATUS_ABORTED);
                }

                if (!$this->CLI_checkIfProcessIsActive($this->CLI_buildProcessId())) {
                    $this->CLI_debug("conflict / timeout (" . $this->CLI_buildProcessId() . ")");

                    //TODO might need an additional returncode
                    $result |= self::CLI_STATUS_ABORTED;
                    break; //possible timeout
                }
            }

            sleep(intval($sleepAfterFinish));

            $msg = 'Rows: ' . $counter;
            $this->CLI_debug($msg . " (" . $this->CLI_buildProcessId() . ")");
        } else {
            $this->CLI_debug("Nothing within queue which needs to be processed (" . $this->CLI_buildProcessId() . ")");
        }

        if ($counter > 0) {
            $result |= self::CLI_STATUS_PROCESSED;
        }

        return $result;
    }
2585
2586
    /**
     * Activate hooks
     *
     * Instantiates every class registered in
     * $TYPO3_CONF_VARS['EXTCONF']['crawler']['cli_hooks'] and calls its
     * crawler_init() method with this controller as argument.
     *
     * @return void
     */
    public function CLI_runHooks()
    {
        global $TYPO3_CONF_VARS;

        $registeredHooks = $TYPO3_CONF_VARS['EXTCONF']['crawler']['cli_hooks'];
        if (!is_array($registeredHooks)) {
            return;
        }

        foreach ($registeredHooks as $hookReference) {
            $hookInstance = GeneralUtility::makeInstance($hookReference);
            if (!is_object($hookInstance)) {
                continue;
            }
            $hookInstance->crawler_init($this);
        }
    }
2603
2604
    /**
     * Try to register a new crawler process under the given id.
     *
     * Reads all rows of tx_crawler_process that are active and not deleted.
     * Rows whose ttl lies before the current time are collected as orphans,
     * all others count against the configured "processLimit". A new process
     * row is inserted only while that limit has not been reached. In both
     * cases the collected orphans are released and the clean-up of processes
     * marked as deleted is triggered afterwards.
     *
     * @todo preemption might not be the most elegant way to clean up
     *
     * @param string $id identification string for the process
     * @return boolean true when the process row was inserted, false otherwise
     */
    public function CLI_checkAndAcquireNewProcess($id)
    {
        $systemProcessId = getmypid();
        if ($systemProcessId < 1) {
            return false;
        }

        $runningProcesses = 0;
        $expiredProcessIds = [];

        //$this->queryBuilder->getConnection()->executeQuery('BEGIN');

        $statement = $this->queryBuilder
            ->select('process_id', 'ttl')
            ->from('tx_crawler_process')
            ->where('active = 1 AND deleted = 0')
            ->execute();

        $currentTime = $this->getCurrentTime();

        while ($row = $statement->fetch()) {
            if ($row['ttl'] < $currentTime) {
                // ttl already expired: treat as orphan, do not count it as running
                $expiredProcessIds[] = $row['process_id'];
                continue;
            }
            $runningProcesses++;
        }

        $processLimit = intval($this->extensionSettings['processLimit']);
        $acquired = $runningProcesses < $processLimit;

        if (!$acquired) {
            $this->CLI_debug("Processlimit reached (" . ($runningProcesses) . "/" . $processLimit . ")");
        } else {
            // fewer than the allowed number of active processes: add a new one
            $this->CLI_debug("add process " . $this->CLI_buildProcessId() . " (" . ($runningProcesses + 1) . "/" . $processLimit . ")");

            $processRow = [
                'process_id' => $id,
                'active' => 1,
                'ttl' => $currentTime + (int)$this->extensionSettings['processMaxRunTime'],
                'system_process_id' => $systemProcessId
            ];
            GeneralUtility::makeInstance(ConnectionPool::class)
                ->getConnectionForTable('tx_crawler_process')
                ->insert('tx_crawler_process', $processRow);
        }

        $this->CLI_releaseProcesses($expiredProcessIds, true); // maybe this should be somehow included into the current lock
        $this->CLI_deleteProcessesMarkedDeleted();

        //$this->queryBuilder->getConnection()->executeQuery('COMMIT');

        return $acquired;
    }
2669
2670
    /**
     * Release a process and the required resources
     *
     * Runs four UPDATE statements on the shared query builder:
     *  1. detaches queue entries from processes that are inactive and not deleted,
     *  2. resets system_process_id on inactive processes (see FIXME below),
     *  3. marks the requested processes as non-active,
     *  4. detaches the not yet executed queue entries of the requested processes.
     *
     * The "set" part of the shared query builder is reset around every
     * statement; without that, assignments of one UPDATE leak into the next one.
     *
     * @param  mixed    $releaseIds   string with a single process-id or array with multiple process-ids
     * @param  boolean  $withinLock   show whether the DB-actions are included within an existing lock
     *                                (currently without effect: the BEGIN/COMMIT handling is disabled)
     * @return boolean  false when there was nothing to release, true otherwise
     */
    public function CLI_releaseProcesses($releaseIds, $withinLock = false)
    {
        if (!is_array($releaseIds)) {
            $releaseIds = [$releaseIds];
        }

        if (!(count($releaseIds) > 0)) {
            return false;   //nothing to release
        }

        // Transaction handling is disabled for now; when re-enabled it has to
        // be skipped if $withinLock is true:
        //$this->queryBuilder->getConnection()->executeQuery('BEGIN');

        // some kind of 2nd chance algo - this way you need at least 2 processes to have a real cleanup
        // this ensures that a single process can't mess up the entire process table

        // mark all processes as deleted which have no "waiting" queue-entries and which are not active

        // Drop assignments left over from an earlier statement on the shared builder.
        $this->queryBuilder->resetQueryPart('set');
        $this->queryBuilder
            ->update('tx_crawler_queue', 'q')
            ->where(
                'q.process_id IN(SELECT p.process_id FROM tx_crawler_process as p WHERE p.active = 0 and p.deleted = 0)'
            )
            ->set('q.process_scheduled', 0)
            ->set('q.process_id', '')
            ->execute();

        // FIXME: Not entirely sure that this is equivalent to the previous version
        $this->queryBuilder->resetQueryPart('set');

        $this->queryBuilder
            ->update('tx_crawler_process', 'p')
            ->where(
                $this->queryBuilder->expr()->eq('p.active', 0),
                'p.process_id IN(SELECT q.process_id FROM tx_crawler_queue as q WHERE q.exec_time = 0)'
            )
            ->set('p.system_process_id', 0)
            ->execute();
        // previous version for reference
        /*
        $GLOBALS['TYPO3_DB']->exec_UPDATEquery(
            'tx_crawler_process',
            'active=0 AND deleted=0
            AND NOT EXISTS (
                SELECT * FROM tx_crawler_queue
                WHERE tx_crawler_queue.process_id = tx_crawler_process.process_id
                AND tx_crawler_queue.exec_time = 0
            )',
            [
                'deleted' => '1',
                'system_process_id' => 0
            ]
        );*/

        // The next statement updates the table without the alias "p", so the
        // "p.system_process_id" assignment from above must not be carried over.
        $this->queryBuilder->resetQueryPart('set');

        // mark all requested processes as non-active
        $this->queryBuilder
            ->update('tx_crawler_process')
            ->where(
                'NOT EXISTS (
                SELECT * FROM tx_crawler_queue
                    WHERE tx_crawler_queue.process_id = tx_crawler_process.process_id
                    AND tx_crawler_queue.exec_time = 0
                )',
                $this->queryBuilder->expr()->in('process_id', $this->queryBuilder->createNamedParameter($releaseIds, Connection::PARAM_STR_ARRAY)),
                $this->queryBuilder->expr()->eq('deleted', 0)
            )
            ->set('active', 0)
            ->execute();
        $this->queryBuilder->resetQueryPart('set');
        $this->queryBuilder
            ->update('tx_crawler_queue')
            ->where(
                $this->queryBuilder->expr()->eq('exec_time', 0),
                $this->queryBuilder->expr()->in('process_id', $this->queryBuilder->createNamedParameter($releaseIds, Connection::PARAM_STR_ARRAY)),
                $this->queryBuilder->expr()->eq('deleted', 0)
            )
            ->set('process_scheduled', 0)
            ->set('process_id', '')
            ->execute();
        // Leave no pending assignments behind for the next user of the shared builder.
        $this->queryBuilder->resetQueryPart('set');

        // Counterpart of the disabled BEGIN above (only when !$withinLock):
        //$this->queryBuilder->getConnection()->executeQuery('COMMIT');

        return true;
    }
2763
2764
    /**
     * Delete processes marked as deleted
     *
     * Removes all rows from tx_crawler_process whose "deleted" flag is 1.
     *
     * @return void
     *
     * @deprecated since crawler v7.0.0, will be removed in crawler v8.0.0.
     * Please Consider using $this->processRepository->deleteProcessesMarkedAsDeleted()
     */
    public function CLI_deleteProcessesMarkedDeleted()
    {
        $this->queryBuilder
            ->delete('tx_crawler_process')
            ->where('deleted = 1')
            // Without execute() the statement is only assembled and never sent to the database.
            ->execute();
    }
2778
2779
    /**
     * Check if there are still resources left for the process with the given id
     * Used to determine timeouts and to ensure a proper cleanup if there's a timeout
     *
     * Looks up the row of tx_crawler_process with the given process_id (ordered
     * by ttl, first row wins) and reports whether its "active" flag equals 1.
     *
     * @param  string  $pid identification string for the process
     * @return boolean determines if the process is still active / has resources;
     *                 false when no matching row exists
     *
     * TODO: Please consider moving this to Domain Model for Process or in ProcessRepository
     */
    public function CLI_checkIfProcessIsActive($pid)
    {
        $ret = false;

        // NOTE(review): the query is built on the shared $this->queryBuilder, so
        // query parts from earlier statements may carry over - confirm.
        $statement = $this->queryBuilder
            ->from('tx_crawler_process')
            ->select('active')
            ->where(
                // Process ids are strings (see CLI_buildProcessId()); casting them with
                // intval() compared against a truncated number instead of the real id.
                $this->queryBuilder->expr()->eq('process_id', $this->queryBuilder->createNamedParameter($pid))
            )
            ->orderBy('ttl')
            ->execute();

        if ($row = $statement->fetch()) {
            $ret = intval($row['active']) == 1;
        }

        return $ret;
    }
2807
2808
    /**
     * Return the id of the current process, generating it on first use.
     *
     * The id is derived once from the current microtime via
     * GeneralUtility::shortMD5() and then kept in $this->processID.
     *
     * @return string  the ID
     */
    public function CLI_buildProcessId()
    {
        if ($this->processID) {
            return $this->processID;
        }

        $this->processID = GeneralUtility::shortMD5($this->microtime(true));

        return $this->processID;
    }
2820
2821
    /**
     * Thin wrapper around PHP's native microtime().
     *
     * @param bool $get_as_float forwarded unchanged to the native function
     *
     * @return mixed whatever the native microtime() returns for the given flag
     */
    protected function microtime($get_as_float = false)
    {
        $timestamp = \microtime($get_as_float);

        return $timestamp;
    }
2830
2831
    /**
     * Write a message followed by a newline to the output and flush it.
     * Does nothing unless the extension setting "processDebug" evaluates to a non-zero integer.
     *
     * @param  string $msg  the message
     */
    public function CLI_debug($msg)
    {
        $debugEnabled = intval($this->extensionSettings['processDebug']);
        if (!$debugEnabled) {
            return;
        }

        echo $msg . "\n";
        flush();
    }
2843
2844
    /**
     * Fetch the content of a URL by running the crawler's cli/bootstrap.php
     * through the configured PHP binary.
     *
     * The command line consists of the PHP path, the bootstrap script, the
     * frontend base path, the URL and the base64-encoded serialized request
     * headers. The elapsed time of the call is written to the log together
     * with the URL.
     *
     * @param  string $url          Page URL
     * @param  int    $crawlerId    Crawler-ID
     * @return array  empty when the URL cannot be parsed, otherwise the keys
     *                "request", "headers" (always an empty string) and "content"
     */
    protected function sendDirectRequest($url, $crawlerId)
    {
        $urlParts = parse_url($url);
        if (!is_array($urlParts)) {
            return [];
        }

        $requestHeaders = $this->buildRequestHeaderArray($urlParts, $crawlerId);

        $commandLine = implode(' ', [
            escapeshellcmd($this->extensionSettings['phpPath']),
            escapeshellarg(ExtensionManagementUtility::extPath('crawler') . 'cli/bootstrap.php'),
            escapeshellarg($this->getFrontendBasePath()),
            escapeshellarg($url),
            escapeshellarg(base64_encode(serialize($requestHeaders))),
        ]);

        $startedAt = microtime(true);
        $content = $this->executeShellCommand($commandLine);
        $elapsed = microtime(true) - $startedAt;
        $this->log($url . ' ' . $elapsed);

        return [
            'request' => implode("\r\n", $requestHeaders) . "\r\n\r\n",
            'headers' => '',
            'content' => $content
        ];
    }
2882
2883
    /**
     * Flushes entries that stayed in the queue for too long. These are:
     * - processed entries (exec_time set) older than the "cleanUpProcessedAge" setting
     * - entries scheduled at or before now minus the "cleanUpScheduledAge" setting
     * Both settings are multiplied by 86400, i.e. they are read as days.
     *
     * @return void
     */
    public function cleanUpOldQueueEntries()
    {
        $secondsPerDay = 86400; // 24*60*60 Seconds in 24 hours
        $now = time();

        $processedBefore = $now - $this->extensionSettings['cleanUpProcessedAge'] * $secondsPerDay;
        $scheduledBefore = $now - $this->extensionSettings['cleanUpScheduledAge'] * $secondsPerDay;

        $this->flushQueue(
            '(exec_time<>0 AND exec_time<' . $processedBefore . ') OR scheduled<=' . $scheduledBefore
        );
    }
2899
2900
    /**
     * Initializes a TypoScript Frontend necessary for using TypoScript and TypoLink functions
     *
     * Creates a time tracker in $GLOBALS['TT'] when none exists yet, then
     * builds $GLOBALS['TSFE'] for the given page and runs its initialisation
     * steps. The order of the calls below is kept exactly as it was.
     *
     * @param int $id
     * @param int $typeNum
     *
     * @return void
     */
    protected function initTSFE($id = 1, $typeNum = 0)
    {
        // NOTE(review): EidUtility::initTCA() is flagged as deprecated since TYPO3 v9.4
        // (to be removed in v10.0); kept for installations that still need it - confirm.
        EidUtility::initTCA();

        // ?? avoids an "undefined index" notice/warning when no time tracker global exists yet.
        if (!is_object($GLOBALS['TT'] ?? null)) {
            $GLOBALS['TT'] = new TimeTracker(false);
            $GLOBALS['TT']->start();
        }

        $GLOBALS['TSFE'] = GeneralUtility::makeInstance(TypoScriptFrontendController::class, $GLOBALS['TYPO3_CONF_VARS'], $id, $typeNum);
        $GLOBALS['TSFE']->sys_page = GeneralUtility::makeInstance(PageRepository::class);
        $GLOBALS['TSFE']->sys_page->init(true);
        $GLOBALS['TSFE']->initFEuser();
        $GLOBALS['TSFE']->determineId();
        $GLOBALS['TSFE']->initTemplate();
        $GLOBALS['TSFE']->rootLine = $GLOBALS['TSFE']->sys_page->getRootLine($id, '');
        $GLOBALS['TSFE']->getConfigArray();
    }
2925
2926
    /**
     * Builds an md5 hash over the serialized configuration array, leaving the
     * keys "paramExpanded" and "URLs" out of the hashed data.
     *
     * @param array $configuration
     *
     * @return string
     */
    protected function getConfigurationHash(array $configuration)
    {
        unset($configuration['paramExpanded'], $configuration['URLs']);

        $serialized = serialize($configuration);

        return md5($serialized);
    }
2938
2939
    /**
     * Decide whether crawling should use https.
     *
     * The crawler configuration value is matched with loose (==) comparison,
     * in this order:
     *  -1 => false
     *   0 => the page configuration value is returned unchanged
     *   1 => true
     * Any other value yields false.
     *
     * @param $crawlerConfiguration
     * @param $pageConfiguration
     *
     * @return bool
     */
    protected function isCrawlingProtocolHttps($crawlerConfiguration, $pageConfiguration)
    {
        if ($crawlerConfiguration == -1) {
            return false;
        }

        if ($crawlerConfiguration == 0) {
            return $pageConfiguration;
        }

        return $crawlerConfiguration == 1;
    }
2959
}
2960